Files
dsProject/dsLightRag/Util/OCR_URL_1_Shot.py

76 lines
2.9 KiB
Python
Raw Normal View History

2025-08-14 15:45:08 +08:00
import logging
from playwright.async_api import async_playwright
# 直接获取模块专属日志器(无需重复配置)
logger = logging.getLogger(__name__)
async def shot(url):
    """Render ``url`` in headless Chromium and return a full-page PNG.

    After the initial load the page is scrolled from top to bottom in
    half-viewport steps so that lazily loaded images are requested, then
    scrolled back to the top before the full-page screenshot is taken.

    Args:
        url: Address of the page to capture.

    Returns:
        The screenshot produced by ``page.screenshot(full_page=True,
        type='png')``.

    Raises:
        Any error raised by Playwright (for example a navigation timeout
        after 60 seconds) propagates to the caller; the browser is closed
        before it does.
    """
    logger.info("开始执行截图任务")
    async with async_playwright() as p:
        logger.info("启动浏览器")
        # Extra launch flags intended to make headless Chromium more stable.
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
                "--disable-gpu",
            ],
        )
        try:
            logger.info("创建新页面")
            page = await browser.new_page()
            # Fixed 1920x1080 viewport for the capture.
            await page.set_viewport_size({"width": 1920, "height": 1080})

            # Only wait for DOMContentLoaded, with a 60 s navigation timeout.
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            logger.info("页面加载完成,等待图片加载")
            # Give images that are already in the DOM some time to load.
            await page.wait_for_timeout(1500)

            logger.info("获取页面高度信息")
            total_height = await page.evaluate('() => document.body.scrollHeight')
            viewport_height = await page.evaluate('() => window.innerHeight')
            logger.info(f"页面总高度: {total_height}, 视口高度: {viewport_height}")

            # Scroll half a viewport at a time. Clamp the step to at least 1
            # so a zero/one-pixel viewport can never stall the loop below.
            scroll_step = max(1, viewport_height // 2)
            current_position = 0
            logger.info("开始滚动页面以触发懒加载")
            while current_position < total_height:
                current_position = min(current_position + scroll_step, total_height)
                logger.info(f"滚动到位置: {current_position}/{total_height}")
                await page.evaluate(f'() => window.scrollTo(0, {current_position})')
                # Short pause so lazy loading can trigger at this position.
                await page.wait_for_timeout(1000)

            logger.info("滚动完成,回到顶部")
            # Return to the top and wait once more before capturing.
            await page.evaluate('() => window.scrollTo(0, 0)')
            await page.wait_for_timeout(1000)

            logger.info("开始截图")
            img = await page.screenshot(full_page=True, type='png')
            logger.info("截图完成")
            return img
        finally:
            # Always release the browser, including when navigation,
            # scrolling or the screenshot itself fails.
            await browser.close()
            logger.info("浏览器已关闭")
async def start_shot(url, result_image_path):
    """Capture ``url`` with ``shot`` and write the image to a file.

    Args:
        url: Address of the page to capture.
        result_image_path: Path of the file the image bytes are written to
            (opened in binary write mode, so an existing file is replaced).

    Returns:
        The same image data that was written to ``result_image_path``.
    """
    logger.info(f"主函数开始执行目标URL: {url}")
    png_bytes = await shot(url)

    # Persist the captured image to the requested location.
    with open(result_image_path, 'wb') as out_file:
        out_file.write(png_bytes)
    logger.info("图片保存完成")

    return png_bytes