Files
dsProject/dsLightRag/Util/OCR_URL_1_Shot.py
2025-08-14 15:45:08 +08:00

76 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
from playwright.async_api import async_playwright
# Module-level logger named after this module; no logging configuration
# (handlers, levels, formatters) is performed in this file.
logger = logging.getLogger(__name__)
async def shot(url):
    """Capture a full-page PNG screenshot of ``url`` with headless Chromium.

    The page is opened in a 1920x1080 viewport, scrolled from top to bottom
    in half-viewport steps so that lazily loaded content gets a chance to
    load, scrolled back to the top, and then captured with
    ``full_page=True``.

    Args:
        url: Address of the page to open.

    Returns:
        The PNG image data returned by ``page.screenshot``.

    Raises:
        Nothing is caught here: any error raised by Playwright (for example
        a navigation timeout after 60 s) propagates to the caller. The
        browser is closed before the error leaves this function.
    """
    logger.info("开始执行截图任务")
    async with async_playwright() as p:
        logger.info("启动浏览器")
        # Extra Chromium flags intended to improve stability (per the original
        # author's note); commonly needed in containers / headless servers.
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
                "--disable-gpu",
            ],
        )
        try:
            logger.info("创建新页面")
            page = await browser.new_page()
            # Fixed desktop-sized viewport so layout is reproducible.
            await page.set_viewport_size({"width": 1920, "height": 1080})
            # Wait only for DOMContentLoaded (not full load) with a generous
            # timeout; slow sub-resources are given time by the waits below.
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            logger.info("页面加载完成,等待图片加载")
            # Grace period for images to start loading.
            await page.wait_for_timeout(1500)
            logger.info("获取页面高度信息")
            total_height = await page.evaluate('() => document.body.scrollHeight')
            viewport_height = await page.evaluate('() => window.innerHeight')
            logger.info(f"页面总高度: {total_height}, 视口高度: {viewport_height}")
            # Scroll half a viewport at a time. Clamp to at least 1 px so a
            # reported inner height of 0 or 1 cannot make the loop spin forever.
            scroll_step = max(viewport_height // 2, 1)
            current_position = 0
            logger.info("开始滚动页面以触发懒加载")
            while current_position < total_height:
                # Never scroll past the measured bottom of the document.
                current_position = min(current_position + scroll_step, total_height)
                logger.info(f"滚动到位置: {current_position}/{total_height}")
                # Pass the offset as an argument rather than formatting it
                # into the JavaScript source.
                await page.evaluate('(y) => window.scrollTo(0, y)', current_position)
                # Short pause so lazy-loaded content can be triggered.
                await page.wait_for_timeout(1000)
            logger.info("滚动完成,回到顶部")
            # Return to the top and pause once more before capturing.
            await page.evaluate('() => window.scrollTo(0, 0)')
            await page.wait_for_timeout(1000)
            logger.info("开始截图")
            img = await page.screenshot(full_page=True, type='png')
            logger.info("截图完成")
        finally:
            # Close the browser on both success and failure paths.
            await browser.close()
            logger.info("浏览器已关闭")
    return img
async def start_shot(url, result_image_path):
    """Take a screenshot of ``url`` and write it to ``result_image_path``.

    Delegates the capture to ``shot`` and then writes the returned image
    data to the given path in binary mode, replacing any existing file.

    Args:
        url: Address of the page to capture.
        result_image_path: Destination path for the image file.

    Returns:
        The same image data that was written to disk.
    """
    logger.info(f"主函数开始执行目标URL: {url}")
    png_data = await shot(url)

    # Persist the raw image data; 'wb' truncates/creates the target file.
    with open(result_image_path, 'wb') as out_file:
        out_file.write(png_data)

    logger.info("图片保存完成")
    return png_data