From 3f51637bcfcee9ed92b37c1d65b8dd39c2afdf82 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 09:36:35 +0800 Subject: [PATCH] 'commit' --- dsLightRag/Test/T1_Login.py | 5 ----- dsLightRag/Test/T3_GetArticle.py | 28 +++++++++++++++------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/dsLightRag/Test/T1_Login.py b/dsLightRag/Test/T1_Login.py index 8c4c57e0..e313e8ab 100644 --- a/dsLightRag/Test/T1_Login.py +++ b/dsLightRag/Test/T1_Login.py @@ -4,13 +4,8 @@ # 微信爬爬猫---公众号文章抓取代码分析 # https://blog.csdn.net/yajuanpi4899/article/details/121584268 -import datetime import json import logging -import random -import re - -import requests """ # 查看selenium版本 diff --git a/dsLightRag/Test/T3_GetArticle.py b/dsLightRag/Test/T3_GetArticle.py index 02e9cce8..6dae6c0b 100644 --- a/dsLightRag/Test/T3_GetArticle.py +++ b/dsLightRag/Test/T3_GetArticle.py @@ -1,15 +1,17 @@ -""" -安装pdfkit库 -https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By -我是在Windows上开发的,所以,下载的是:【注意要科学上网下载,否则太慢了~】 -https://release-assets.githubusercontent.com/github-production-release-asset/131323182/3200f380-aba8-11ea-8942-42fa5e27a312?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-07-15T02%3A10%3A32Z&rscd=attachment%3B+filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-07-15T01%3A10%3A07Z&ske=2025-07-15T02%3A10%3A32Z&sks=b&skv=2018-11-09&sig=IYNB2Gi%2FZ9tZfPXmo7PbqjbxmcLULpP%2Bex2z6lp2DvE%3D&jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1MjU0MjU3NSwibmJmIjoxNzUyNTQyMjc1LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvcmUud2luZG93cy5uZXQifQ.LyZXiO_mRK2qX99CTJtVwypU4DLsK-_Js0wspzsL0Y4&response-content-disposition=attachment%3B%20filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&response-content-type=application%2Foctet-stream -解压到D:\wkhtmltox中,还要注意把路径加到环境变量中 +url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' -conda activate py310 -pip3 install pdfkit -""" -import pdfkit -path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe' #wkhtmltopdf安装位置 -config = pdfkit.configuration(wkhtmltopdf = path_wk) -pdfkit.from_url('http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd', 'out.pdf',configuration=config) \ No newline at end of file +options = Options() +options.add_argument('-headless') # 无头参数,调试时可以注释掉 +service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") +driver = webdriver.Chrome(service=service, options=options) +driver.get(url) +# 可以只要txt +html_content = driver.find_element(By.CLASS_NAME, "rich_media").text +# 第一行是标题,分离出来 +title = html_content.split('\n')[0] +print(title)