'commit'

1 week ago · 3f51637bcf
parent a3227c1967
commit 3f51637bcf
2 changed files with 15 additions and 18 deletions
--- a/dsLightRag/Test/T1_Login.py
+++ b/dsLightRag/Test/T1_Login.py
@@ -4,13 +4,8 @@
 # 微信爬爬猫---公众号文章抓取代码分析
 # https://blog.csdn.net/yajuanpi4899/article/details/121584268

-import datetime
 import json
 import logging
-import random
-import re
-
-import requests

 """
 # 查看selenium版本
--- a/dsLightRag/Test/T3_GetArticle.py
+++ b/dsLightRag/Test/T3_GetArticle.py
@@ -1,15 +1,17 @@
-"""
-安装pdfkit库
-https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.common.by import By

-我是在Windows上开发的，所以，下载的是：【注意要科学上网下载，否则太慢了~】
-https://release-assets.githubusercontent.com/github-production-release-asset/131323182/3200f380-aba8-11ea-8942-42fa5e27a312?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-07-15T02%3A10%3A32Z&rscd=attachment%3B+filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-07-15T01%3A10%3A07Z&ske=2025-07-15T02%3A10%3A32Z&sks=b&skv=2018-11-09&sig=IYNB2Gi%2FZ9tZfPXmo7PbqjbxmcLULpP%2Bex2z6lp2DvE%3D&jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1MjU0MjU3NSwibmJmIjoxNzUyNTQyMjc1LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvcmUud2luZG93cy5uZXQifQ.LyZXiO_mRK2qX99CTJtVwypU4DLsK-_Js0wspzsL0Y4&response-content-disposition=attachment%3B%20filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&response-content-type=application%2Foctet-stream
-解压到D:\wkhtmltox中，还要注意把路径加到环境变量中
+url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'

-conda activate py310
-pip3 install pdfkit
-"""
-import pdfkit
-path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe' #wkhtmltopdf安装位置
-config = pdfkit.configuration(wkhtmltopdf = path_wk)
-pdfkit.from_url('http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd', 'out.pdf',configuration=config)
+options = Options()
+options.add_argument('-headless')  # 无头参数，调试时可以注释掉
+service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
+driver = webdriver.Chrome(service=service, options=options)
+driver.get(url)
+# 可以只要txt
+html_content = driver.find_element(By.CLASS_NAME, "rich_media").text
+# 第一行是标题，分离出来
+title = html_content.split('\n')[0]
+print(title)