main
HuangHai 1 week ago
parent a3227c1967
commit 3f51637bcf

@ -4,13 +4,8 @@
# 微信爬爬猫---公众号文章抓取代码分析
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
import datetime
import json
import logging
import random
import re
import requests
"""
# 查看selenium版本

@ -1,15 +1,17 @@
"""
安装pdfkit库
https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
我是在Windows上开发的,所以下载的是Windows 64位版本(wkhtmltox-0.12.6-1.mxe-cross-win64.7z)。注意要科学上网下载,否则太慢了~
https://release-assets.githubusercontent.com/github-production-release-asset/131323182/3200f380-aba8-11ea-8942-42fa5e27a312?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-07-15T02%3A10%3A32Z&rscd=attachment%3B+filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-07-15T01%3A10%3A07Z&ske=2025-07-15T02%3A10%3A32Z&sks=b&skv=2018-11-09&sig=IYNB2Gi%2FZ9tZfPXmo7PbqjbxmcLULpP%2Bex2z6lp2DvE%3D&jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1MjU0MjU3NSwibmJmIjoxNzUyNTQyMjc1LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvcmUud2luZG93cy5uZXQifQ.LyZXiO_mRK2qX99CTJtVwypU4DLsK-_Js0wspzsL0Y4&response-content-disposition=attachment%3B%20filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&response-content-type=application%2Foctet-stream
解压到D:\wkhtmltox中,还要注意把路径加到环境变量中。
url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
conda activate py310
pip3 install pdfkit
"""
import pdfkit
# Location of the installed wkhtmltopdf executable that pdfkit drives.
path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wk)

# Render the article page at article_url into out.pdf.
article_url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
pdfkit.from_url(article_url, 'out.pdf', configuration=config)
# Load the page at `url` in headless Chrome and print the first line of the
# text found in the element with class "rich_media".
options = Options()
options.add_argument('-headless')  # headless flag; comment out while debugging
service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
try:
    driver.get(url)
    # Only the plain text of the element is needed, not its markup.
    html_content = driver.find_element(By.CLASS_NAME, "rich_media").text
finally:
    # Always shut the browser down; otherwise a failed page load or element
    # lookup (or simply reaching the end of the script) leaves the
    # chrome/chromedriver processes running.
    driver.quit()
# The first line of the text is the title; split it off.
title = html_content.split('\n')[0]
print(title)

Loading…
Cancel
Save