|
|
|
@ -1,15 +1,17 @@
|
|
|
|
|
"""
|
|
|
|
|
Install the pdfkit library
|
|
|
|
|
https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf
|
|
|
|
|
from selenium import webdriver
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
|
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
|
|
|
|
|
|
I developed this on Windows, so this is the build I downloaded (note: without a proxy/VPN the download is very slow):
|
|
|
|
|
https://release-assets.githubusercontent.com/github-production-release-asset/131323182/3200f380-aba8-11ea-8942-42fa5e27a312?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-07-15T02%3A10%3A32Z&rscd=attachment%3B+filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-07-15T01%3A10%3A07Z&ske=2025-07-15T02%3A10%3A32Z&sks=b&skv=2018-11-09&sig=IYNB2Gi%2FZ9tZfPXmo7PbqjbxmcLULpP%2Bex2z6lp2DvE%3D&jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1MjU0MjU3NSwibmJmIjoxNzUyNTQyMjc1LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvcmUud2luZG93cy5uZXQifQ.LyZXiO_mRK2qX99CTJtVwypU4DLsK-_Js0wspzsL0Y4&response-content-disposition=attachment%3B%20filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&response-content-type=application%2Foctet-stream
|
|
|
|
|
Extract it to D:\wkhtmltox, and remember to add its bin directory to the PATH environment variable
|
|
|
|
|
url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
|
|
|
|
|
|
|
|
|
|
conda activate py310
|
|
|
|
|
pip3 install pdfkit
|
|
|
|
|
"""
|
|
|
|
|
import pdfkit
|
|
|
|
|
path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe'  # Location of the installed wkhtmltopdf executable
|
|
|
|
|
# Point pdfkit at that executable explicitly.
config = pdfkit.configuration(wkhtmltopdf = path_wk)
|
|
|
|
|
# Convert the page at this URL to out.pdf, using the configuration built above.
pdfkit.from_url('http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd', 'out.pdf',configuration=config)
|
|
|
|
|
# Build the Chrome options used for the Selenium session.
options = Options()
|
|
|
|
|
options.add_argument('-headless')  # Headless flag; comment this line out while debugging
|
|
|
|
|
# Use the chromedriver binary at this fixed path.
service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe")
|
|
|
|
|
# Start Chrome with the service and options configured above.
driver = webdriver.Chrome(service=service, options=options)
|
|
|
|
|
# Load the page stored in `url`.
driver.get(url)
|
|
|
|
|
# Plain text is enough here: take the .text of the element whose class is
# "rich_media" (despite its name, html_content holds text, not HTML markup).
|
|
|
|
|
html_content = driver.find_element(By.CLASS_NAME, "rich_media").text
|
|
|
|
|
# The first line of the text is the title; split it off.
|
|
|
|
|
title = html_content.split('\n')[0]
|
|
|
|
|
print(title)
|
|
|
|
|