parent
af3f8098c4
commit
2ad3154fe8
@ -0,0 +1,100 @@
|
|||||||
|
import datetime
|
||||||
|
import random
|
||||||
|
import requests
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
|
||||||
|
def init_wechat_browser():
    """Initialize the browser instance used by the WeChat crawler.

    Builds a Chrome WebDriver that runs headless and is backed by the
    chromedriver executable at a fixed Windows path.

    :return: a new ``webdriver.Chrome`` instance. This function does not
        quit the driver; the caller owns its lifetime.
    """
    chrome_options = Options()
    chrome_options.add_argument('-headless')
    driver_service = ChromeService(
        executable_path=r"C:\Windows\System32\chromedriver.exe"
    )
    return webdriver.Chrome(service=driver_service, options=chrome_options)
|
||||||
|
|
||||||
|
def get_wechat_articles(account_name, account_id, token, cookies, header):
    """Fetch the article list of the given WeChat official account.

    Two requests are made against mp.weixin.qq.com: first a ``searchbiz``
    lookup for ``account_name`` (the first match's ``fakeid`` is used), then
    an ``appmsg`` listing for that ``fakeid`` (``begin=0``, ``count=5``).

    :param account_name: account name used as the search query
    :param account_id: not used by this function; kept so existing callers
        keep working
    :param token: value sent as the ``token`` query parameter of both requests
    :param cookies: cookies forwarded to ``requests.get``
    :param header: HTTP headers forwarded to ``requests.get``
    :return: list of dicts with keys ``title``, ``url`` and ``publish_time``
        (``'%Y-%m-%d %H:%M:%S'`` in local time, or ``None`` when the item has
        no ``update_time``). An empty list is returned when the search finds
        no account or the account listing contains no articles.
    """
    # Without a timeout requests waits forever on a stalled connection.
    request_timeout = 30  # seconds

    # Endpoint for searching official accounts.
    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    search_params = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': account_name,
        'begin': '0',
        'count': '5'
    }
    search_response = requests.get(
        search_url,
        cookies=cookies,
        headers=header,
        params=search_params,
        timeout=request_timeout,
    )

    # 'list' may be absent or empty (e.g. no match); previously this crashed
    # with TypeError/IndexError on the [0] lookup.
    matches = search_response.json().get('list') or []
    if not matches:
        return []
    fakeid = matches[0].get('fakeid')

    # Endpoint for listing an official account's articles.
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    listing_params = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'action': 'list_ex',
        'begin': '0',
        'count': '5',
        'query': '',
        'fakeid': fakeid,
        'type': '9'
    }
    listing_response = requests.get(
        appmsg_url,
        cookies=cookies,
        headers=header,
        params=listing_params,
        timeout=request_timeout,
    )

    # A missing 'app_msg_list' is treated as "no articles" instead of
    # raising TypeError when iterating None.
    messages = listing_response.json().get('app_msg_list') or []

    article_urls = []
    for item in messages:
        update_time = item.get("update_time")
        if update_time is None:
            publish_time = None
        else:
            publish_time = datetime.datetime.fromtimestamp(
                int(update_time)
            ).strftime('%Y-%m-%d %H:%M:%S')
        article_urls.append({
            'title': item.get('title'),
            'url': item.get('link'),
            'publish_time': publish_time,
        })

    return article_urls
|
||||||
|
|
||||||
|
def get_article_content(url):
    """Fetch the text content of a WeChat official-account article.

    Opens ``url`` in a headless Chrome instance, reads the text of the
    element with class ``rich_media`` and returns the part that follows the
    first blank line. If the text contains no blank line, the whole text is
    used.

    :param url: article URL
    :return: article text with each ``"\\n\\n"`` pair replaced by ``"\\n"``
    """
    # Reuse the shared factory instead of duplicating the driver setup.
    driver = init_wechat_browser()

    try:
        driver.get(url)
        html_content = driver.find_element(By.CLASS_NAME, "rich_media").text

        # Keep only the text after the first blank line.
        # NOTE(review): presumably the leading part is title/author
        # metadata; confirm against real pages.
        lines = html_content.split('\n')
        first_blank = next(
            (index for index, line in enumerate(lines) if not line.strip()),
            None,
        )
        if first_blank is None:
            body_text = html_content
        else:
            # Every kept line gets a trailing newline, as before; join
            # avoids the quadratic cost of repeated string concatenation.
            body_text = "".join(
                line + "\n" for line in lines[first_blank + 1:]
            )

        # Single non-overlapping pass: runs of three or more newlines are
        # only partially collapsed. Left unchanged to preserve output.
        return body_text.replace("\n\n", "\n")
    finally:
        driver.quit()
|
Binary file not shown.
@ -1,45 +0,0 @@
|
|||||||
from selenium import webdriver
|
|
||||||
from selenium.webdriver.chrome.options import Options
|
|
||||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
|
|
||||||
def get_article_content(url):
    """Fetch the text content of a WeChat official-account article.

    Loads ``url`` in a headless Chrome browser, takes the text of the
    element with class ``rich_media`` and keeps what follows the first
    blank line (or everything, when there is no blank line).

    :param url: article URL
    :return: article text with each ``"\\n\\n"`` pair replaced by ``"\\n"``
    """
    chrome_options = Options()
    chrome_options.add_argument('-headless')
    chrome_service = ChromeService(
        executable_path=r"C:\Windows\System32\chromedriver.exe"
    )
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)

    try:
        browser.get(url)
        page_text = browser.find_element(By.CLASS_NAME, "rich_media").text

        # Extract the text that comes after the first blank line.
        rows = page_text.split('\n')
        for position, row in enumerate(rows):
            if row.strip() == "":
                # Each remaining row is re-emitted with a trailing newline.
                extracted = "".join(
                    f"{later}\n" for later in rows[position + 1:]
                )
                break
        else:
            # No blank line anywhere: fall back to the full text.
            extracted = page_text

        return extracted.replace("\n\n", "\n")
    finally:
        # Always shut the browser down, even if loading or lookup failed.
        browser.quit()
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Example usage: fetch one article and print its extracted text.
    sample_url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd'
    print(get_article_content(sample_url))
|
|
Loading…
Reference in new issue