|
|
|
@ -5,6 +5,7 @@
|
|
|
|
|
# https://blog.csdn.net/yajuanpi4899/article/details/121584268
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
import datetime
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
@ -12,9 +13,10 @@ import random
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
import asyncio
|
|
|
|
|
|
|
|
|
|
from Util.PostgreSQLUtil import init_postgres_pool
|
|
|
|
|
from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles
|
|
|
|
|
from Util.WxGzhUtil import init_wechat_browser, get_article_content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def get_wechat_sources():
|
|
|
|
|
"""从t_wechat_source表获取微信公众号列表"""
|
|
|
|
@ -109,7 +111,6 @@ if __name__ == '__main__':
|
|
|
|
|
|
|
|
|
|
# 爬取文章
|
|
|
|
|
for item in gzlist:
|
|
|
|
|
article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header)
|
|
|
|
|
account_name = item["account_name"]
|
|
|
|
|
account_id = item["account_id"]
|
|
|
|
|
# 搜索微信公众号的接口地址
|
|
|
|
@ -155,27 +156,15 @@ if __name__ == '__main__':
|
|
|
|
|
|
|
|
|
|
for item in fakeid_list:
    # Example of a collected item: read the raw fields once and reuse them
    # for the record, the URL list and the preview below.
    article_title = item.get('title')
    article_url = item.get('link')
    # "update_time" goes through int() and fromtimestamp(), i.e. it is handled
    # as epoch seconds and rendered in the local time zone of this machine.
    publish_time = datetime.datetime.fromtimestamp(
        int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')

    new_article = {
        'title': article_title,
        'article_url': article_url,
        'account_id': account_id,
        'account_name': account_name,
        'publish_time': publish_time,
        'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    # logging formats lazily as `msg % args`. Passing the dict as an extra
    # argument without a placeholder makes the handler report a formatting
    # error instead of emitting the record, so a "%s" placeholder is required.
    logging.info("new_article: %s", new_article)
    article_urls.append({"title": article_title, "url": article_url,
                         "publish_time": publish_time})

    # Fetch the article content directly and show a short preview.
    print(f"正在处理文章: {article_title} ({publish_time})")
    content = get_article_content(article_url)
    # NOTE(review): the slice assumes get_article_content returns a string
    # (not None) — confirm against Util.WxGzhUtil.
    print(f"文章内容预览: {content[:200]}...")
|
|
|
|
|
|
|
|
|
|
# Pause for one second before continuing.
time.sleep(1)
|
|
|
|
|
|
|
|
|
|
# Echo every collected record to stdout.
for entry in article_urls:
    print(entry)

# Write the collected addresses to a file, one line per record in the form
# "<title> <publish_time> <url>".
with open('article_urls.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        " ".join((record['title'], record['publish_time'], record['url'])) + '\n'
        for record in article_urls
    )
|
|
|
|
|
|
|
|
|
|
# Close the browser
|
|
|
|
|
driver.quit()
|
|
|
|
|