|
|
|
@ -27,6 +27,8 @@ async def get_wechat_sources():
|
|
|
|
|
return [dict(row) for row in rows]
|
|
|
|
|
finally:
|
|
|
|
|
await pool.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
# 查看selenium版本
|
|
|
|
|
pip show selenium
|
|
|
|
@ -100,7 +102,7 @@ if __name__ == '__main__':
|
|
|
|
|
article_urls = []
|
|
|
|
|
# Initialize the browser
|
|
|
|
|
driver = init_wechat_browser()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fetch the list of WeChat official accounts
|
|
|
|
|
loop = asyncio.new_event_loop()
|
|
|
|
|
asyncio.set_event_loop(loop)
|
|
|
|
@ -108,7 +110,7 @@ if __name__ == '__main__':
|
|
|
|
|
gzlist = loop.run_until_complete(get_wechat_sources())
|
|
|
|
|
finally:
|
|
|
|
|
loop.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Crawl the articles
|
|
|
|
|
for item in gzlist:
|
|
|
|
|
account_name = item["account_name"]
|
|
|
|
@ -159,12 +161,14 @@ if __name__ == '__main__':
|
|
|
|
|
article_url = item.get('link')
|
|
|
|
|
article_title = item.get('title')
|
|
|
|
|
publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fetch the article content directly and display it
|
|
|
|
|
if '试卷' in article_title: # 过滤掉试卷
|
|
|
|
|
continue
|
|
|
|
|
print(f"正在处理文章: {article_title} ({publish_time})")
|
|
|
|
|
content = get_article_content(article_url)
|
|
|
|
|
print(f"文章内容预览: {content[:200]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
time.sleep(1)
|
|
|
|
|
# Close the browser
|
|
|
|
|
driver.quit()
|
|
|
|
|