diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 153e43a6..65af9f41 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -27,6 +27,8 @@ async def get_wechat_sources(): return [dict(row) for row in rows] finally: await pool.close() + + """ # 查看selenium版本 pip show selenium @@ -100,7 +102,7 @@ if __name__ == '__main__': article_urls = [] # 初始化浏览器 driver = init_wechat_browser() - + # 获取公众号列表 loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -108,7 +110,7 @@ if __name__ == '__main__': gzlist = loop.run_until_complete(get_wechat_sources()) finally: loop.close() - + # 爬取文章 for item in gzlist: account_name = item["account_name"] @@ -159,12 +161,14 @@ if __name__ == '__main__': article_url = item.get('link') article_title = item.get('title') publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') - + # 直接获取并显示文章内容 + if '试卷' in article_title: # 过滤掉试卷 + continue print(f"正在处理文章: {article_title} ({publish_time})") content = get_article_content(article_url) print(f"文章内容预览: {content[:200]}...") - + time.sleep(1) # 关闭浏览器 driver.quit()