diff --git a/dsLightRag/.idea/dsLightRag.iml b/dsLightRag/.idea/dsLightRag.iml index 9c386072..850c697c 100644 --- a/dsLightRag/.idea/dsLightRag.iml +++ b/dsLightRag/.idea/dsLightRag.iml @@ -1,12 +1,10 @@ - - - - - - + + + \ No newline at end of file diff --git a/dsLightRag/.idea/misc.xml b/dsLightRag/.idea/misc.xml index 2cad77d0..85f1c6fa 100644 --- a/dsLightRag/.idea/misc.xml +++ b/dsLightRag/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc index 19f47c1d..d65c6fc0 100644 Binary files a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc and b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc differ diff --git a/dsLightRag/Topic/JiHe/kv_store_llm_response_cache.json b/dsLightRag/Topic/JiHe/kv_store_llm_response_cache.json index aaa8518c..d4b77ba4 100644 --- a/dsLightRag/Topic/JiHe/kv_store_llm_response_cache.json +++ b/dsLightRag/Topic/JiHe/kv_store_llm_response_cache.json @@ -537,5 +537,18 @@ "create_time": 1752818371, "update_time": 1752818371, "_id": "hybrid:keywords:37a1721ae1b1e0a8858a85a94a139d9a" + }, + "hybrid:keywords:6d5e4e5a7a264a947d2e6e9cb8befec7": { + "return": "{\"high_level_keywords\": [\"\\u51e0\\u4f55\\u8bc1\\u660e\", \"\\u4e09\\u89d2\\u5f62\", \"\\u5185\\u90e8\\u70b9\", \"\\u89d2\\u5ea6\\u5173\\u7cfb\"], \"low_level_keywords\": [\"\\u4e09\\u89d2\\u5f62ABC\", \"\\u70b9P\", \"\\u2220BPC\", \"\\u2220A\"]}", + "cache_type": "keywords", + "chunk_id": null, + "embedding": null, + "embedding_shape": null, + "embedding_min": null, + "embedding_max": null, + "original_prompt": "求证:在三角形ABC中,P为其内部任意一点。请证明:∠BPC > ∠A。", + "create_time": 1752821457, + "update_time": 1752821457, + "_id": "hybrid:keywords:6d5e4e5a7a264a947d2e6e9cb8befec7" } } \ No newline at end of file diff --git a/dsLightRag/WxGzh/T1_LoginGetCookie.py b/dsLightRag/WxGzh/T1_LoginGetCookie.py deleted file mode 100644 index 83eb3e0d..00000000 --- a/dsLightRag/WxGzh/T1_LoginGetCookie.py +++ /dev/null @@ -1,78 +0,0 @@ -# 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 -# https://blog.csdn.net/k352733625/article/details/149222945 - -# 微信爬爬猫---公众号文章抓取代码分析 -# https://blog.csdn.net/yajuanpi4899/article/details/121584268 - -import json -import logging - -from torch.distributed.elastic.timer import expires - -""" -# 查看selenium版本 -pip show selenium -4.34.2 - -# 查看Chrome浏览器版本 -chrome://version/ -138.0.7204.101 (正式版本) (64 位) - -# 下载驱动包 -https://googlechromelabs.github.io/chrome-for-testing/ -https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip -""" -import time -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService - -if __name__ == '__main__': - # 定义一个空的字典,存放cookies内容 - cookies = {} - # 设置headers - 使用微信内置浏览器的User-Agent - header = { - "HOST": "mp.weixin.qq.com", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Encoding": "gzip, deflate, br", - "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", - "Connection": "keep-alive" - } - # 用webdriver启动谷歌浏览器 - logging.info("启动浏览器,打开微信公众号登录界面") - options = Options() - - service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) - # 打开微信公众号登录页面 - driver.get('https://mp.weixin.qq.com/') - # 等待5秒钟 - time.sleep(2) - # # 拿手机扫二维码! - logging.info("请拿手机扫码二维码登录公众号") - time.sleep(20) - - # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 - driver.get('https://mp.weixin.qq.com/') - # 获取cookies - cookie_items = driver.get_cookies() - expiry=-1 - # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 - for cookie_item in cookie_items: - cookies[cookie_item['name']] = cookie_item['value'] - if('expiry' in cookie_item and cookie_item['expiry'] > expiry): - expiry = cookie_item['expiry'] - - if "slave_sid" not in cookies: - logging.info("登录公众号失败,获取cookie失败") - exit() - - # 将cookies写入文件 - cookies["expiry"] = expiry - with open('cookies.txt', mode='w', encoding="utf-8") as f: - f.write(json.dumps(cookies, indent=4, ensure_ascii=False)) - # 关闭浏览器 - driver.quit() - # 输出提示 - print("成功获取了cookies内容!") diff --git a/dsLightRag/WxGzh/T2_CollectArticle.py b/dsLightRag/WxGzh/T2_CollectArticle.py deleted file mode 100644 index 6ea92942..00000000 --- a/dsLightRag/WxGzh/T2_CollectArticle.py +++ /dev/null @@ -1,251 +0,0 @@ -# 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 -# https://blog.csdn.net/k352733625/article/details/149222945 - -# 微信爬爬猫---公众号文章抓取代码分析 -# https://blog.csdn.net/yajuanpi4899/article/details/121584268 - -""" -# 查看selenium版本 -pip show selenium -4.34.2 - -# 查看Chrome浏览器版本 -chrome://version/ -138.0.7204.101 (正式版本) (64 位) - -# 下载驱动包 -https://googlechromelabs.github.io/chrome-for-testing/ -https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip -""" -import asyncio -import datetime -import json -import logging -import random -import re -import time - -import requests -from selenium.webdriver.chrome.options import Options - -from Util.LightRagUtil import initialize_pg_rag, initialize_rag -from Util.PostgreSQLUtil import init_postgres_pool -from Util.WxGzhUtil import init_wechat_browser, get_article_content - -# 删除重复的日志配置,只保留以下内容 -logger = logging.getLogger('WxGzh') -logger.setLevel(logging.INFO) - -# 确保只添加一个handler -if not logger.handlers: - handler = logging.StreamHandler() - handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) - logger.addHandler(handler) - -# 添加微信请求头 -header = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36', - 'Referer': 'https://mp.weixin.qq.com/' -} - -async def get_wechat_sources(): - """从t_wechat_source表获取微信公众号列表""" - try: - pool = await init_postgres_pool() - async with pool.acquire() as conn: - rows = await conn.fetch('SELECT * FROM t_wechat_source') - return [dict(row) for row in rows] - finally: - await pool.close() - - -async def is_article_exist(pool, article_url): - """检查文章URL是否已存在数据库中""" - try: - async with pool.acquire() as conn: - row = await conn.fetchrow(''' - SELECT 1 - FROM t_wechat_articles - WHERE url = $1 LIMIT 1 - ''', article_url) - return row is not None - except Exception as e: - logging.error(f"检查文章存在性失败: {e}") - return False # 出错时默认返回False,避免影响正常流程 - - -async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, source_id): - # 在这里调用 lightrag进行知识库构建 - WORKING_DIR = f"../Topic/ChangChun" - docx_name = f"{account_name}_{article_title}" # 组合来源和标题作为文档名 - logger.info(f"开始处理文档: {docx_name}") - try: - # 注意:默认设置使用NetworkX - rag = await initialize_rag(WORKING_DIR) - await rag.ainsert(content) - logger.info(f"索引完成: {docx_name}") - except Exception as e: - print(f"An error occurred: {e}") - finally: - await rag.finalize_storages() - - try: - async with pool.acquire() as conn: - await conn.execute(''' - INSERT INTO t_wechat_articles - (title, source, url, publish_time, content, source_id) - VALUES ($1, $2, $3, $4, $5, $6) - ''', article_title, account_name, article_url, - publish_time, content, source_id) - logger.info(f"保存文档到知识库成功: {docx_name}") - except Exception as e: - logging.error(f"保存文章失败: {e}") - - -async def initialize_wechat_session(): - """初始化微信会话,获取cookies和token""" - with open('cookies.txt', 'r', encoding='utf-8') as f: - content = f.read() - cookies = json.loads(content) - global driver # 添加这行 - expiry = cookies["expiry"] - if expiry: - current_timestamp = time.time() - if current_timestamp > expiry: - logger.error("Cookie已过期") - exit() - - del cookies["expiry"] - - options = Options() - options.add_argument('-headless') - driver = init_wechat_browser() - - url = 'https://mp.weixin.qq.com' - response = requests.get(url=url, allow_redirects=False, cookies=cookies) - - if 'Location' in response.headers: - redirect_url = response.headers.get("Location") - token_match = re.findall(r'token=(\d+)', redirect_url) - if token_match: - token = token_match[0] - return cookies, token - - return None, None - - -async def get_wechat_account_list(cookies, token, account_name): - """获取指定公众号的fakeid""" - search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' - query_id = { - 'action': 'search_biz', - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'query': account_name, - 'begin': '0', - 'count': '5' - } - - search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) - lists = search_response.json().get('list')[0] - return lists.get('fakeid') - - -async def get_article_list(cookies, token, fakeid): - """获取公众号文章列表""" - appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' - query_id_data = { - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'action': 'list_ex', - 'begin': '0', - 'count': '5', - 'query': '', - 'fakeid': fakeid, - 'type': '9' - } - - query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) - return query_fakeid_response.json().get('app_msg_list') - - -async def process_single_article(article_info, account_info, cookies, token): - """处理单篇文章""" - article_url = article_info.get('link') - article_title = article_info.get('title') - publish_time = datetime.datetime.fromtimestamp(int(article_info.get("update_time"))) - - if '试卷' in article_title: - return False - - try: - pool = await init_postgres_pool() - # 先检查文章是否已存在 - if await is_article_exist(pool, article_url): - logger.info(f'文章已存在,跳过保存: {account_info["account_name"]}-{article_title}') - return False - content = get_article_content(article_url) - await save_article_to_db(pool, article_title, account_info["account_name"], - article_url, publish_time, content, account_info["id"]) - return True - except Exception as e: - logger.error(f"处理文章时出错: {e}") - return False - finally: - if 'pool' in locals(): - await pool.close() - - -async def process_wechat_account(account_info, cookies, token): - """处理单个公众号的所有文章""" - cnt = 0 - fakeid = await get_wechat_account_list(cookies, token, account_info["account_name"]) - articles = await get_article_list(cookies, token, fakeid) - - for article in articles: - success = await process_single_article(article, account_info, cookies, token) - if success: - cnt += 1 - time.sleep(1) - - logger.info(f"成功获取公众号: {account_info['account_name']} {cnt}篇文章。") - return cnt - - -async def main(): - """主函数""" - while True: - try: - logger.info("开始执行微信公众号文章采集任务") - cookies, token = await initialize_wechat_session() - if not cookies or not token: - logger.error("初始化微信会话失败") - continue - - account_list = await get_wechat_sources() - for account in account_list: - await process_wechat_account(account, cookies, token) - - logger.info("本次采集任务完成,等待30分钟后再次执行") - await asyncio.sleep(30 * 60) # 30分钟 - except Exception as e: - logger.error(f"主循环发生错误: {e}") - await asyncio.sleep(30 * 60) # 出错后也等待30分钟 - finally: - if 'driver' in globals(): - driver.quit() - - -if __name__ == '__main__': - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - loop.run_until_complete(main()) - finally: - loop.close() diff --git a/dsLightRag/WxGzh/__init__.py b/dsLightRag/WxGzh/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dsLightRag/WxGzh/cookies.txt b/dsLightRag/WxGzh/cookies.txt deleted file mode 100644 index 7183301e..00000000 --- a/dsLightRag/WxGzh/cookies.txt +++ /dev/null @@ -1,17 +0,0 @@ -{ - "_clsk": "2gtve8|1752546228205|1|1|mp.weixin.qq.com/weheat-agent/payload/record", - "xid": "16332bed01be1055e236ad45b33af8df", - "data_bizuin": "3514353238", - "slave_user": "gh_4f88a4e194da", - "slave_sid": "QzBRX1FWTXNMaEdJYnc4ODBaM3FJU3RRbjVJNFE2N2IzMXFyVGlRQ0V5YklvNGFOc3NBWHdjV2J5OVg5U0JBVXdfdGhSU3lObXRheG1TdFUyXzVFcTFYS3E1NTh2aTlnSlBOOUluMUljUnBkYktjeUJDM216WVJNYzJKQkx2eW9Ib1duUk1yWXI3RndTa2dK", - "rand_info": "CAESIFwUSYus3XR5tFa1+b5ytJeuGAQS02d07zNBJNfi+Ftk", - "data_ticket": "9gQ088/vC7+jqxfFxBKS2aRx/JjmzJt+8HyuDLJtQBgpVej1hfSG1A0FQKWBbHQh", - "bizuin": "3514353238", - "mm_lang": "zh_CN", - "slave_bizuin": "3514353238", - "uuid": "8c5dc8e06af66d00a4b8e8596c8662eb", - "ua_id": "y1HZNMSzYCWuaUJDAAAAAApPVJ0a_arX_A5zqoUh6P8=", - "wxuin": "52546211515015", - "_clck": "msq32d|1|fxm|0", - "expiry": 1787106233 -} \ No newline at end of file diff --git a/dsLightRag/requirements.txt b/dsLightRag/requirements.txt new file mode 100644 index 00000000..45675be9 Binary files /dev/null and b/dsLightRag/requirements.txt differ