From 8bfd82e6de7ffe432b3674d2ab764e823f841b58 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 14:32:49 +0800 Subject: [PATCH 01/46] 'commit' --- dsLightRag/.idea/dsLightRag.iml | 2 +- dsLightRag/.idea/misc.xml | 2 +- dsLightRag/Test/TestCrawl.py | 67 +++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 dsLightRag/Test/TestCrawl.py diff --git a/dsLightRag/.idea/dsLightRag.iml b/dsLightRag/.idea/dsLightRag.iml index 4ceb6f94..880d61c1 100644 --- a/dsLightRag/.idea/dsLightRag.iml +++ b/dsLightRag/.idea/dsLightRag.iml @@ -2,7 +2,7 @@ - + diff --git a/dsLightRag/.idea/misc.xml b/dsLightRag/.idea/misc.xml index 0bad5868..0f9b3bc1 100644 --- a/dsLightRag/.idea/misc.xml +++ b/dsLightRag/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py new file mode 100644 index 00000000..b9d6ff78 --- /dev/null +++ b/dsLightRag/Test/TestCrawl.py @@ -0,0 +1,67 @@ +# 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 +# https://blog.csdn.net/k352733625/article/details/149222945 +import logging +# 1、安装Firefox软件【最新】 +# https://www.firefox.com.cn/download/#product-desktop-release + +# 2、下载geckodriver驱动【最新】 +# https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html +# https://github.com/mozilla/geckodriver/releases + +""" +# 查看selenium版本 +pip show selenium +4.34.2 + +# 查看Chrome浏览器版本 +chrome://version/ +138.0.7204.101 (正式版本) (64 位) + +# 下载驱动包 +https://googlechromelabs.github.io/chrome-for-testing/ +https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip +""" +import time, random, re, json, requests +from selenium import webdriver +from selenium.webdriver import Chrome +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.service import Service as ChromeService +import requests +import json +import datetime + + +def weChat_login(): + # 定义一个空的字典,存放cookies内容 + post = {} + # 用webdriver启动谷歌浏览器 + logging.info("启动浏览器,打开微信公众号登录界面") + options = Options() + options.add_argument('-headless') # 无头参数 + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service) + # 打开微信公众号登录页面 + driver.get('https://mp.weixin.qq.com/') + # 等待5秒钟 + time.sleep(2) + # # 拿手机扫二维码! + logging.info("请拿手机扫码二维码登录公众号") + time.sleep(20) + # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 + driver.get('https://mp.weixin.qq.com/') + # 获取cookies + cookie_items = driver.get_cookies() + # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 + for cookie_item in cookie_items: + post[cookie_item['name']] = cookie_item['value'] + + if "slave_sid" not in post: + logging.info("登录公众号失败,获取cookie失败") + return None + cookie_str = json.dumps(post) + return cookie_str + +if __name__ == '__main__': + cookie_str = weChat_login() + print(cookie_str) \ No newline at end of file From 397b04baaf0de1909c01afa3935199ef50830113 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 14:46:57 +0800 Subject: [PATCH 02/46] 'commit' --- dsLightRag/Test/TestCrawl.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py index b9d6ff78..8fcde632 100644 --- a/dsLightRag/Test/TestCrawl.py +++ b/dsLightRag/Test/TestCrawl.py @@ -1,6 +1,10 @@ # 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 # https://blog.csdn.net/k352733625/article/details/149222945 import logging +import re + +import requests + # 1、安装Firefox软件【最新】 # https://www.firefox.com.cn/download/#product-desktop-release @@ -21,18 +25,16 @@ chrome://version/ https://googlechromelabs.github.io/chrome-for-testing/ https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip """ -import time, random, re, json, requests +import time from selenium import webdriver -from selenium.webdriver import Chrome -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService -import requests import json -import datetime -def weChat_login(): + + +if __name__ == '__main__': # 定义一个空的字典,存放cookies内容 post = {} # 用webdriver启动谷歌浏览器 @@ -58,10 +60,13 @@ def weChat_login(): if "slave_sid" not in post: logging.info("登录公众号失败,获取cookie失败") - return None - cookie_str = json.dumps(post) - return cookie_str + exit() + cookies = json.dumps(post) + print(cookies) -if __name__ == '__main__': - cookie_str = weChat_login() - print(cookie_str) \ No newline at end of file + print(driver.current_url) + + #url = 'https://mp.weixin.qq.com' + #response = requests.get(url=url, allow_redirects=False, cookies=cookies) + #token = re.findall(r'token=(\d+)', str(response.headers.get("Location")))[0] + #logging.info("微信token:" + token) From 351caa4b5bfa863475cbe58956ca32d665a1d478 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 14:51:43 +0800 Subject: [PATCH 03/46] 'commit' --- dsLightRag/Test/TestCrawl.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py index 8fcde632..ba3d7b66 100644 --- a/dsLightRag/Test/TestCrawl.py +++ b/dsLightRag/Test/TestCrawl.py @@ -62,9 +62,18 @@ if __name__ == '__main__': logging.info("登录公众号失败,获取cookie失败") exit() cookies = json.dumps(post) - print(cookies) - print(driver.current_url) + # 方法3:使用requests库发送请求获取重定向URL + url = 'https://mp.weixin.qq.com' + response = requests.get(url=url, allow_redirects=False, cookies=post) + if 'Location' in response.headers: + redirect_url = response.headers.get("Location") + print("重定向URL:", redirect_url) + token_match = re.findall(r'token=(\d+)', redirect_url) + if token_match: + token = token_match[0] + print("获取到的token:", token) + logging.info("微信token:" + token) #url = 'https://mp.weixin.qq.com' #response = requests.get(url=url, allow_redirects=False, cookies=cookies) From be105894af0613741967ec2eb153cce1ef09e82f Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 15:07:59 +0800 Subject: [PATCH 04/46] 'commit' --- dsLightRag/Test/TestCrawl.py | 94 +++++++++++++++++++++++++++++++----- 1 file changed, 82 insertions(+), 12 deletions(-) diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py index ba3d7b66..95e70278 100644 --- a/dsLightRag/Test/TestCrawl.py +++ b/dsLightRag/Test/TestCrawl.py @@ -1,6 +1,11 @@ # 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 # https://blog.csdn.net/k352733625/article/details/149222945 + +# 微信爬爬猫---公众号文章抓取代码分析 +# https://blog.csdn.net/yajuanpi4899/article/details/121584268 +import datetime import logging +import random import re import requests @@ -12,6 +17,10 @@ import requests # https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html # https://github.com/mozilla/geckodriver/releases +# 3、Python爬虫实战系列:微信公众号文章爬取的5种技术方案总结及代码示例! +# 方案5:微信公众号后台引用链接方式爬取 +# https://blog.csdn.net/Python_trys/article/details/146506009 + """ # 查看selenium版本 pip show selenium @@ -31,12 +40,14 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService import json - - - if __name__ == '__main__': # 定义一个空的字典,存放cookies内容 - post = {} + cookies = {} + # 设置headers + header = { + "HOST": "mp.weixin.qq.com", + "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0" + } # 用webdriver启动谷歌浏览器 logging.info("启动浏览器,打开微信公众号登录界面") options = Options() @@ -50,22 +61,23 @@ if __name__ == '__main__': # # 拿手机扫二维码! logging.info("请拿手机扫码二维码登录公众号") time.sleep(20) + # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 driver.get('https://mp.weixin.qq.com/') # 获取cookies cookie_items = driver.get_cookies() # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 for cookie_item in cookie_items: - post[cookie_item['name']] = cookie_item['value'] + cookies[cookie_item['name']] = cookie_item['value'] - if "slave_sid" not in post: + if "slave_sid" not in cookies: logging.info("登录公众号失败,获取cookie失败") exit() - cookies = json.dumps(post) + # cookies = json.dumps(post) # 注释掉这一行 # 方法3:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' - response = requests.get(url=url, allow_redirects=False, cookies=post) + response = requests.get(url=url, allow_redirects=False, cookies=cookies) if 'Location' in response.headers: redirect_url = response.headers.get("Location") print("重定向URL:", redirect_url) @@ -75,7 +87,65 @@ if __name__ == '__main__': print("获取到的token:", token) logging.info("微信token:" + token) - #url = 'https://mp.weixin.qq.com' - #response = requests.get(url=url, allow_redirects=False, cookies=cookies) - #token = re.findall(r'token=(\d+)', str(response.headers.get("Location")))[0] - #logging.info("微信token:" + token) + article_urls = [] + gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] + for item in gzlist: + account_name = item["account_name"] + account_id = item["account_id"] + # 搜索微信公众号的接口地址 + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } + # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + # 取搜索结果中的第一个公众号 + lists = search_response.json().get('list')[0] + # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 + fakeid = lists.get('fakeid') + logging.info("fakeid:" + fakeid) + # 微信公众号文章接口地址 + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + # 打开搜索的微信公众号文章列表页 + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + fakeid_list = query_fakeid_response.json().get('app_msg_list') + item = fakeid_list[0] + # 采集item示例 + new_article = { + 'title': item.get('title'), + 'article_url': item.get('link'), + 'account_id': account_id, + 'account_name': account_name, + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'), + 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + logging.info("new_article:", new_article) + article_urls.append(item.get('link')) + time.sleep(2) + + for article_url in article_urls: + print("正在爬取文章:" + article_url) + From 9ed09f13d6cd12662def6bf3a3c4ff3af0a6c9df Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 15:19:13 +0800 Subject: [PATCH 05/46] 'commit' --- dsLightRag/Test/TestCrawl.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py index 95e70278..7a9a4e6d 100644 --- a/dsLightRag/Test/TestCrawl.py +++ b/dsLightRag/Test/TestCrawl.py @@ -132,19 +132,20 @@ if __name__ == '__main__': # 打开搜索的微信公众号文章列表页 query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) fakeid_list = query_fakeid_response.json().get('app_msg_list') - item = fakeid_list[0] - # 采集item示例 - new_article = { - 'title': item.get('title'), - 'article_url': item.get('link'), - 'account_id': account_id, - 'account_name': account_name, - 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'), - 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - } - logging.info("new_article:", new_article) - article_urls.append(item.get('link')) - time.sleep(2) + + for item in fakeid_list: + # 采集item示例 + new_article = { + 'title': item.get('title'), + 'article_url': item.get('link'), + 'account_id': account_id, + 'account_name': account_name, + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'), + 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + logging.info("new_article:", new_article) + article_urls.append(item.get('link')) + time.sleep(1) for article_url in article_urls: print("正在爬取文章:" + article_url) From 69e8e833e6c6299d75cb0312118cb5b0f4782dac Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 14 Jul 2025 15:41:30 +0800 Subject: [PATCH 06/46] 'commit' --- .../Test/Test/Logs/article_bfc50bb7d7.html | 162 ++++++++++++++++++ dsLightRag/Test/TestCrawl.py | 148 ++++++++++++++-- 2 files changed, 295 insertions(+), 15 deletions(-) create mode 100644 dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html diff --git a/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html b/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html new file mode 100644 index 00000000..cd460649 --- /dev/null +++ b/dsLightRag/Test/Test/Logs/article_bfc50bb7d7.html @@ -0,0 +1,162 @@ + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py index 7a9a4e6d..63d2eb14 100644 --- a/dsLightRag/Test/TestCrawl.py +++ b/dsLightRag/Test/TestCrawl.py @@ -7,20 +7,10 @@ import datetime import logging import random import re +import os import requests -# 1、安装Firefox软件【最新】 -# https://www.firefox.com.cn/download/#product-desktop-release - -# 2、下载geckodriver驱动【最新】 -# https://splinter-docs-zh-cn.readthedocs.io/zh/latest/drivers/firefox.html -# https://github.com/mozilla/geckodriver/releases - -# 3、Python爬虫实战系列:微信公众号文章爬取的5种技术方案总结及代码示例! -# 方案5:微信公众号后台引用链接方式爬取 -# https://blog.csdn.net/Python_trys/article/details/146506009 - """ # 查看selenium版本 pip show selenium @@ -43,17 +33,25 @@ import json if __name__ == '__main__': # 定义一个空的字典,存放cookies内容 cookies = {} - # 设置headers + # 设置headers - 使用微信内置浏览器的User-Agent header = { "HOST": "mp.weixin.qq.com", - "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0" + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", + "Connection": "keep-alive" } # 用webdriver启动谷歌浏览器 logging.info("启动浏览器,打开微信公众号登录界面") options = Options() - options.add_argument('-headless') # 无头参数 + # options.add_argument('-headless') # 无头参数,调试时可以注释掉 + + # 设置微信内置浏览器的User-Agent + options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service) + driver = webdriver.Chrome(service=service, options=options) # 打开微信公众号登录页面 driver.get('https://mp.weixin.qq.com/') # 等待5秒钟 @@ -143,10 +141,130 @@ if __name__ == '__main__': 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'), 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') } + print("new_article:", new_article) logging.info("new_article:", new_article) article_urls.append(item.get('link')) time.sleep(1) + # 确保Logs目录存在 + logs_dir = "./Test/Logs" + if not os.path.exists(logs_dir): + os.makedirs(logs_dir) + for article_url in article_urls: print("正在爬取文章:" + article_url) + try: + # 使用requests直接获取文章内容,模拟微信环境 + wechat_headers = { + "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "Accept-Encoding": "gzip, deflate", + "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7", + "X-Requested-With": "com.tencent.mm", + "Referer": "https://mp.weixin.qq.com/" + } + + # 使用selenium打开文章链接,设置请求头 + driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': wechat_headers}) + driver.get(article_url) + # 增加等待时间,确保页面完全加载 + time.sleep(5) + + # 检查是否需要登录 + if "请在微信客户端中打开链接" in driver.page_source or "请在微信中打开此链接" in driver.page_source: + print(f"文章需要在微信中打开,尝试使用requests直接获取:{article_url}") + # 尝试使用requests直接获取 + response = requests.get(article_url, headers=wechat_headers, cookies=cookies) + if "请在微信客户端中打开链接" in response.text or "请在微信中打开此链接" in response.text: + print(f"使用requests仍然无法获取,跳过此文章:{article_url}") + continue + else: + # 保存获取到的HTML内容 + filename = f"article_{article_url.split('sn=')[1][:10] if 'sn=' in article_url else 'unknown'}" + save_path = f"{logs_dir}/{filename}.html" + with open(save_path, "w", encoding="utf-8") as f: + f.write(response.text) + print(f"已保存文章HTML内容:{save_path}") + continue + + # 使用更可靠的选择器查找标题和内容 + try: + # 尝试多种可能的标题选择器 + title_selectors = [ + '//h1[@class="rich_media_title"]', + '//h1[@id="activity-name"]', + '//h2[@class="rich_media_title"]', + '//div[@class="rich_media_content"]//h1', + '//div[@id="js_article"]//h1' + ] + + title = None + for selector in title_selectors: + try: + title_element = driver.find_element('xpath', selector) + title = title_element.text.strip() + if title: + break + except: + continue + + if not title: + # 如果所有选择器都失败,尝试从页面标题获取 + title = driver.title.replace(" - 微信公众号", "").strip() + + # 尝试多种可能的内容选择器 + content_selectors = [ + '//div[@class="rich_media_content"]', + '//div[@id="js_content"]', + '//div[@class="rich_media_wrp"]' + ] + + content = None + for selector in content_selectors: + try: + content_element = driver.find_element('xpath', selector) + content = content_element.text.strip() + if content: + break + except: + continue + + if not content: + # 如果无法获取内容,至少保存页面源码 + content = "无法提取正文内容,保存页面源码:\n" + driver.page_source + + # 创建文件名(使用标题,但去除不合法的文件名字符) + if not title: + title = "未知标题_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "未知标题" + + filename = re.sub(r'[\\/:*?"<>|]', '_', title) + + # 保存文章内容到文件 + save_path = f"{logs_dir}/{filename}.txt" + with open(save_path, "w", encoding="utf-8") as f: + f.write(f"标题:{title}\n\n") + f.write(f"链接:{article_url}\n\n") + f.write(f"内容:\n{content}") + + print(f"文章《{title}》保存成功:{save_path}") + + except Exception as e: + print(f"提取文章内容失败:{str(e)}") + # 保存页面源码以便分析 + error_filename = "error_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "error_page" + error_path = f"{logs_dir}/{error_filename}.html" + with open(error_path, "w", encoding="utf-8") as f: + f.write(driver.page_source) + print(f"已保存页面源码到:{error_path}") + + # 避免频繁请求被封 + time.sleep(random.uniform(3, 7)) + + except Exception as e: + print(f"爬取文章失败:{article_url},错误信息:{str(e)}") + continue + + # 关闭浏览器 + driver.quit() + print("所有文章爬取完成!") From 674d27e936c2342d398841ba9f96ab9b1579443d Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Mon, 14 Jul 2025 15:49:52 +0800 Subject: [PATCH 07/46] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/controller/DmController.py | 6 +- .../api/controller/DocumentController.py | 127 +++++++++- .../api/controller/LoginController.py | 7 +- .../api/controller/ThemeController.py | 73 +++++- dsAiTeachingModel/config/Config.py | 36 +-- dsAiTeachingModel/utils/Database.py | 230 ++++++++++++------ dsAiTeachingModel/utils/JwtUtil.py | 2 +- dsAiTeachingModel/utils/LightRagUtil.py | 2 +- dsAiTeachingModel/utils/PageUtil.py | 48 ++++ 9 files changed, 423 insertions(+), 108 deletions(-) create mode 100644 dsAiTeachingModel/utils/PageUtil.py diff --git a/dsAiTeachingModel/api/controller/DmController.py b/dsAiTeachingModel/api/controller/DmController.py index 41583878..7dbd0e51 100644 --- a/dsAiTeachingModel/api/controller/DmController.py +++ b/dsAiTeachingModel/api/controller/DmController.py @@ -13,11 +13,11 @@ router = APIRouter(dependencies=[Depends(get_current_user)]) async def get_stage_subject_list(): # 先查询学段list select_stage_sql: str = "select stage_id, stage_name from t_dm_stage where b_use = 1 order by sort_id;" - stage_list = await find_by_sql(select_stage_sql, ()) + stage_list = await find_by_sql(select_stage_sql,()) for stage in stage_list: # 再查询学科list - select_subject_sql: str = "select subject_id, subject_name from t_dm_subject where stage_id = %s order by sort_id;" - subject_list = await find_by_sql(select_subject_sql, (stage["stage_id"],)) + select_subject_sql: str = "select subject_id, subject_name from t_dm_subject where stage_id = " + str(stage["stage_id"]) + " order by sort_id;" + subject_list = await find_by_sql(select_subject_sql,()) stage["subject_list"] = subject_list return {"success": True, "message": "成功!", "data": stage_list} diff --git a/dsAiTeachingModel/api/controller/DocumentController.py b/dsAiTeachingModel/api/controller/DocumentController.py index 34576442..d88da710 100644 --- a/dsAiTeachingModel/api/controller/DocumentController.py +++ b/dsAiTeachingModel/api/controller/DocumentController.py @@ -1,13 +1,132 @@ # routes/LoginController.py +import os -from fastapi import APIRouter, Request, Response, Depends +from fastapi import APIRouter, Request, Response, Depends, UploadFile, File from auth.dependencies import get_current_user +from utils.PageUtil import * +from utils.ParseRequest import * # 创建一个路由实例,需要依赖get_current_user,登录后才能访问 router = APIRouter(dependencies=[Depends(get_current_user)]) +# 创建上传文件的目录 +UPLOAD_DIR = "upload_file" +if not os.path.exists(UPLOAD_DIR): + os.makedirs(UPLOAD_DIR) -@router.get("/") -async def test(request: Request, response: Response): - return {"success": True, "message": "成功!"} +# 合法文件扩展名 +supported_suffix_types = ['doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx'] + +# 【Document-1】文档管理列表 +@router.get("/list") +async def list(request: Request): + # 获取参数 + person_id = await get_request_str_param(request, "person_id", True, True) + stage_id = await get_request_num_param(request, "stage_id", False, True, -1) + subject_id = await get_request_num_param(request, "subject_id", False, True, -1) + document_suffix = await get_request_str_param(request, "document_suffix", False, True) + document_name = await get_request_str_param(request, "document_name", False, True) + page_number = await get_request_num_param(request, "page_number", False, True, 1) + page_size = await get_request_num_param(request, "page_size", False, True, 10) + + print(person_id, stage_id, subject_id, document_suffix, document_name, page_number, page_size) + + # 拼接查询SQL语句 + + select_document_sql: str = " SELECT * FROM t_ai_teaching_model_document WHERE is_deleted = 0 and person_id = '" + person_id + "'" + if stage_id != -1: + select_document_sql += " AND stage_id = " + str(stage_id) + if subject_id != -1: + select_document_sql += " AND subject_id = " + str(subject_id) + if document_suffix != "": + select_document_sql += " AND document_suffix = '" + document_suffix + "'" + if document_name != "": + select_document_sql += " AND document_name = '" + document_name + "'" + select_document_sql += " ORDER BY create_time DESC " + + # 查询文档列表 + page = await get_page_data_by_sql(select_document_sql, page_number, page_size) + for item in page["list"]: + theme_info = await find_by_id("t_ai_teaching_model_theme", "id", item["theme_id"]) + item["theme_info"] = theme_info + + return {"success": True, "message": "查询成功!", "data": page} + + +# 【Document-2】保存文档管理 +@router.post("/save") +async def save(request: Request, file: UploadFile = File(...)): + # 获取参数 + id = await get_request_num_param(request, "id", False, True, 0) + stage_id = await get_request_num_param(request, "stage_id", False, True, -1) + subject_id = await get_request_num_param(request, "subject_id", False, True, -1) + theme_id = await get_request_num_param(request, "theme_id", True, True, None) + person_id = await get_request_str_param(request, "person_id", True, True) + bureau_id = await get_request_str_param(request, "bureau_id", True, True) + # 先获取theme主题信息 + theme_object = await find_by_id("t_ai_teaching_model_theme", "id", theme_id) + if theme_object is None: + return {"success": False, "message": "主题不存在!"} + # 获取文件名 + document_name = file.filename + # 检查文件名在该主题下是否重复 + select_theme_document_sql: str = "SELECT * FROM t_ai_teaching_model_document WHERE is_deleted = 0 and document_name = '" + document_name + "'" + if id != 0: + select_theme_document_sql += " AND id <> " + id + theme_document = await find_by_sql(select_theme_document_sql, ()) + if theme_document is not None: + return {"success": False, "message": "该主题下文档名称重复!"} + # 获取文件扩展名 + document_suffix = file.filename.split(".")[-1] + # 检查文件扩展名 + if document_suffix not in supported_suffix_types: + return {"success": False, "message": "不支持的文件类型!"} + # 构造文件保存路径 + document_dir = UPLOAD_DIR + os.sep + str(theme_object["short_name"]) + "_" + str(theme_object["id"]) + os.sep + if not os.path.exists(document_dir): + os.makedirs(document_dir) + document_path = os.path.join(document_dir, file.filename) + # 保存文件 + try: + with open(document_path, "wb") as buffer: + buffer.write(await file.read()) + except Exception as e: + return {"success": False, "message": f"文件保存失败!{e}"} + + # 构造保存文档SQL语句 + param = {"stage_id": stage_id, "subject_id": subject_id, "document_name": document_name, "theme_id": theme_id, "document_path": document_path, "document_suffix": document_suffix, "person_id": person_id, "bureau_id": bureau_id} + + # 保存数据 + if id == 0: + param["train_flag"] = 0 + # 插入数据 + id = await insert("t_ai_teaching_model_document", param, False) + return {"success": True, "message": "保存成功!", "data": {"insert_id" : id}} + else: + # 更新数据 + await update("t_ai_teaching_model_document", param, "id", id) + return {"success": True, "message": "更新成功!", "data": {"update_id" : id}} + +# 【Document-3】获取文档信息 +@router.get("/get") +async def get(request: Request): + # 获取参数 + id = await get_request_num_param(request, "id", True, True, None) + # 查询数据 + document_object = await find_by_id("t_ai_teaching_model_document", "id", id) + if document_object is None: + return {"success": False, "message": "未查询到该文档信息!"} + theme_info = await find_by_id("t_ai_teaching_model_theme", "id", document_object["theme_id"]) + document_object["theme_info"] = theme_info + return {"success": True, "message": "查询成功!", "data": {"document": document_object}} + + +@router.post("/delete") +async def delete(request: Request): + # 获取参数 + id = await get_request_num_param(request, "id", True, True, None) + result = await delete_by_id("t_ai_teaching_model_document", "id", id) + if not result: + return {"success": False, "message": "删除失败!"} + return {"success": True, "message": "删除成功!"} \ No newline at end of file diff --git a/dsAiTeachingModel/api/controller/LoginController.py b/dsAiTeachingModel/api/controller/LoginController.py index 4a004d45..307fd3b6 100644 --- a/dsAiTeachingModel/api/controller/LoginController.py +++ b/dsAiTeachingModel/api/controller/LoginController.py @@ -13,7 +13,7 @@ from utils.CookieUtil import * from utils.Database import * from utils.JwtUtil import * from utils.ParseRequest import * -from config.Config import * +from Config.Config import * # 创建一个路由实例 router = APIRouter() @@ -108,8 +108,9 @@ async def login(request: Request, response: Response): return {"success": False, "message": "用户名和密码不能为空"} password = md5_encrypt(password) - select_user_sql: str = "SELECT person_id, person_name, identity_id, login_name, xb, bureau_id, org_id, pwdmd5 FROM t_sys_loginperson WHERE login_name = %s AND b_use = 1" - user = await find_one_by_sql(select_user_sql, (username,)) + select_user_sql: str = "SELECT person_id, person_name, identity_id, login_name, xb, bureau_id, org_id, pwdmd5 FROM t_sys_loginperson WHERE login_name = '" + username + "' AND b_use = 1" + userlist = await find_by_sql(select_user_sql,()) + user = userlist[0] if userlist else None logging.info(f"查询结果: {user}") if user and user['pwdmd5'] == password: # 验证的cas用户密码,md5加密的版本 token = create_access_token({"user_id": user['person_id'], "identity_id": user['identity_id']}) diff --git a/dsAiTeachingModel/api/controller/ThemeController.py b/dsAiTeachingModel/api/controller/ThemeController.py index 26903275..297817d1 100644 --- a/dsAiTeachingModel/api/controller/ThemeController.py +++ b/dsAiTeachingModel/api/controller/ThemeController.py @@ -3,12 +3,15 @@ from fastapi import APIRouter, Depends from utils.ParseRequest import * from auth.dependencies import * -from utils.Database import * +from utils.PageUtil import * # 创建一个路由实例,需要依赖get_current_user,登录后才能访问 router = APIRouter(dependencies=[Depends(get_current_user)]) - +# 功能:【Theme-1】主题管理列表 +# 作者:Kalman.CHENG ☆ +# 时间:2025-07-14 +# 备注: @router.get("/list") async def list(request: Request): # 获取参数 @@ -24,9 +27,9 @@ async def list(request: Request): # 拼接查询SQL语句 select_theme_sql: str = " SELECT * FROM t_ai_teaching_model_theme WHERE is_deleted = 0 and person_id = '" + person_id + "'" if stage_id != -1: - select_theme_sql += " and stage_id = " + stage_id + select_theme_sql += " and stage_id = " + str(stage_id) if subject_id != -1: - select_theme_sql += " and subject_id = " + subject_id + select_theme_sql += " and subject_id = " + str(subject_id) if theme_name != "": select_theme_sql += " and theme_name = '" + theme_name + "'" select_theme_sql += " ORDER BY create_time DESC" @@ -37,16 +40,76 @@ async def list(request: Request): return {"success": True, "message": "查询成功!", "data": page} +# 功能:【Theme-2】保存主题管理 +# 作者:Kalman.CHENG ☆ +# 时间:2025-07-14 +# 备注: @router.post("/save") async def save(request: Request): # 获取参数 id = await get_request_num_param(request, "id", False, True, 0) theme_name = await get_request_str_param(request, "theme_name", True, True) + short_name = await get_request_str_param(request, "short_name", True, True) theme_icon = await get_request_str_param(request, "theme_icon", False, True) stage_id = await get_request_num_param(request, "stage_id", True, True, None) subject_id = await get_request_num_param(request, "subject_id", True, True, None) person_id = await get_request_str_param(request, "person_id", True, True) bureau_id = await get_request_str_param(request, "bureau_id", True, True) - # 业务逻辑处理 + + # 校验参数 + check_theme_sql = "SELECT theme_name FROM t_ai_teaching_model_theme WHERE is_deleted = 0 and bureau_id = '" + bureau_id + "' and theme_name = '" + theme_name + "'" + if id != 0: + check_theme_sql += " and id <> " + id + print(check_theme_sql) + check_theme_result = await find_by_sql(check_theme_sql,()) + if check_theme_result: + return {"success": False, "message": "该主题名称已存在!"} + + check_short_name_sql = "SELECT short_name FROM t_ai_teaching_model_theme WHERE is_deleted = 0 and bureau_id = '" + bureau_id + "' and short_name = '" + short_name + "'" + if id != 0: + check_short_name_sql += " and id <> " + id + print(check_short_name_sql) + check_short_name_result = await find_by_sql(check_short_name_sql,()) + if check_short_name_result: + return {"success": False, "message": "该主题英文简称已存在!"} + + # 组装参数 + param = {"theme_name": theme_name,"short_name": short_name,"theme_icon": theme_icon,"stage_id": stage_id,"subject_id": subject_id,"person_id": person_id,"bureau_id": bureau_id} + + # 保存数据 + if id == 0: + param["search_flag"] = 0 + param["train_flag"] = 0 + # 插入数据 + id = await insert("t_ai_teaching_model_theme", param, False) + return {"success": True, "message": "保存成功!", "data": {"insert_id" : id}} + else: + # 更新数据 + await update("t_ai_teaching_model_theme", param, "id", id, False) + return {"success": True, "message": "更新成功!", "data": {"update_id" : id}} + + +# 功能:【Theme-3】获取主题信息 +# 作者:Kalman.CHENG ☆ +# 时间:2025-07-14 +# 备注: +@router.get("/get") +async def get(request: Request): + # 获取参数 + id = await get_request_num_param(request, "id", True, True, None) + theme_obj = await find_by_id("t_ai_teaching_model_theme", "id", id) + if theme_obj is None: + return {"success": False, "message": "未查询到该主题信息!"} + return {"success": True, "message": "查询成功!", "data": {"theme": theme_obj}} + + +@router.post("/delete") +async def delete(request: Request): + # 获取参数 + id = await get_request_num_param(request, "id", True, True, None) + result = await delete_by_id("t_ai_teaching_model_theme", "id", id) + if not result: + return {"success": False, "message": "删除失败!"} + return {"success": True, "message": "删除成功!"} diff --git a/dsAiTeachingModel/config/Config.py b/dsAiTeachingModel/config/Config.py index 3d4a460b..1b4ca3f3 100644 --- a/dsAiTeachingModel/config/Config.py +++ b/dsAiTeachingModel/config/Config.py @@ -1,13 +1,18 @@ -# 大模型 【DeepSeek深度求索官方】 -#LLM_API_KEY = "sk-44ae895eeb614aa1a9c6460579e322f1" -#LLM_BASE_URL = "https://api.deepseek.com" -#LLM_MODEL_NAME = "deepseek-chat" +# 阿里云的配置信息 +ALY_AK = 'LTAI5tE4tgpGcKWhbZg6C4bh' +ALY_SK = 'oizcTOZ8izbGUouboC00RcmGE8vBQ1' -# 阿里云提供的大模型服务 -LLM_API_KEY="sk-f6da0c787eff4b0389e4ad03a35a911f" +# 大模型 【DeepSeek深度求索官方】训练时用这个 +# LLM_API_KEY = "sk-44ae895eeb614aa1a9c6460579e322f1" +# LLM_BASE_URL = "https://api.deepseek.com" +# LLM_MODEL_NAME = "deepseek-chat" + +# 阿里云提供的大模型服务 【阿里云在处理文字材料时,容易引发绿网拦截,导致数据上报异常】 +LLM_API_KEY = "sk-f6da0c787eff4b0389e4ad03a35a911f" LLM_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1" #LLM_MODEL_NAME = "qwen-plus" # 不要使用通义千问,会导致化学方程式不正确! LLM_MODEL_NAME = "deepseek-v3" +#LLM_MODEL_NAME = "deepseek-r1" # 使用更牛B的r1模型 EMBED_MODEL_NAME = "BAAI/bge-m3" EMBED_API_KEY = "sk-pbqibyjwhrgmnlsmdygplahextfaclgnedetybccknxojlyl" @@ -15,21 +20,20 @@ EMBED_BASE_URL = "https://api.siliconflow.cn/v1" EMBED_DIM = 1024 EMBED_MAX_TOKEN_SIZE = 8192 - NEO4J_URI = "bolt://localhost:7687" NEO4J_USERNAME = "neo4j" NEO4J_PASSWORD = "DsideaL147258369" +NEO4J_AUTH = (NEO4J_USERNAME, NEO4J_PASSWORD) - -# MYSQL配置信息 -MYSQL_HOST = "127.0.0.1" -MYSQL_PORT = 22066 -MYSQL_USER = "root" -MYSQL_PASSWORD = "DsideaL147258369" -MYSQL_DB_NAME = "base_db" -MYSQL_POOL_SIZE = 200 +# POSTGRESQL配置信息 +AGE_GRAPH_NAME = "dickens" +POSTGRES_HOST = "10.10.14.208" +POSTGRES_PORT = 5432 +POSTGRES_USER = "postgres" +POSTGRES_PASSWORD = "postgres" +POSTGRES_DATABASE = "rag" # JWT配置信息 JWT_SECRET_KEY = "ZXZnZWVr5b+r5LmQ5L2g55qE5Ye66KGM" ALGORITHM = "HS256" -ACCESS_TOKEN_EXPIRE_MINUTES = 300000 # 访问令牌过期时间(分钟) +ACCESS_TOKEN_EXPIRE_MINUTES = 300000 # 访问令牌过期时间(分钟) \ No newline at end of file diff --git a/dsAiTeachingModel/utils/Database.py b/dsAiTeachingModel/utils/Database.py index db606938..85580029 100644 --- a/dsAiTeachingModel/utils/Database.py +++ b/dsAiTeachingModel/utils/Database.py @@ -1,25 +1,23 @@ # Database.py +import datetime import logging -import math +import asyncpg -import aiomysql -import asyncio -from config.Config import * +from Config.Config import * # 创建一个全局的连接池 pool = None -async def create_pool(loop): +async def create_pool(): global pool - pool = await aiomysql.create_pool( - host=MYSQL_HOST, - port=MYSQL_PORT, - user=MYSQL_USER, - password=MYSQL_PASSWORD, - db=MYSQL_DB_NAME, - minsize=1, # 设置连接池最小连接数 - maxsize=MYSQL_POOL_SIZE, # 设置连接池最大连接数 - cursorclass=aiomysql.DictCursor # 指定游标为字典模式 + pool = await asyncpg.create_pool( + host=POSTGRES_HOST, + port=POSTGRES_PORT, + user=POSTGRES_USER, + password=POSTGRES_PASSWORD, + database=POSTGRES_DATABASE, + min_size=1, # 设置连接池最小连接数 + max_size=100 # 设置连接池最大连接数 ) async def get_connection(): @@ -30,18 +28,17 @@ async def get_connection(): async def close_pool(): if pool is not None: - pool.close() - await pool.wait_closed() + await pool.close() # 初始化连接池的函数 async def init_database(): - loop = asyncio.get_event_loop() - await create_pool(loop) + await create_pool() # 关闭连接池的函数 async def shutdown_database(): await close_pool() + # 根据sql语句查询数据 async def find_by_sql(sql: str, params: tuple): if pool is None: @@ -49,79 +46,162 @@ async def find_by_sql(sql: str, params: tuple): return None try: async with pool.acquire() as conn: - async with conn.cursor() as cur: - await cur.execute(sql, params) - result = await cur.fetchall() - if result: - return result - else: - return None + result = await conn.fetch(sql, *params) + # 将 asyncpg.Record 转换为字典 + result_dict = [dict(record) for record in result] + if result_dict: + return result_dict + else: + return None except Exception as e: logging.error(f"数据库查询错误: {e}") return None +# 插入数据 +async def insert(tableName, param, onlyForParam=False): + current_time = datetime.datetime.now() + columns = [] + values = [] + placeholders = [] + + for key, value in param.items(): + if value is not None: + if isinstance(value, (int, float)): + columns.append(key) + values.append(value) + placeholders.append(f"${len(values)}") + elif isinstance(value, str): + columns.append(key) + values.append(value) + placeholders.append(f"${len(values)}") + else: + columns.append(key) + values.append(None) + placeholders.append("NULL") + + if not onlyForParam: + if 'is_deleted' not in param: + columns.append("is_deleted") + values.append(0) + placeholders.append(f"${len(values)}") + + if 'create_time' not in param: + columns.append("create_time") + values.append(current_time) + placeholders.append(f"${len(values)}") + + if 'update_time' not in param: + columns.append("update_time") + values.append(current_time) + placeholders.append(f"${len(values)}") + + # 构造 SQL 语句 + column_names = ", ".join(columns) + placeholder_names = ", ".join(placeholders) + sql = f"INSERT INTO {tableName} ({column_names}) VALUES ({placeholder_names}) RETURNING id" -# 根据sql语句查询数据 -async def find_one_by_sql(sql: str, params: tuple): - if pool is None: - logging.error("数据库连接池未创建") - return None try: async with pool.acquire() as conn: - async with conn.cursor() as cur: - await cur.execute(sql, params) - result = await cur.fetchone() + result = await conn.fetchrow(sql, *values) if result: - return result + return result['id'] else: + logging.error("插入数据失败: 未返回ID") return None except Exception as e: logging.error(f"数据库查询错误: {e}") - return None + logging.error(f"执行的SQL语句: {sql}") + logging.error(f"参数: {values}") + raise Exception(f"为表[{tableName}]插入数据失败: {e}") + + +# 更新数据 +async def update(table_name, param, property_name, property_value, only_for_param=False): + current_time = datetime.datetime.now() + set_clauses = [] + values = [] + + # 处理要更新的参数 + for key, value in param.items(): + if value is not None: + if isinstance(value, (int, float)): + set_clauses.append(f"{key} = ${len(values) + 1}") + values.append(value) + elif isinstance(value, str): + set_clauses.append(f"{key} = ${len(values) + 1}") + values.append(value) + else: + set_clauses.append(f"{key} = NULL") + values.append(None) + + if not only_for_param: + if 'update_time' not in param: + set_clauses.append(f"update_time = ${len(values) + 1}") + values.append(current_time) -# 查询数据条数 -async def get_total_data_count(total_data_sql): - total_data_count = 0 - total_data_count_sql = "select count(1) as count from (" + total_data_sql + ") as temp_table" - result = await find_one_by_sql(total_data_count_sql, ()) - if result: - total_data_count = result.get("count") - return total_data_count + # 构造 SQL 语句 + set_clause = ", ".join(set_clauses) + sql = f"UPDATE {table_name} SET {set_clause} WHERE {property_name} = ${len(values) + 1} RETURNING id" + print(sql) + # 添加条件参数 + values.append(property_value) -def get_page_by_total_row(total_data_count, page_number, page_size): - total_page = (page_size != 0) and math.floor((total_data_count + page_size - 1) / page_size) or 0 - if page_number <= 0: - page_number = 1 - if 0 < total_page < page_number: - page_number = total_page - offset = page_size * page_number - page_size - limit = page_size - return total_data_count, total_page, offset, limit + try: + async with pool.acquire() as conn: + result = await conn.fetchrow(sql, *values) + if result: + return result['id'] + else: + logging.error("更新数据失败: 未返回ID") + return None + except Exception as e: + logging.error(f"数据库查询错误: {e}") + logging.error(f"执行的SQL语句: {sql}") + logging.error(f"参数: {values}") + raise Exception(f"为表[{table_name}]更新数据失败: {e}") -async def get_page_data_by_sql(total_data_sql: str, page_number: int, page_size: int): - if pool is None: - logging.error("数据库连接池未创建") - return None - total_row: int = 0 - total_page: int = 0 - total_data_sql = total_data_sql.replace(";", "") - total_data_sql = total_data_sql.replace(" FROM ", " from ") - - # 查询总数 - total_data_count = await get_total_data_count(total_data_sql) - if total_data_count == 0: - return {"page_number": page_number, "page_size": page_size, "total_row": 0, "total_page": 0, "list": []} + +# 获取Bean +# 通过主键查询 +async def find_by_id(table_name, property_name, property_value): + if table_name and property_name and property_value is not None: + # 构造 SQL 语句 + sql = f"SELECT * FROM {table_name} WHERE is_deleted = 0 AND {property_name} = $1" + logging.debug(sql) + + # 执行查询 + result = await find_by_sql(sql, (property_value,)) + if not result: + logging.error("查询失败: 未找到数据") + return None + # 返回第一条数据 + return result[0] else: - total_row, total_page, offset, limit = get_page_by_total_row(total_data_count, page_number, page_size) - - # 构造执行分页查询的sql语句 - page_data_sql = total_data_sql + " LIMIT %d, %d " % (offset, limit) - print(page_data_sql) - # 执行分页查询 - page_data = await find_by_sql(page_data_sql, ()) - if page_data: - return {"page_number": page_number, "page_size": page_size, "total_row": total_row, "total_page": total_page, "list": page_data} + logging.error("参数不全") + return None + +# 通过主键删除 +# 逻辑删除 +async def delete_by_id(table_name, property_name, property_value): + if table_name and property_name and property_value is not None: + sql = f"UPDATE {table_name} SET is_deleted = 1, update_time = now() WHERE {property_name} = $1 and is_deleted = 0" + logging.debug(sql) + # 执行删除 + try: + async with pool.acquire() as conn: + result = await conn.execute(sql, property_value) + if result: + return True + else: + logging.error("删除失败: 未找到数据") + return False + except Exception as e: + logging.error(f"数据库查询错误: {e}") + logging.error(f"执行的SQL语句: {sql}") + logging.error(f"参数: {property_value}") + raise Exception(f"为表[{table_name}]删除数据失败: {e}") else: - return {"page_number": page_number, "page_size": page_size, "total_row": 0, "total_page": 0, "list": []} + logging.error("参数不全") + return False \ No newline at end of file diff --git a/dsAiTeachingModel/utils/JwtUtil.py b/dsAiTeachingModel/utils/JwtUtil.py index 4118a695..90b30808 100644 --- a/dsAiTeachingModel/utils/JwtUtil.py +++ b/dsAiTeachingModel/utils/JwtUtil.py @@ -2,7 +2,7 @@ from datetime import datetime, timedelta from jose import JWTError, jwt -from config.Config import * +from Config.Config import * def create_access_token(data: dict): diff --git a/dsAiTeachingModel/utils/LightRagUtil.py b/dsAiTeachingModel/utils/LightRagUtil.py index 4b038c1d..528f5963 100644 --- a/dsAiTeachingModel/utils/LightRagUtil.py +++ b/dsAiTeachingModel/utils/LightRagUtil.py @@ -8,7 +8,7 @@ from lightrag import LightRAG from lightrag.kg.shared_storage import initialize_pipeline_status from lightrag.llm.openai import openai_complete_if_cache, openai_embed from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug -from config.Config import * +from Config.Config import * async def print_stream(stream): diff --git a/dsAiTeachingModel/utils/PageUtil.py b/dsAiTeachingModel/utils/PageUtil.py new file mode 100644 index 00000000..ae84601e --- /dev/null +++ b/dsAiTeachingModel/utils/PageUtil.py @@ -0,0 +1,48 @@ +import math +from utils.Database import * + + +# 查询数据条数 +async def get_total_data_count(total_data_sql): + total_data_count = 0 + total_data_count_sql = "select count(*) as num from (" + total_data_sql + ") as temp_table" + result = await find_by_sql(total_data_count_sql,()) + row = result[0] if result else None + if row: + total_data_count = row.get("num") + return total_data_count + + +def get_page_by_total_row(total_data_count, page_number, page_size): + total_page = (page_size != 0) and math.floor((total_data_count + page_size - 1) / page_size) or 0 + if page_number <= 0: + page_number = 1 + if 0 < total_page < page_number: + page_number = total_page + offset = page_size * page_number - page_size + limit = page_size + return total_data_count, total_page, offset, limit + + +async def get_page_data_by_sql(total_data_sql: str, page_number: int, page_size: int): + total_row: int = 0 + total_page: int = 0 + total_data_sql = total_data_sql.replace(";", "") + total_data_sql = total_data_sql.replace(" FROM ", " from ") + + # 查询总数 + total_data_count = await get_total_data_count(total_data_sql) + if total_data_count == 0: + return {"page_number": page_number, "page_size": page_size, "total_row": 0, "total_page": 0, "list": []} + else: + total_row, total_page, offset, limit = get_page_by_total_row(total_data_count, page_number, page_size) + + # 构造执行分页查询的sql语句 + page_data_sql = total_data_sql + " LIMIT %d offset %d " % (limit, offset) + print(page_data_sql) + # 执行分页查询 + page_data = await find_by_sql(page_data_sql, ()) + if page_data: + return {"page_number": page_number, "page_size": page_size, "total_row": total_row, "total_page": total_page, "list": page_data} + else: + return {"page_number": page_number, "page_size": page_size, "total_row": 0, "total_page": 0, "list": []} From 0f161ab1c3f11536f555679f93b96a8ba565bc22 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 08:58:55 +0800 Subject: [PATCH 08/46] 'commit' --- dsLightRag/Test/T1_Login.py | 162 ++++++++++++++++++++ dsLightRag/Test/T2_GetList.py | 162 ++++++++++++++++++++ dsLightRag/Test/TestCrawl.py | 270 ---------------------------------- 3 files changed, 324 insertions(+), 270 deletions(-) create mode 100644 dsLightRag/Test/T1_Login.py create mode 100644 dsLightRag/Test/T2_GetList.py delete mode 100644 dsLightRag/Test/TestCrawl.py diff --git a/dsLightRag/Test/T1_Login.py b/dsLightRag/Test/T1_Login.py new file mode 100644 index 00000000..6db1e6f1 --- /dev/null +++ b/dsLightRag/Test/T1_Login.py @@ -0,0 +1,162 @@ +# 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 +# https://blog.csdn.net/k352733625/article/details/149222945 + +# 微信爬爬猫---公众号文章抓取代码分析 +# https://blog.csdn.net/yajuanpi4899/article/details/121584268 + +""" +安装pdfkit库 +复制 +pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com +1. +import pdfkit +pdfkit.from_url('公众号文章地址', 'out.pdf') +""" +import datetime +import logging +import random +import re + +import requests + +""" +# 查看selenium版本 +pip show selenium +4.34.2 + +# 查看Chrome浏览器版本 +chrome://version/ +138.0.7204.101 (正式版本) (64 位) + +# 下载驱动包 +https://googlechromelabs.github.io/chrome-for-testing/ +https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip +""" +import time +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService + +if __name__ == '__main__': + # 定义一个空的字典,存放cookies内容 + cookies = {} + # 设置headers - 使用微信内置浏览器的User-Agent + header = { + "HOST": "mp.weixin.qq.com", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", + "Connection": "keep-alive" + } + # 用webdriver启动谷歌浏览器 + logging.info("启动浏览器,打开微信公众号登录界面") + options = Options() + # options.add_argument('-headless') # 无头参数,调试时可以注释掉 + + # 设置微信内置浏览器的User-Agent + options.add_argument( + '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') + + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service, options=options) + # 打开微信公众号登录页面 + driver.get('https://mp.weixin.qq.com/') + # 等待5秒钟 + time.sleep(2) + # # 拿手机扫二维码! + logging.info("请拿手机扫码二维码登录公众号") + time.sleep(20) + + # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 + driver.get('https://mp.weixin.qq.com/') + # 获取cookies + cookie_items = driver.get_cookies() + # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 + for cookie_item in cookie_items: + cookies[cookie_item['name']] = cookie_item['value'] + + if "slave_sid" not in cookies: + logging.info("登录公众号失败,获取cookie失败") + exit() + # cookies = json.dumps(post) # 注释掉这一行 + + # 方法3:使用requests库发送请求获取重定向URL + url = 'https://mp.weixin.qq.com' + response = requests.get(url=url, allow_redirects=False, cookies=cookies) + if 'Location' in response.headers: + redirect_url = response.headers.get("Location") + print("重定向URL:", redirect_url) + token_match = re.findall(r'token=(\d+)', redirect_url) + if token_match: + token = token_match[0] + print("获取到的token:", token) + logging.info("微信token:" + token) + + article_urls = [] + gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] + for item in gzlist: + account_name = item["account_name"] + account_id = item["account_id"] + # 搜索微信公众号的接口地址 + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } + # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + # 取搜索结果中的第一个公众号 + lists = search_response.json().get('list')[0] + # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 + fakeid = lists.get('fakeid') + logging.info("fakeid:" + fakeid) + # 微信公众号文章接口地址 + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + # 打开搜索的微信公众号文章列表页 + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + fakeid_list = query_fakeid_response.json().get('app_msg_list') + + for item in fakeid_list: + # 采集item示例 + new_article = { + 'title': item.get('title'), + 'article_url': item.get('link'), + 'account_id': account_id, + 'account_name': account_name, + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( + '%Y-%m-%d %H:%M:%S'), + 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + logging.info("new_article:", new_article) + article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')}) + time.sleep(1) + + for x in article_urls: + print(x) + + # 关闭浏览器 + driver.quit() + print("所有文章爬取完成!") diff --git a/dsLightRag/Test/T2_GetList.py b/dsLightRag/Test/T2_GetList.py new file mode 100644 index 00000000..6db1e6f1 --- /dev/null +++ b/dsLightRag/Test/T2_GetList.py @@ -0,0 +1,162 @@ +# 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 +# https://blog.csdn.net/k352733625/article/details/149222945 + +# 微信爬爬猫---公众号文章抓取代码分析 +# https://blog.csdn.net/yajuanpi4899/article/details/121584268 + +""" +安装pdfkit库 +复制 +pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com +1. +import pdfkit +pdfkit.from_url('公众号文章地址', 'out.pdf') +""" +import datetime +import logging +import random +import re + +import requests + +""" +# 查看selenium版本 +pip show selenium +4.34.2 + +# 查看Chrome浏览器版本 +chrome://version/ +138.0.7204.101 (正式版本) (64 位) + +# 下载驱动包 +https://googlechromelabs.github.io/chrome-for-testing/ +https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip +""" +import time +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService + +if __name__ == '__main__': + # 定义一个空的字典,存放cookies内容 + cookies = {} + # 设置headers - 使用微信内置浏览器的User-Agent + header = { + "HOST": "mp.weixin.qq.com", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", + "Connection": "keep-alive" + } + # 用webdriver启动谷歌浏览器 + logging.info("启动浏览器,打开微信公众号登录界面") + options = Options() + # options.add_argument('-headless') # 无头参数,调试时可以注释掉 + + # 设置微信内置浏览器的User-Agent + options.add_argument( + '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') + + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service, options=options) + # 打开微信公众号登录页面 + driver.get('https://mp.weixin.qq.com/') + # 等待5秒钟 + time.sleep(2) + # # 拿手机扫二维码! + logging.info("请拿手机扫码二维码登录公众号") + time.sleep(20) + + # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 + driver.get('https://mp.weixin.qq.com/') + # 获取cookies + cookie_items = driver.get_cookies() + # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 + for cookie_item in cookie_items: + cookies[cookie_item['name']] = cookie_item['value'] + + if "slave_sid" not in cookies: + logging.info("登录公众号失败,获取cookie失败") + exit() + # cookies = json.dumps(post) # 注释掉这一行 + + # 方法3:使用requests库发送请求获取重定向URL + url = 'https://mp.weixin.qq.com' + response = requests.get(url=url, allow_redirects=False, cookies=cookies) + if 'Location' in response.headers: + redirect_url = response.headers.get("Location") + print("重定向URL:", redirect_url) + token_match = re.findall(r'token=(\d+)', redirect_url) + if token_match: + token = token_match[0] + print("获取到的token:", token) + logging.info("微信token:" + token) + + article_urls = [] + gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] + for item in gzlist: + account_name = item["account_name"] + account_id = item["account_id"] + # 搜索微信公众号的接口地址 + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } + # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + # 取搜索结果中的第一个公众号 + lists = search_response.json().get('list')[0] + # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 + fakeid = lists.get('fakeid') + logging.info("fakeid:" + fakeid) + # 微信公众号文章接口地址 + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + # 打开搜索的微信公众号文章列表页 + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + fakeid_list = query_fakeid_response.json().get('app_msg_list') + + for item in fakeid_list: + # 采集item示例 + new_article = { + 'title': item.get('title'), + 'article_url': item.get('link'), + 'account_id': account_id, + 'account_name': account_name, + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( + '%Y-%m-%d %H:%M:%S'), + 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + } + logging.info("new_article:", new_article) + article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')}) + time.sleep(1) + + for x in article_urls: + print(x) + + # 关闭浏览器 + driver.quit() + print("所有文章爬取完成!") diff --git a/dsLightRag/Test/TestCrawl.py b/dsLightRag/Test/TestCrawl.py deleted file mode 100644 index 63d2eb14..00000000 --- a/dsLightRag/Test/TestCrawl.py +++ /dev/null @@ -1,270 +0,0 @@ -# 详解(一)Python + Selenium 批量采集微信公众号,搭建自己的微信公众号每日AI简报,告别信息焦虑 -# https://blog.csdn.net/k352733625/article/details/149222945 - -# 微信爬爬猫---公众号文章抓取代码分析 -# https://blog.csdn.net/yajuanpi4899/article/details/121584268 -import datetime -import logging -import random -import re -import os - -import requests - -""" -# 查看selenium版本 -pip show selenium -4.34.2 - -# 查看Chrome浏览器版本 -chrome://version/ -138.0.7204.101 (正式版本) (64 位) - -# 下载驱动包 -https://googlechromelabs.github.io/chrome-for-testing/ -https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip -""" -import time -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService -import json - -if __name__ == '__main__': - # 定义一个空的字典,存放cookies内容 - cookies = {} - # 设置headers - 使用微信内置浏览器的User-Agent - header = { - "HOST": "mp.weixin.qq.com", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Encoding": "gzip, deflate, br", - "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", - "Connection": "keep-alive" - } - # 用webdriver启动谷歌浏览器 - logging.info("启动浏览器,打开微信公众号登录界面") - options = Options() - # options.add_argument('-headless') # 无头参数,调试时可以注释掉 - - # 设置微信内置浏览器的User-Agent - options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') - - service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) - # 打开微信公众号登录页面 - driver.get('https://mp.weixin.qq.com/') - # 等待5秒钟 - time.sleep(2) - # # 拿手机扫二维码! - logging.info("请拿手机扫码二维码登录公众号") - time.sleep(20) - - # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 - driver.get('https://mp.weixin.qq.com/') - # 获取cookies - cookie_items = driver.get_cookies() - # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 - for cookie_item in cookie_items: - cookies[cookie_item['name']] = cookie_item['value'] - - if "slave_sid" not in cookies: - logging.info("登录公众号失败,获取cookie失败") - exit() - # cookies = json.dumps(post) # 注释掉这一行 - - # 方法3:使用requests库发送请求获取重定向URL - url = 'https://mp.weixin.qq.com' - response = requests.get(url=url, allow_redirects=False, cookies=cookies) - if 'Location' in response.headers: - redirect_url = response.headers.get("Location") - print("重定向URL:", redirect_url) - token_match = re.findall(r'token=(\d+)', redirect_url) - if token_match: - token = token_match[0] - print("获取到的token:", token) - logging.info("微信token:" + token) - - article_urls = [] - gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] - for item in gzlist: - account_name = item["account_name"] - account_id = item["account_id"] - # 搜索微信公众号的接口地址 - search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' - # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 - query_id = { - 'action': 'search_biz', - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'query': account_name, - 'begin': '0', - 'count': '5' - } - # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers - search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) - # 取搜索结果中的第一个公众号 - lists = search_response.json().get('list')[0] - # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 - fakeid = lists.get('fakeid') - logging.info("fakeid:" + fakeid) - # 微信公众号文章接口地址 - appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' - # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random - query_id_data = { - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'action': 'list_ex', - 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 - 'count': '5', - 'query': '', - 'fakeid': fakeid, - 'type': '9' - } - # 打开搜索的微信公众号文章列表页 - query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) - fakeid_list = query_fakeid_response.json().get('app_msg_list') - - for item in fakeid_list: - # 采集item示例 - new_article = { - 'title': item.get('title'), - 'article_url': item.get('link'), - 'account_id': account_id, - 'account_name': account_name, - 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S'), - 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - } - print("new_article:", new_article) - logging.info("new_article:", new_article) - article_urls.append(item.get('link')) - time.sleep(1) - - # 确保Logs目录存在 - logs_dir = "./Test/Logs" - if not os.path.exists(logs_dir): - os.makedirs(logs_dir) - - for article_url in article_urls: - print("正在爬取文章:" + article_url) - try: - # 使用requests直接获取文章内容,模拟微信环境 - wechat_headers = { - "User-Agent": "Mozilla/5.0 (Linux; Android 10; MI 8 Build/QKQ1.190828.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/83.0.4103.101 Mobile Safari/537.36 XWEB/1768 MMWEBSDK/20210302 MMWEBID/6253 MicroMessenger/8.0.2.1860(0x28000234) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", - "Accept-Encoding": "gzip, deflate", - "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7", - "X-Requested-With": "com.tencent.mm", - "Referer": "https://mp.weixin.qq.com/" - } - - # 使用selenium打开文章链接,设置请求头 - driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': wechat_headers}) - driver.get(article_url) - # 增加等待时间,确保页面完全加载 - time.sleep(5) - - # 检查是否需要登录 - if "请在微信客户端中打开链接" in driver.page_source or "请在微信中打开此链接" in driver.page_source: - print(f"文章需要在微信中打开,尝试使用requests直接获取:{article_url}") - # 尝试使用requests直接获取 - response = requests.get(article_url, headers=wechat_headers, cookies=cookies) - if "请在微信客户端中打开链接" in response.text or "请在微信中打开此链接" in response.text: - print(f"使用requests仍然无法获取,跳过此文章:{article_url}") - continue - else: - # 保存获取到的HTML内容 - filename = f"article_{article_url.split('sn=')[1][:10] if 'sn=' in article_url else 'unknown'}" - save_path = f"{logs_dir}/{filename}.html" - with open(save_path, "w", encoding="utf-8") as f: - f.write(response.text) - print(f"已保存文章HTML内容:{save_path}") - continue - - # 使用更可靠的选择器查找标题和内容 - try: - # 尝试多种可能的标题选择器 - title_selectors = [ - '//h1[@class="rich_media_title"]', - '//h1[@id="activity-name"]', - '//h2[@class="rich_media_title"]', - '//div[@class="rich_media_content"]//h1', - '//div[@id="js_article"]//h1' - ] - - title = None - for selector in title_selectors: - try: - title_element = driver.find_element('xpath', selector) - title = title_element.text.strip() - if title: - break - except: - continue - - if not title: - # 如果所有选择器都失败,尝试从页面标题获取 - title = driver.title.replace(" - 微信公众号", "").strip() - - # 尝试多种可能的内容选择器 - content_selectors = [ - '//div[@class="rich_media_content"]', - '//div[@id="js_content"]', - '//div[@class="rich_media_wrp"]' - ] - - content = None - for selector in content_selectors: - try: - content_element = driver.find_element('xpath', selector) - content = content_element.text.strip() - if content: - break - except: - continue - - if not content: - # 如果无法获取内容,至少保存页面源码 - content = "无法提取正文内容,保存页面源码:\n" + driver.page_source - - # 创建文件名(使用标题,但去除不合法的文件名字符) - if not title: - title = "未知标题_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "未知标题" - - filename = re.sub(r'[\\/:*?"<>|]', '_', title) - - # 保存文章内容到文件 - save_path = f"{logs_dir}/{filename}.txt" - with open(save_path, "w", encoding="utf-8") as f: - f.write(f"标题:{title}\n\n") - f.write(f"链接:{article_url}\n\n") - f.write(f"内容:\n{content}") - - print(f"文章《{title}》保存成功:{save_path}") - - except Exception as e: - print(f"提取文章内容失败:{str(e)}") - # 保存页面源码以便分析 - error_filename = "error_" + article_url.split("sn=")[1][:10] if "sn=" in article_url else "error_page" - error_path = f"{logs_dir}/{error_filename}.html" - with open(error_path, "w", encoding="utf-8") as f: - f.write(driver.page_source) - print(f"已保存页面源码到:{error_path}") - - # 避免频繁请求被封 - time.sleep(random.uniform(3, 7)) - - except Exception as e: - print(f"爬取文章失败:{article_url},错误信息:{str(e)}") - continue - - # 关闭浏览器 - driver.quit() - print("所有文章爬取完成!") - From d57b2b94c566b734e3c32c455b22252237ce3eef Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 09:09:08 +0800 Subject: [PATCH 09/46] 'commit' --- dsLightRag/Test/T1_Login.py | 98 +++----------------------------- dsLightRag/Test/T2_GetList.py | 45 +++++---------- dsLightRag/Test/article_urls.txt | 0 dsLightRag/Test/cookies.txt | 1 + 4 files changed, 22 insertions(+), 122 deletions(-) create mode 100644 dsLightRag/Test/article_urls.txt create mode 100644 dsLightRag/Test/cookies.txt diff --git a/dsLightRag/Test/T1_Login.py b/dsLightRag/Test/T1_Login.py index 6db1e6f1..8c4c57e0 100644 --- a/dsLightRag/Test/T1_Login.py +++ b/dsLightRag/Test/T1_Login.py @@ -4,15 +4,8 @@ # 微信爬爬猫---公众号文章抓取代码分析 # https://blog.csdn.net/yajuanpi4899/article/details/121584268 -""" -安装pdfkit库 -复制 -pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com -1. -import pdfkit -pdfkit.from_url('公众号文章地址', 'out.pdf') -""" import datetime +import json import logging import random import re @@ -52,11 +45,6 @@ if __name__ == '__main__': # 用webdriver启动谷歌浏览器 logging.info("启动浏览器,打开微信公众号登录界面") options = Options() - # options.add_argument('-headless') # 无头参数,调试时可以注释掉 - - # 设置微信内置浏览器的User-Agent - options.add_argument( - '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") driver = webdriver.Chrome(service=service, options=options) @@ -81,82 +69,10 @@ if __name__ == '__main__': exit() # cookies = json.dumps(post) # 注释掉这一行 - # 方法3:使用requests库发送请求获取重定向URL - url = 'https://mp.weixin.qq.com' - response = requests.get(url=url, allow_redirects=False, cookies=cookies) - if 'Location' in response.headers: - redirect_url = response.headers.get("Location") - print("重定向URL:", redirect_url) - token_match = re.findall(r'token=(\d+)', redirect_url) - if token_match: - token = token_match[0] - print("获取到的token:", token) - logging.info("微信token:" + token) - - article_urls = [] - gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] - for item in gzlist: - account_name = item["account_name"] - account_id = item["account_id"] - # 搜索微信公众号的接口地址 - search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' - # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 - query_id = { - 'action': 'search_biz', - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'query': account_name, - 'begin': '0', - 'count': '5' - } - # 打开搜索微信公众号接口地址,需要传入相关参数信息如:cookies、params、headers - search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) - # 取搜索结果中的第一个公众号 - lists = search_response.json().get('list')[0] - # 获取这个公众号的fakeid,后面爬取公众号文章需要此字段 - fakeid = lists.get('fakeid') - logging.info("fakeid:" + fakeid) - # 微信公众号文章接口地址 - appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' - # 搜索文章需要传入几个参数:登录的公众号token、要爬取文章的公众号fakeid、随机数random - query_id_data = { - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'action': 'list_ex', - 'begin': '0', # 不同页,此参数变化,变化规则为每页加5 - 'count': '5', - 'query': '', - 'fakeid': fakeid, - 'type': '9' - } - # 打开搜索的微信公众号文章列表页 - query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) - fakeid_list = query_fakeid_response.json().get('app_msg_list') - - for item in fakeid_list: - # 采集item示例 - new_article = { - 'title': item.get('title'), - 'article_url': item.get('link'), - 'account_id': account_id, - 'account_name': account_name, - 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( - '%Y-%m-%d %H:%M:%S'), - 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') - } - logging.info("new_article:", new_article) - article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')}) - time.sleep(1) - - for x in article_urls: - print(x) - - # 关闭浏览器 + # 将cookies写入文件 + with open('cookies.txt', mode='w', encoding="utf-8") as f: + f.write(json.dumps(cookies)) + # 关闭浏览器 driver.quit() - print("所有文章爬取完成!") + # 输出提示 + print("成功获取了cookies内容!") diff --git a/dsLightRag/Test/T2_GetList.py b/dsLightRag/Test/T2_GetList.py index 6db1e6f1..d49c40b9 100644 --- a/dsLightRag/Test/T2_GetList.py +++ b/dsLightRag/Test/T2_GetList.py @@ -13,6 +13,7 @@ import pdfkit pdfkit.from_url('公众号文章地址', 'out.pdf') """ import datetime +import json import logging import random import re @@ -38,8 +39,12 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService if __name__ == '__main__': - # 定义一个空的字典,存放cookies内容 - cookies = {} + # 从文件cookies.txt中获取 + with open('cookies.txt', 'r', encoding='utf-8') as f: + content = f.read() + # 使用json还原为json对象 + cookies = json.loads(content) + options = Options() # 设置headers - 使用微信内置浏览器的User-Agent header = { "HOST": "mp.weixin.qq.com", @@ -49,37 +54,9 @@ if __name__ == '__main__': "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.5;q=0.4", "Connection": "keep-alive" } - # 用webdriver启动谷歌浏览器 - logging.info("启动浏览器,打开微信公众号登录界面") - options = Options() - # options.add_argument('-headless') # 无头参数,调试时可以注释掉 - - # 设置微信内置浏览器的User-Agent - options.add_argument( - '--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/4.0.1301.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2875.116 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63010200)') service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") driver = webdriver.Chrome(service=service, options=options) - # 打开微信公众号登录页面 - driver.get('https://mp.weixin.qq.com/') - # 等待5秒钟 - time.sleep(2) - # # 拿手机扫二维码! - logging.info("请拿手机扫码二维码登录公众号") - time.sleep(20) - - # 重新载入公众号登录页,登录之后会显示公众号后台首页,从这个返回内容中获取cookies信息 - driver.get('https://mp.weixin.qq.com/') - # 获取cookies - cookie_items = driver.get_cookies() - # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 - for cookie_item in cookie_items: - cookies[cookie_item['name']] = cookie_item['value'] - - if "slave_sid" not in cookies: - logging.info("登录公众号失败,获取cookie失败") - exit() - # cookies = json.dumps(post) # 注释掉这一行 # 方法3:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' @@ -151,11 +128,17 @@ if __name__ == '__main__': 'collection_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') } logging.info("new_article:", new_article) - article_urls.append({"title":item.get('title'),"url":item.get('link'),"publish_time":datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S')}) + article_urls.append({"title": item.get('title'), "url": item.get('link'), + "publish_time": datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime( + '%Y-%m-%d %H:%M:%S')}) time.sleep(1) for x in article_urls: print(x) + # 将返回的地址写入到文件 + with open('article_urls.txt', 'w', encoding='utf-8') as f: + for url in article_urls: + f.write(url + '\n') # 关闭浏览器 driver.quit() diff --git a/dsLightRag/Test/article_urls.txt b/dsLightRag/Test/article_urls.txt new file mode 100644 index 00000000..e69de29b diff --git a/dsLightRag/Test/cookies.txt b/dsLightRag/Test/cookies.txt new file mode 100644 index 00000000..4999d643 --- /dev/null +++ b/dsLightRag/Test/cookies.txt @@ -0,0 +1 @@ +{"_clsk": "1v8cz8t|1752541383487|1|1|mp.weixin.qq.com/weheat-agent/payload/record", "xid": "fff1911b542cde79c5c47a38cb3929c8", "data_bizuin": "3514353238", "slave_user": "gh_4f88a4e194da", "slave_sid": "cDlUaWlaek5RZHV6SUIyVWNNZlJGYTJQdHY5YzUyN29LMG94RlptUV9lbkVDUWxmaTBURFE5YWNKeVRkYlZSdU9VRnNjWXRKN2xfZ2pZd0JWal82aVpsRDhqUnJXQkdYMml4SlhrdGtGY2k2MG95YTlQVEFVanpIR01oZ3p4dldiME9hRE1zcGxZV0FlNTVV", "rand_info": "CAESIPFuk5/nui6QoQ6zEO2B5RfaUmjuQjTJOQVg9mBuI/XG", "data_ticket": "AIy4PwNlFMRBDHcZ7jcXDXf/8fFLl5NS25Nj3tYuDL8H4W8EiURU4G9Dakn7aSUC", "bizuin": "3514353238", "mm_lang": "zh_CN", "slave_bizuin": "3514353238", "uuid": "91eaae9bc5e4f725e03ee2b7e75c8a2c", "ua_id": "bbkG1LsuVI1DszGdAAAAADm2HzejXloc87mSyGEMpdY=", "wxuin": "52541365079710", "_clck": "1l32fbr|1|fxm|0"} \ No newline at end of file From 8148f2a87d0463bf66a3577504b400db7b19e8e2 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 09:11:53 +0800 Subject: [PATCH 10/46] 'commit' --- dsLightRag/Test/T2_GetList.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dsLightRag/Test/T2_GetList.py b/dsLightRag/Test/T2_GetList.py index d49c40b9..53fe831a 100644 --- a/dsLightRag/Test/T2_GetList.py +++ b/dsLightRag/Test/T2_GetList.py @@ -45,6 +45,7 @@ if __name__ == '__main__': # 使用json还原为json对象 cookies = json.loads(content) options = Options() + options.add_argument('-headless') # 无头参数,调试时可以注释掉 # 设置headers - 使用微信内置浏览器的User-Agent header = { "HOST": "mp.weixin.qq.com", @@ -137,8 +138,8 @@ if __name__ == '__main__': print(x) # 将返回的地址写入到文件 with open('article_urls.txt', 'w', encoding='utf-8') as f: - for url in article_urls: - f.write(url + '\n') + for record in article_urls: + f.write(record['title']+" "+record['publish_time']+" "+record['url'] + '\n') # 关闭浏览器 driver.quit() From a3227c1967c44e528ab0f426860e924e6ed18e4d Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 09:24:10 +0800 Subject: [PATCH 11/46] 'commit' --- dsLightRag/Doc/2、Conda维护.txt | 10 +++++----- .../{T2_GetList.py => T2_GetArticleList.py} | 9 +-------- dsLightRag/Test/T3_GetArticle.py | 15 +++++++++++++++ dsLightRag/Test/article_urls.txt | 11 +++++++++++ dsLightRag/Test/out.pdf | Bin 0 -> 19544 bytes 5 files changed, 32 insertions(+), 13 deletions(-) rename dsLightRag/Test/{T2_GetList.py => T2_GetArticleList.py} (94%) create mode 100644 dsLightRag/Test/T3_GetArticle.py create mode 100644 dsLightRag/Test/out.pdf diff --git a/dsLightRag/Doc/2、Conda维护.txt b/dsLightRag/Doc/2、Conda维护.txt index 021764bf..80bfef40 100644 --- a/dsLightRag/Doc/2、Conda维护.txt +++ b/dsLightRag/Doc/2、Conda维护.txt @@ -5,7 +5,7 @@ conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/f conda config --set show_channel_urls yes # 创建虚拟环境 -conda create -n rag python=3.10 +conda create -n py310 python=3.10 # 查看当前存在哪些虚拟环境 conda env list @@ -15,16 +15,16 @@ conda info -e conda list # 激活虚拟环境 -conda activate rag +conda activate py310 # 对虚拟环境中安装额外的包 -conda install -n rag $package_name +conda install -n py310 $package_name # 删除虚拟环境 -conda remove -n rag --all +conda remove -n py310 --all # 删除环境中的某个包 -conda remove --name rag $package_name +conda remove --name py310 $package_name # 恢复默认镜像 conda config --remove-key channels diff --git a/dsLightRag/Test/T2_GetList.py b/dsLightRag/Test/T2_GetArticleList.py similarity index 94% rename from dsLightRag/Test/T2_GetList.py rename to dsLightRag/Test/T2_GetArticleList.py index 53fe831a..bbf4318f 100644 --- a/dsLightRag/Test/T2_GetList.py +++ b/dsLightRag/Test/T2_GetArticleList.py @@ -4,14 +4,7 @@ # 微信爬爬猫---公众号文章抓取代码分析 # https://blog.csdn.net/yajuanpi4899/article/details/121584268 -""" -安装pdfkit库 -复制 -pip3 install pdfkit -i http://pypi.douban.com/simple --trusted-host pypi.douban.com -1. -import pdfkit -pdfkit.from_url('公众号文章地址', 'out.pdf') -""" + import datetime import json import logging diff --git a/dsLightRag/Test/T3_GetArticle.py b/dsLightRag/Test/T3_GetArticle.py new file mode 100644 index 00000000..02e9cce8 --- /dev/null +++ b/dsLightRag/Test/T3_GetArticle.py @@ -0,0 +1,15 @@ +""" +安装pdfkit库 +https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf + +我是在Windows上开发的,所以,下载的是:【注意要科学上网下载,否则太慢了~】 +https://release-assets.githubusercontent.com/github-production-release-asset/131323182/3200f380-aba8-11ea-8942-42fa5e27a312?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-07-15T02%3A10%3A32Z&rscd=attachment%3B+filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-07-15T01%3A10%3A07Z&ske=2025-07-15T02%3A10%3A32Z&sks=b&skv=2018-11-09&sig=IYNB2Gi%2FZ9tZfPXmo7PbqjbxmcLULpP%2Bex2z6lp2DvE%3D&jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1MjU0MjU3NSwibmJmIjoxNzUyNTQyMjc1LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvcmUud2luZG93cy5uZXQifQ.LyZXiO_mRK2qX99CTJtVwypU4DLsK-_Js0wspzsL0Y4&response-content-disposition=attachment%3B%20filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&response-content-type=application%2Foctet-stream +解压到D:\wkhtmltox中,还要注意把路径加到环境变量中 + +conda activate py310 +pip3 install pdfkit +""" +import pdfkit +path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe' #wkhtmltopdf安装位置 +config = pdfkit.configuration(wkhtmltopdf = path_wk) +pdfkit.from_url('http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd', 'out.pdf',configuration=config) \ No newline at end of file diff --git a/dsLightRag/Test/article_urls.txt b/dsLightRag/Test/article_urls.txt index e69de29b..c8d57b34 100644 --- a/dsLightRag/Test/article_urls.txt +++ b/dsLightRag/Test/article_urls.txt @@ -0,0 +1,11 @@ +长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd +独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd +长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd +喜报!长春外国语学校女子篮球队夺得冠军! 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=2&sn=31651043acb6ecbf4232e92e635196b6&chksm=84e1ab1db396220b0810c3bdf332128b110d1902658f2556eaeff67cec084a8a068a5ae9a275#rd +“趣闯盛夏·探无界”!探秘一实验银河小学夏令营 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=3&sn=8edf6ce8cebdaad55343b39639876c27&chksm=84e1ab1db396220b26b172b3b565f919f7ded4c2a5b78227294ea29a558a7666c33b8c1de660#rd +刚刚!2025年长春中考各批次控制线公布! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=1&sn=282e5e824410a9a92a83dd800cb58a7c&chksm=84e1aba6b39622b03fe6422032474c9696f83541d9ff9b8b6a9f0f099ce459da430f720d05e4#rd +重磅消息!师大附属实验学校(经开)校长有新任命! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=2&sn=9449c87935faf86ddcc5a674ea888913&chksm=84e1aba6b39622b03fd8413ff1e74b61f662ec8deb3887c2c5b8e5ad15470b15ae14b21e94ea#rd +市教育局最新发布!长春2025年中考成绩将于7月12日公布! 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=1&sn=a5b4104d2fe74ace32ab31faf5f1c44c&chksm=84e1abb2b39622a40b0e0969e84fb00c753cc8ffefb8624726afa2a7352ea725c7f967bf25f5#rd +长春市第十九中学2025年职称评聘拟通过人员名单的公示!有你认识的老师吗? 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd +高分喜报频传!长春这所小学靠啥成为“学霸制造机”? 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd +蝉联冠军!吉大尚德游泳队斩获骄人成绩! 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd diff --git a/dsLightRag/Test/out.pdf b/dsLightRag/Test/out.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4aec06ad8c5bbf081626dcc93f2bfe0877c5ab8b GIT binary patch literal 19544 zcmd6P2{ct*__vgJRfbGu$dwSno$poV;hM>mDKbmuIdhq(gb*^6nauML8VnH$WeCZX z$`pwxeCJ%0(*OJSertVez1F(UJFbf$<_qVGt<9#_$TM zfB-3Le>9+R-rmO8$;f0&MHwgvAwU9`u)qhiQ=z$ywS<8q(1yfWI24Y8VqhpJKOBL; za6_>iP$&l{sgMwI>+E|WIVqpm0ofGH}vtZC&O{wAgM%3Ec#u4ZaCqqa4xxBfx1u373iIF2j z2Zn@gR|$b&FeqN22)&@7)Q1dorcmW&*MVFa3j z@TsfFkb?gpr>__|8#ox*o7*~`b+$1#zIYmXk@FmWG=DmZ+GF&212bTF&Kua9SUW=C zz-1MnS8c2vfl~(v41^{iP%&|^ak4iu0VE(K014h#;1nD};8elH*xW$O=IWNYD1HrAEN$#nZ4Hb}@IIXvSG9#~R|J0j2?D?rRa>(ift0<0D_G@E5aQLP zR1F~8R&7BO0|7AuShaPEg@C9~!E5f|4CqR*A{^i-z78B3g8orRQ_1iOIA~jies&(- zSm}RvjnD%go%-ju?g6?gCj7#tw^sX zHpMDfFu*7R=GSP>5}&caU1B}$f7OD{?esCeB?>o^JDF_#_gJoPtjqSEy?>eURzO*K zv5DXbb5lK;v?TALi*55O_HV4b+6*jfZ@l2b>gvH8%upgK<2Ew-%_H{x!RK5#dJU86 z$*oS*Qtk6;L%&sV(Qy&gDfIA@4xX;KbHh7`&usC+1ZVlle99yA{lu@9RSyIT$i7^8 zk{@^ba8oMX=a}2LXB*nI-?Yrvk_eb459iMmPSi5?yAODP5auU% z?;i?8a9ARuL_ghvW8CUQ^GqYb>ZD;22)EGPc&WSCHYx|uzA_LybMU2_G?LyVtQk)<5$3p^yV6jyl0$|$5zUR54n zX-l3T$>O;t!glI-cNx8J4%?2G*l4Z^5SJYkeF4K%a zxb!AVJs)SyaNmqkY58)Zd)>ox`MZj;UT$AXb_=JXMub|ESvZ-_<-8eHrk19%oQ)+9 zZlirZl`MC(g7k<>K6IbwMd=gPaAa+=E#H}QaI08lQ}&WgVQ^hQtv-;otLy#n0j}f} z-S@?75AwZ7#>d5kxPFpMQs2 z0b1O(^x&!G|IP3^K(KbC`C6CCt> zhrf0v_%_Y_Zxg)7gIwUJhJ@+shoML! zdxwbXfP?I&ZlG#fqJHiWWSye?Kd$5v!4SQC+r&?x}ncv!kO( zu{jNr2|@y&Ad&DtfMjR2g`rS8Ac5~)>He=Ef$v!~fTbFbitR-M{$<|6?G* zJF&GG{8$C>p6}SZ4HEcYt^zj{i%AJ#hlfAY#QRTY+;Fj)BJ=S*e-<0$n|L>fgO=EP z78)O}Wf51f4BKV9NQ<;NBWYrL+KI;1n%*Nd!k&;EM@R(05%TjY z{UmOwkII*bT-+&dzEV5Ps%fRZD&itm=#XnZaQ}_PyhN6bcDq?W@23kDoKnVz8G7gI zQXar2w2F;viOLM6HxGOfv&<(KiCBRFWCgXhfUzWvI+ce*#xh*)A9c` zp!|VN@SgA3y91QJWD@`=WRD>f!}HwMXwEuc6@62v9hBuTS@WZp5=PprcAr*=o}JfS zddf6kCs5T_FizP?Zt3}8{JahGb-ykoU2M-8B6G>6%UbGTPY(P;b%4Z`J#Iv@q2qn< zQA~AVm9X)GnnDxwx&dn&aUO!{c683P%!F#@A?>15pe@!WZhaN)#q-YGCLDtE2d$b=vKzz^GmV!9vt~B>S;?4Tt4ge{S zb>Fx@n9!>Yml6;zIQ5tn-aYQoT9sl78i5odI^n$Y@zEuleVJ{ zsNU(LpKo4nSTrK%^Jf={-HQ~wU!bqBT}b(toPyZHDTrS=1@X6>g4f&W`TrVH{=g}C z&v)$ILCXINrw|oT#~qhvlqd1^wS%I%;(DbSzk8DJ=9C_gQ^ae=&(BAQkXlS!)Kn~` zE~BDJQ&oeJQ5m;Wuq?Ji8{`b0y$`0x3~3B8YYM2V!sPX1XeRfY#D&Y{8ux|1;hxvL zJyI6OSEMtVZ%L&nn=fQ(~V@Sa}0ot~UQ}CYe*t?As}2EpVGcF|)#5LCaftCr%?ttcXI4 z^iXjG74<@+{^FIF*-~QKN~ z?r9v0*crr=C0>68QKjfAUe8Cv^+7GBxdD`Cbd-hlb>BNF#>aw`LIY3=D#f7oq0aD6 zN=b?gKVY$Z(P)EmR>Ldo6QQ@Ui0@vY;N1dU-F2IS{Fju1+(RkIUnvFox0Hg{+v)iK z8c_Z~DR|F!?A-y%UsK92i_dHP;={7F_yp1T`g)I~|GxMXe_6OLUgvq>XlB;d;=|2_ zdj>2%IO#Zu$Wz3l@4RbPIKJqrR?nt&me8y#Tmh7a8I*&r9S$y0F z*w28APx7+YV)N-7|Jmp}hIphL{&diXPbBOQNcjV$p#CMLp!QG->Q_oZ{Vk>7^>%vx zKZX>%6QFIoCU=FsV(60{=Re;=pT^s2TDQzOG-iSp%nD5 zl!E?SO2O;x^!$GeDR?J9+jdb3-t!%Mw~>PWYf1qZAL^lz!mFh(`aJrPNeB`$v8Qh# zWGbSO7Y>tcc#c>g)aVqX?Olc@b?am}9taeZeYo!FDo!YAmG;iRe11sW7Pms#O~`vkvl$d)Ktmu4ZoOmuCrz>)3E-rm%mFq zPLI=0N){;@zxQd;SVtF&=%CaVmVBz!IA4$Uo}IdNa+FstKB&edy-J=2L% zdTx{MANNnM-3eJeJV^A$_ztNygVBu)aOt7oC;J%e)A*hBb>a88w?v+%BA$B*1^t6i zem-==fCRDQ{-sMAa-xdj(n>0q;E=5W{~5l*fW)y?@@JSi#@m!lk>}ceMN%7T2&1c> z;((#5Sp|e?bDzsT#W0mSCTs38iqV%@Znmh(Ufb`)e8Zmr0|cF+DhD_BagstHJCQ@o z*31M15+)9g5Wd}(fW^dZoPbap6byWABOI@>V-7y_pycFeX>JV!8ToML_CO_ggMnaz zyupu?UFsn3?fAVFe-$@yG_bTW1E2Yu0kv#%#gAIIO~hYS0D^Ep?9{@!upC;n}`1Cusbn6~)R=w%{fZFd*zngbG{dOG6S=^@s+yk0upPh7@ zt9!qs5Q?9OZRpA1GW6?D?(-ZrnmST#5esbZn7h0tTS)jk8$`VJPDH$0psTy?0Wmw= zBp?9c+X5sQUIwNEJ|=HtzZGs)G_V4Yrz|ERDk{pYU~Xh@<6vXzsAV8+Vy-TuVq)fG zX#keAG%#}jV&YpmV))294-AgvMZsVg7#6|^Ya7Q7YHN*1Hr;jyhu1d z8jAs*Ao9ro)4|+G)Y=TlD1ic1i8>g8L4GuXpBH$-1_N%z@TkWNd0;>-7$^n`Ja7b} z?gqAiJ!Y4I84H}LI+|ErfFSvKu}~yXHJC~PdIpTC2^@k2=f~~|gBSb?0~io5#{y3x zfT_F{8pm$m!~Acs$af|7P*Y$2B)C!+-fB1lsRJ)J=T_G1 z<)1%rn6I5(a+srl_+cY!gGHW3KX@89N&Ka!$c#zMTtVFB3*WboF031ZEaDCp_uIJc z+jB%yjbX^XwU_sgre5G{a;B^By~1#=CNZ$K(a@cv;*$C0g42VgQxDVpzTP&hEh)~s z?|&+a-m<}G_F-m8b1p;ViGTVxd?@;=B`A0w;Zh-rA;i=;M!Uw1bo2BCYg?7`bp9ea zL7OgAHSnc;KSsuJUy+hU!u_q-qnArLZ%g+Dzvhg{4VDgAa4{_CzA=wG?KQ_DaaCz3 z@MSIu?22Q8eiF{*CR7yOW%+^AZloi%>9HsyooVR#X0EYd<}Rw#SLU|amCsf0sS*8i z_Qt^2mmPx!p2i97?2B!eVR`YoW1jxmB)yBr1l^t(wb!^J zrfcSfYItqz*-KKEUW-vr(TqQZj7c6`vAdn3ohHRQy^k$*^io}!F-`uA!N&c#dMSaH z@1vjZbUu=j=kqwyGqA5!6T)d^bcy@U+>6e8i1+<~W?DX1>>H45WwCazU>SuWHy zosJ35jdsgmX?$z6RBUUly##OYaVmcK(Dd=fDl}NIb}Fmkw$PU|9xVBHv?MM?EajBR zP0>0Utib3}=uUiCc36q7NO%9fWV0@2bZJ~TQ3S=&>o3_N)S_fcq79_(e<$Z4Q(Hu_J z{xpd^$@+=uB*a^%Y&kIRaZ^!YQaQ@G{zIqwy1E3ga5vr0*1b)47|zAZ$+`F&6*cJzK<*Lz;eP9_G@+<^8hMPNi;6sE$JK5WuFSBW~%9iUTbcwOP z4)aIGn-t{O6}7Nzohw&!;tWw-NQ@VSUW1lwq1dU!8v-tG-?2VUL^(9K-p@MSQB@P( zcsD7Yhd$|nFJ-(b4)^w>?7D5OQ@kxhfjm85%_=*^OXLk<}~ z%6)1(I#+OKK4m|bsc97|bZB&Z758>kC791`-8d?M@ohMR3{I-!#)D3tuh*U!B1zK* z8izSure(!PWIAXC)or_`6-cQox*2Wsbw`|wNBRRGhtzy;FHv|FgeFI$5zFKW{jFp! zno1tL&OQVtidUvSMJ7ewx=2jZLd{>zwK@5`p0wh-z}Hl<4l?n2n-W)kZGX&Kw^9Y} zl=CGkx3S>Nkw&)ecZr0r>CcvzoLqS_$&hA9a5+j+x41M^(1FeIB_&CYl4(nopWONJ z&)!)JtDzr5-RXKe>9ZH8A6tz)aaurzziKlgvI;A_iIMe_Yel{`3{XOaBCGH1`%1^< z9h#NDa;O}$IXTZv|88o2o{sk8ypnC!J#?u(a!rGLOd;Uz`mq;oRb&Tr9Goh&Z7ybp%boVZF}%JGF8i^ z(B@6qD1Yn|R~qx$RK_zEuU#{mRR19%=Wb-^RcyjwJeqoAdTZUg_@*sQn?jF$2^?hQ z(ICl9B`s*PPyF<9@!=z&15QOchCy9j zBvihCT^`jtIMBueWB16Pm6NKv>$IQ2Q%EITmdo9i9;YO&^dfP!Oz|~NxN5^!tE!41 zv!tQE>(%T;adv&jm9KUtw#Oc@uV3uW551ZZFH{CUeV``;bHPoF!QYD>i8V)+jSDd3x$)ob0Sz45IVu`G+kB_>L zoO0sc!1u9^{zNUOp3KUg!V-sZI7gOLS(Ac-r0#LkY2|R*PO2kUunVv3UYEnCU<=~8 z`%<3Vb)_978!nbF>^JtbIPBT%ynfE^)>8MG8aajO8vWJ>u6a`Q=-XZ=t;DCqJuI)k zzZnG^3~m(Qjgc2gc1u--y-v!kOcH*da#}n5CSSni+_Bb+5swNiS$vu(d-zoq>MONw zb=EI1d%X0T2+H8kgSe@<>m}WOt6G#Tc)zydo6HM8zY;z&CWCq6fJdJ=F3_rb5)nHU zI~*0iKU#yDzTfPm!zwjD>jxtWbdD~#PFQuwgC3H^WKqRuDQ@5G$8N#T?$;m?LF$@P zaB1_&>E$OCJXEX*?m6!VJZ=}iT&mb7@#r?<+!KapCleCpP`d8wS?<#+7n-}Ms`>1M zXUw=IYFhM@F;NnfhdR|52f{T{ZlW$bYD*TWWRTztVQCdl_w~1EyjN1e=G(>yJveZ% znE6tHR>q04?3}~tT3Mr%UmH6En~HGD<59{Ms_)EB_-8M3Ka(ZzxT5*GV=iAef_hk! zvql8#Ua5Pg!*96rmgEOvQr}fmmYk6~7q|Yocj`doiCcV$(I9VE>T4QG-~50-=q_}g)0b`*{!)~S&&nwh%-<4R~t}& zY=iP+qG#VEdwu%u^zaSIllN>Nbm^bVPG9(d^DfKdI78odp*z0Yi~f4``lZ56$m6=t z_ck)Dn!k?RHN7guWL%n6GZ4F)=gKQ7 zPE%77u^lA-r^MF<>;7nhB%y@$Z7VY7Cim2Wc^0$s_*>WMFj%3_^ zzBx8eye9Vx9%AoWT7EX0;^A{ixU!f$O=L8|Je7;4DzU+xa9!OEfeK|;@mz0mzZvAC zI&+mLKcP5awu++-N$!!3b|&K24Qi4O{tCB~tV<_U|Il$cTDs!AeQvS&(-RHN&8*#e zqHGsRIQpk%N<-(b(|H}$SO|(T_@kvI8XCeR4eKNtHr>GyCEm=LRty8v?#u`KR|zDFS!`*J6^0I3x|u{73LFnXnVs%K zhY3D{U-I;v@o@4xk@xY;vNp9(S$*6+58kz0VprIQIrl`}+Df7Kqq&A&dWflFZq_u> zuH>oaH<=cLWKVjIKha$N+&c{2ko}BWera2ixZc<=DVwC)?C$gRaT;^!wEISvDLR&^ zVJZrOD~`8O3ai>@mhO4!v0!kXW{6n+!XwpiiS-GyFczZ-m$>sRvlrjd&!tEBhA)bn z@l`_+6cgVc5^(9&tbO$v^2>KbR_SnWoS?5@z6%v<2$}3w756>Y?DvAWzrSF@@~V$# zWV18k6>JUr02$7i5mQ_wK$gosC1cUXUYv57&^Wy)Fuihgoxg5I96c`PI7F6L?tbXw ztpstZu@KceJ!aA9n0!m4<`2w$9zs^(wgTy{MrIjmuRhr}g|kWAc=hmf24((3I*aSr zFxT;}b7r>)Rs5pk8QG%WRS;zF3mn0H>-+q@*_z$+(ThN0@v8i&$1{}0Lbur+h(CW1 z&7Vm_AAY^>lNR^)=ES!f3}0%`Rai~;9}#>UMAodA*fl*-lAiivZJ%xsO_yXZf!i{r z;B#IdQQ7gU5;(ap6Iw}X3A$9nvE)bm)zux$b*)}SvZ|MBGFrSPLf5~dKkf0FE1!=m z*KOIHB8;cFuWsvB*@>BrQM_DI7C8Pblhtc_lt zU@vX5Tq@2AUnZXlE{`|!Jd!di9G%FTUL7B zpguj%QfyN*{o&&}QI2Qhgp3A5LKYMFMZwTaYAxS8?q6sWXjiq{b2_Er-zqLy z$%K5cx1J6u7xy-O_RmYZZf}}r8k*$1mw2hQ$w~dk{23gywcppa?=R?TJwyFw+`@RE zo{meI^~ifCxZROFoii)}6wfYet)Ay$3TcIDiJCYEXg=_mEobaAocZ$V>{1v!^BN92&XNZ2Ud%Kx^$RCB27|Mf+Fo(PRT-?czkWUR;8Rf0rw2pt0-`6P zqu1J&kkU+-#9Jnsn_60wIzmH<BtcDnrLRO z1_rJY5ghn>gMPA8=ia^zAD8R~=4&b5m$TPn)R^es67RD=_mJG_9*vO(<02*Z;ZVjB zg*2zQ(1QuMw8%T{&Bx-s-z45xHmq_QEoh?RC^EbM3^HM_B-b~WR2n)u^(L?NTEI%% zxRXuvlm5n5uglzf930#WZqOYxnR1*f%p;V3SJv^E>5gs7JwNWWcE^~m`p9lC z%B&3d^}CKi>=J0ZjJw?V1M1ob$I7V4pY!g!eEICvyn++@!wH?Y6{Cd$C(_s2>|`iz zztF_+Y9S>a<(6xZzkDgQ=IG8~SwnXbGT9hPVO;l87iu-;SA7W4L(q+I#myNWQch?J z4IgA)wWhghIVSpG1Tr}&#?P!yzgCDiZ^2U;GFUG^G&j(&fAiYC?v58ldg6}0Lek1y zf|-3?weL!9zRiI#s9AqTWEPRXd!8|MDs(?P($d?|c&xy)?BfjuhYCTy2SS{IlcfrQ zH>`rYGy#D^n_*zcxQS9CCj6GhZx|@Y=ICay<=1lG=G3FBER&roTeib5xRb^(u@w zMr|Ovq5NNIo;p=8tjfs0L?DROgE#9}vu(pwGs-`_P$HLF=VVcrj>#GpozBtcBcE{@ zUQ~TZwxssGylv2OG~wN>(e32In~zT2J{CTxS4vc_S1*fU>ApRaz@x1s@8=}DCamhC=jQXJFzKGgU(ftrAy-)L6-OV2d z0?iVlZq#w;>0;pKtE*|3%oIq|2ajS3@;l^{*Ar4Iy~U#EBcAY``v*3j^f1HlMoPgj z;`F!Cg0`X{yBzh__;U9vBkqOHodrggbhY1?ot_?;@;_@MVWtPv40#F;(TteS(bAHvym*}3oO`)15N(qQ}I555DDz;LMK&W(QVu%Kf@L9fCMPZ9!99X-ZS*G`cg4 zNt~GTqI(@qvn0W=C0?fmjZ$(VJrQX!Tx>{G#wXP=CGj@{^q`@-uaVoq^B{Tg6;W)A!>Ok2R|x+LF!&QAAg& z(H|zVZJM)b{k~u?2rjN?u-1ylEHrz331=Xu%JlG*teJbs>D5_sZS&wlC5F1&cP5wH zvBXS&kuWaors6c&=$GVnmlMg=n;9qg$X(8tCkhQ4zH&1QxZ~?LsgY2qXI1ah#iQ&d zJpST!Y@K_)-H_CUfyYI?^6XEl^ClQP1HG0vtqu4#9x(kA{HRCSA^u1S*R$KbYLnuj zqgJu1vEGkp8Ozk@#CW=WwVLi;B5R#F&~C;ZTlzG+=vL$r-h`L!0gw^b_+p!Of5aCr z-M1SG(+=~Mz2%2_IIRTN*$WeN8q7_m@695X!$tFDpT0bOu&&g23Uk{gQJvA!wDvRk zrBt8sd4P%HyZj&qdBu)KOV{utdiG9Px7&q=Q z|Co2?_D59ik<@2JJHMz~;Qo($!a^;yyU-(q23~E!bl_m1r zro1EP6`>%)&3r$K;~J2;>pbDy#7p-lyo8UtL@g-83z}Cj(mwl%j~*~34v!^8Fja}* z(&7i~>>eH?I)3;ZQ&rS~#|W~Ek{2EdGgeXcM&DwpN~pS3?`8H;{#&mL%ZF?_=b@M| zN@l0G){jXS5X5te_DM5lwkJexAIfU_5EoWU_TFWxs%OM_$hT)~DLKr7a>a2?thw%B z*h*~Qs(Q2M!KIO=soWU2Q1ne@_tvV-Yp^E=--<4Ng&Zfpy>Kl2+`C<@50cFQIcs z-sblZ2vq?#%#J&t&DpKeBcme^Hnz!Ns_t2}vnWh3L`j`3Y6w;vb79D#K0%R8tw*g0 z5#VE_BD>|Q$_}(2M{hCm1W zh!v)}t&@MoLQ(YF?AZ{(9f+#sZ~?v z=bw`dvfb+IQ+w2+Rny)iAj(Z(QDmhx($R#Q5|BCLhdD$w9lyylI6+ULI`0b6Dx)Vq8p`E_>n7$)2v$ z7FPntEhiW0J%cPOUe4n7p?510OidX&&sE>vWZW=SVp(=)vlDz z4^=)(qJG)x3|D%mBH55M+*<5PCcFBcG(A%_R(F!wKDpC8qL>ovbxNo@i;G!b-KD@< z_{k=>jZMDZfQ_$JiSTp;n}(eZb1LVlqhnZIbJpfAHcOUw zEuBf)#B^}iD%_vZOC4CS%4!RDX{+er?3~D|W0U)K*2CC$rfc(})GTF5D@(l2{gnZ! zB;RKj-M_j%Gf^Bg`L@|YEV0}(A_Q^Zyk(VI=y_ace`!MQ;Nmg&=wn|`e4yOlc-by0 zj)i^9WyXpdHTvC>5u*_+K~*o)>|#~YIePz1Y;e1mhd5#-*D4;BGdX_ckb=d-1;{7;vajrkytY;Y>+Q~bX?pU=f%e7Uz(#HZ2NV28W4`|}+zTkEx;g-1RvBwk8(`lt1V|q+GY9r>xe(jg*Xp zf&s9-dPkp>PXpYqt%CwWvlzguKYif9R_@>BpiGw(*wGE_fdj9R0vo?{kU%CP_yGlQ zAc(x9v|A3!cAtaaVc0Fb9o-$h-;cVaySI~L!Q_suATwkqsEh%2>jPb|z3UZpS;pEN zNJRl&ieRK_;s`M{w>Gxl-pCFy1p3#S6xb7MY!2-7-a5noH?lIY#rN007!Y-F)xBc@ zF*Y#;Dgt_ZE98$KN1mO&=K*wnmcXYA0JZ=cuq|W2UE&Vl4taY6YhZ)E0Qj~5A&3AD z_`_$={B|9_rDbUPYw>L@`~hDF45$MNMF3Ul2=JpYIN%hGhNFN3QW7o!9I(Ju{25df zyoQ4zfdi0iAPyWb{3uEMH3@$38XN_dL}SFzzyT-%s^f4_un7zXe?UrNK|K_lUko^) zQ8=(pj2IGBM@sU8Rim+RPyve*#{dTu0*Nnzf`OJ{C9t4nfJNZ7Ewh2jpkRAYC>-n$ z6b=oZp%5rg4=4iPI)Hj8NesRP3>3R1+;$!>L`iN}y?t$49rOsOWvfnp&FQ)16x4B(4Y{7z_-qiMgW!JcY*E<0FXR{2hxw>H*&HE->`s38y*zkU<1icKLBZK zV+-;R{Ja^}Vk5KLBcV?)N;?wSd!`3x?#;nR#LZk{QW-HwUuxl%$42A&Gy|yPf z@EQ>b96-q?SAkhCV+ACEiESUncXP*8dtjPDffrPeg7aeQ4}wMk7=oBWe#(G!9`Fxh zy)A=bfSj4%WKbv!OsoB+9DqzPDf>4W8r-7#iwuTAf~n-c$N*h{TYr}!!0!329O4h< zkl=oe-^%fWS#Q6}U}z+ek@l+$iUBvV{3hcE$KzKS3=QTW{wCuGlU09}!D0M+Y=L30 zd)k2gp$#}3wnrC^f&uw{zqRoPn}PS({NB${IFkPlx?n!+uk|6IdwoM7pnJv_j^*Ej z0|f9=nmzR)!2RpLwSflri2o+TqW0<{_x3&F5BNi1!7RaF>qEl8OuXM@NbprIzse9m z;@IypB(U#)j|>TJLj6@2i2?U!|0d%H)3|?=0n=)a3I#6ph}C4KzQv3Gr9^FtEMYKx6h|91XnvX|FE-9$W#_ZI8_u3`pd^wTpo5 z8D9)?FD9`t=$4U}bLlT^Zy{vXccD%}78 literal 0 HcmV?d00001 From 3f51637bcfcee9ed92b37c1d65b8dd39c2afdf82 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 09:36:35 +0800 Subject: [PATCH 12/46] 'commit' --- dsLightRag/Test/T1_Login.py | 5 ----- dsLightRag/Test/T3_GetArticle.py | 28 +++++++++++++++------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/dsLightRag/Test/T1_Login.py b/dsLightRag/Test/T1_Login.py index 8c4c57e0..e313e8ab 100644 --- a/dsLightRag/Test/T1_Login.py +++ b/dsLightRag/Test/T1_Login.py @@ -4,13 +4,8 @@ # 微信爬爬猫---公众号文章抓取代码分析 # https://blog.csdn.net/yajuanpi4899/article/details/121584268 -import datetime import json import logging -import random -import re - -import requests """ # 查看selenium版本 diff --git a/dsLightRag/Test/T3_GetArticle.py b/dsLightRag/Test/T3_GetArticle.py index 02e9cce8..6dae6c0b 100644 --- a/dsLightRag/Test/T3_GetArticle.py +++ b/dsLightRag/Test/T3_GetArticle.py @@ -1,15 +1,17 @@ -""" -安装pdfkit库 -https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By -我是在Windows上开发的,所以,下载的是:【注意要科学上网下载,否则太慢了~】 -https://release-assets.githubusercontent.com/github-production-release-asset/131323182/3200f380-aba8-11ea-8942-42fa5e27a312?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-07-15T02%3A10%3A32Z&rscd=attachment%3B+filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-07-15T01%3A10%3A07Z&ske=2025-07-15T02%3A10%3A32Z&sks=b&skv=2018-11-09&sig=IYNB2Gi%2FZ9tZfPXmo7PbqjbxmcLULpP%2Bex2z6lp2DvE%3D&jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1MjU0MjU3NSwibmJmIjoxNzUyNTQyMjc1LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGlvbi5ibG9iLmNvcmUud2luZG93cy5uZXQifQ.LyZXiO_mRK2qX99CTJtVwypU4DLsK-_Js0wspzsL0Y4&response-content-disposition=attachment%3B%20filename%3Dwkhtmltox-0.12.6-1.mxe-cross-win64.7z&response-content-type=application%2Foctet-stream -解压到D:\wkhtmltox中,还要注意把路径加到环境变量中 +url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' -conda activate py310 -pip3 install pdfkit -""" -import pdfkit -path_wk = r'D:\wkhtmltox\bin\wkhtmltopdf.exe' #wkhtmltopdf安装位置 -config = pdfkit.configuration(wkhtmltopdf = path_wk) -pdfkit.from_url('http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd', 'out.pdf',configuration=config) \ No newline at end of file +options = Options() +options.add_argument('-headless') # 无头参数,调试时可以注释掉 +service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") +driver = webdriver.Chrome(service=service, options=options) +driver.get(url) +# 可以只要txt +html_content = driver.find_element(By.CLASS_NAME, "rich_media").text +# 第一行是标题,分离出来 +title = html_content.split('\n')[0] +print(title) From 78633b321cb5981b8a80ba0a94aed0022cf66f67 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 09:43:31 +0800 Subject: [PATCH 13/46] 'commit' --- dsLightRag/Test/T3_GetArticle.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/dsLightRag/Test/T3_GetArticle.py b/dsLightRag/Test/T3_GetArticle.py index 6dae6c0b..baaecd48 100644 --- a/dsLightRag/Test/T3_GetArticle.py +++ b/dsLightRag/Test/T3_GetArticle.py @@ -15,3 +15,30 @@ html_content = driver.find_element(By.CLASS_NAME, "rich_media").text # 第一行是标题,分离出来 title = html_content.split('\n')[0] print(title) + +# 按行遍历html_content,当发现空行时,删除空行前面的内容,只保留后面的内容 +lines = html_content.split('\n') +content_after_empty_line = "" +found_empty_line = False + +for line in lines: + if not found_empty_line and line.strip() == "": + # 找到第一个空行 + found_empty_line = True + continue + + if found_empty_line: + # 空行后的内容添加到结果中 + content_after_empty_line += line + "\n" + +# 如果没有找到空行,保留原始内容 +if not found_empty_line: + content_after_empty_line = html_content + +for x in content_after_empty_line.split("\n"): + if x.strip() == "" : + continue + print(x) + +# 关闭浏览器 +driver.quit() From b8b4b08b3fa73d3fcd134fbeb56a3e25f27bc6ae Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 10:04:49 +0800 Subject: [PATCH 14/46] 'commit' --- dsLightRag/Test/T3_GetArticle.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dsLightRag/Test/T3_GetArticle.py b/dsLightRag/Test/T3_GetArticle.py index baaecd48..2f9c9e27 100644 --- a/dsLightRag/Test/T3_GetArticle.py +++ b/dsLightRag/Test/T3_GetArticle.py @@ -35,10 +35,7 @@ for line in lines: if not found_empty_line: content_after_empty_line = html_content -for x in content_after_empty_line.split("\n"): - if x.strip() == "" : - continue - print(x) - +content_after_empty_line = content_after_empty_line.replace("\n\n", "\n") +print(content_after_empty_line) # 关闭浏览器 driver.quit() From fca710e0719b982411d4b9aed2dc75ea026c72d2 Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Tue, 15 Jul 2025 10:15:43 +0800 Subject: [PATCH 15/46] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/controller/DmController.py | 2 +- .../api/controller/DocumentController.py | 2 +- .../api/controller/LoginController.py | 28 ++++++++++ .../api/controller/QuestionController.py | 2 +- .../api/controller/TestController.py | 2 +- .../api/controller/ThemeController.py | 2 +- .../api/controller/UserController.py | 32 +++++++++++ dsAiTeachingModel/main.py | 14 +++-- dsAiTeachingModel/routes/__init__.py | 3 +- dsAiTeachingModel/tasks/BackgroundTasks.py | 48 +++++++++++++++-- dsAiTeachingModel/utils/Database.py | 14 ++++- dsAiTeachingModel/utils/DocxUtil.py | 54 ++++++++++++++++++- dsAiTeachingModel/utils/LightRagUtil.py | 47 ++++++++++++++-- 13 files changed, 228 insertions(+), 22 deletions(-) create mode 100644 dsAiTeachingModel/api/controller/UserController.py diff --git a/dsAiTeachingModel/api/controller/DmController.py b/dsAiTeachingModel/api/controller/DmController.py index 7dbd0e51..2f719317 100644 --- a/dsAiTeachingModel/api/controller/DmController.py +++ b/dsAiTeachingModel/api/controller/DmController.py @@ -1,4 +1,4 @@ -# routes/LoginController.py +# routes/DmController.py from fastapi import APIRouter, Depends diff --git a/dsAiTeachingModel/api/controller/DocumentController.py b/dsAiTeachingModel/api/controller/DocumentController.py index d88da710..bd0e2998 100644 --- a/dsAiTeachingModel/api/controller/DocumentController.py +++ b/dsAiTeachingModel/api/controller/DocumentController.py @@ -1,4 +1,4 @@ -# routes/LoginController.py +# routes/DocumentController.py import os from fastapi import APIRouter, Request, Response, Depends, UploadFile, File diff --git a/dsAiTeachingModel/api/controller/LoginController.py b/dsAiTeachingModel/api/controller/LoginController.py index 307fd3b6..b3368d35 100644 --- a/dsAiTeachingModel/api/controller/LoginController.py +++ b/dsAiTeachingModel/api/controller/LoginController.py @@ -129,3 +129,31 @@ async def login(request: Request, response: Response): else: return {"success": False, "message": "用户名或密码错误"} + +# 【Base-Login-3】通过手机号获取Person的ID +@router.get("/getPersonIdByTelephone") +async def get_person_id_by_telephone(request: Request): + telephone = await get_request_str_param(request, "telephone", True, True) + if not telephone: + return {"success": False, "message": "手机号不能为空"} + select_user_sql: str = "SELECT person_id FROM t_sys_loginperson WHERE telephone = '" + telephone + "' and b_use = 1 " + userlist = await find_by_sql(select_user_sql,()) + user = userlist[0] if userlist else None + if user: + return {"success": True, "message": "查询成功", "data": {"person_id": user['person_id']}} + else: + return {"success": False, "message": "未查询到相关信息"} + + + +# 【Base-Login-4】忘记密码重设,不登录的状态 +@router.post("/resetPassword") +async def reset_password(request: Request): + person_id = await get_request_str_param(request, "person_id", True, True) + password = await get_request_str_param(request, "password", True, True) + if not person_id or not password: + return {"success": False, "message": "用户ID和新密码不能为空"} + password_md5 = md5_encrypt(password) + update_user_sql: str = "UPDATE t_sys_loginperson SET original_pwd = '" + password + "', pwdmd5 = '" + password_md5 + "' WHERE person_id = '" + person_id + "'" + await execute_sql(update_user_sql) + return {"success": True, "message": "密码修改成功"} \ No newline at end of file diff --git a/dsAiTeachingModel/api/controller/QuestionController.py b/dsAiTeachingModel/api/controller/QuestionController.py index 89456bc5..48b7ed39 100644 --- a/dsAiTeachingModel/api/controller/QuestionController.py +++ b/dsAiTeachingModel/api/controller/QuestionController.py @@ -1,4 +1,4 @@ -# routes/LoginController.py +# routes/QuestionController.py from fastapi import APIRouter, Request, Response, Depends from auth.dependencies import * diff --git a/dsAiTeachingModel/api/controller/TestController.py b/dsAiTeachingModel/api/controller/TestController.py index 4a572ff3..5c6a8ed5 100644 --- a/dsAiTeachingModel/api/controller/TestController.py +++ b/dsAiTeachingModel/api/controller/TestController.py @@ -1,4 +1,4 @@ -# routes/LoginController.py +# routes/TestController.py from fastapi import APIRouter, Request diff --git a/dsAiTeachingModel/api/controller/ThemeController.py b/dsAiTeachingModel/api/controller/ThemeController.py index 297817d1..3bd9fcd5 100644 --- a/dsAiTeachingModel/api/controller/ThemeController.py +++ b/dsAiTeachingModel/api/controller/ThemeController.py @@ -1,4 +1,4 @@ -# routes/LoginController.py +# routes/ThemeController.py from fastapi import APIRouter, Depends from utils.ParseRequest import * diff --git a/dsAiTeachingModel/api/controller/UserController.py b/dsAiTeachingModel/api/controller/UserController.py new file mode 100644 index 00000000..e23d8f5f --- /dev/null +++ b/dsAiTeachingModel/api/controller/UserController.py @@ -0,0 +1,32 @@ +# routes/UserController.py +import re + +from fastapi import APIRouter, Request, Response, Depends +from auth.dependencies import * +from utils.Database import * +from utils.ParseRequest import * + +# 创建一个路由实例,需要依赖get_current_user,登录后才能访问 +router = APIRouter(dependencies=[Depends(get_current_user)]) + +# 【Base-User-1】维护用户手机号 +@router.post("/modifyTelephone") +async def modify_telephone(request: Request): + person_id = await get_request_str_param(request, "person_id", True, True) + telephone = await get_request_str_param(request, "telephone", True, True) + # 校验手机号码格式 + if not re.match(r"^1[3-9]\d{9}$", telephone): + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="手机号码格式错误") + # 校验手机号码是否已被注册 + select_telephone_sql: str = "select * from t_sys_loginperson where b_use = 1 and telephone = '" + telephone + "' and person_id <> '" + person_id + "'" + userlist = await find_by_sql(select_telephone_sql, ()) + if len(userlist) > 0: + return {"success": False, "message": "手机号码已被注册"} + else: + update_telephone_sql: str = "update t_sys_loginperson set telephone = '" + telephone + "' where person_id = '" + person_id + "'" + await execute_sql(update_telephone_sql) + return {"success": True, "message": "修改成功"} + + +# 【Base-User-2】维护用户密码 +# @router.post("/modifyPassword") diff --git a/dsAiTeachingModel/main.py b/dsAiTeachingModel/main.py index 18a578f1..8f99a901 100644 --- a/dsAiTeachingModel/main.py +++ b/dsAiTeachingModel/main.py @@ -1,6 +1,7 @@ import threading -import logging + import uvicorn +import asyncio from fastapi.middleware.cors import CORSMiddleware from starlette.staticfiles import StaticFiles @@ -18,11 +19,12 @@ logging.basicConfig( ) async def lifespan(app: FastAPI): - # 启动线程 - thread = threading.Thread(target=train_document_task, daemon=True) - thread.start() # 创建数据库连接池 await init_database() + + # 启动异步任务 + asyncio.create_task(train_document_task()) + yield await shutdown_database() @@ -41,8 +43,10 @@ app.add_middleware( app.mount("/static", StaticFiles(directory="Static"), name="static") # 注册路由 -# 登录相关 +# 登录相关(不用登录) app.include_router(login_router, prefix="/api/login", tags=["login"]) +# 用户相关 +app.include_router(user_router, prefix="/api/user", tags=["user"]) # 主题相关 app.include_router(theme_router, prefix="/api/theme", tags=["theme"]) # 文档相关 diff --git a/dsAiTeachingModel/routes/__init__.py b/dsAiTeachingModel/routes/__init__.py index 5bde8674..4fa720b9 100644 --- a/dsAiTeachingModel/routes/__init__.py +++ b/dsAiTeachingModel/routes/__init__.py @@ -5,6 +5,7 @@ from api.controller.ThemeController import router as theme_router from api.controller.QuestionController import router as question_router from api.controller.TestController import router as test_router from api.controller.DmController import router as dm_router +from api.controller.UserController import router as user_router # 导出所有路由 -__all__ = ["login_router", "document_router", "theme_router", "question_router", "dm_router", "test_router"] +__all__ = ["login_router", "document_router", "theme_router", "question_router", "dm_router", "test_router", "user_router"] diff --git a/dsAiTeachingModel/tasks/BackgroundTasks.py b/dsAiTeachingModel/tasks/BackgroundTasks.py index e90bdc52..d43dc190 100644 --- a/dsAiTeachingModel/tasks/BackgroundTasks.py +++ b/dsAiTeachingModel/tasks/BackgroundTasks.py @@ -1,12 +1,52 @@ +import asyncio import logging import time +from utils.Database import * +from utils.DocxUtil import get_docx_content_by_pandoc +from utils.LightRagUtil import initialize_pg_rag + +# 使用PG库后,这个是没有用的,但目前的项目代码要求必传,就写一个吧。 +WORKING_DIR = f"./output" + # 后台任务,监控是否有新的未训练的文档进行训练 -def train_document_task(): +async def train_document_task(): print("线程5秒后开始运行【监控是否有新的未训练的文档进行训练】") - time.sleep(5) # 线程5秒后开始运行 + await asyncio.sleep(5) # 使用 asyncio.sleep 而不是 time.sleep # 这里放置你的线程逻辑 while True: # 这里可以放置你的线程要执行的代码 - logging.info("线程正在运行") - time.sleep(1000) # 每隔10秒执行一次 + logging.info("开始查询是否有未训练的文档") + no_train_document_sql: str = " SELECT * FROM t_ai_teaching_model_document WHERE is_deleted = 0 and train_flag = 0 ORDER BY create_time DESC" + no_train_document_result = await find_by_sql(no_train_document_sql, ()) + if not no_train_document_result: + logging.info("没有未训练的文档") + else: + logging.info("存在未训练的文档" + str(len(no_train_document_result))+"个") + # document = no_train_document_result[0] + # print("开始训练文档:" + document["document_name"]) + # theme = await find_by_id("t_ai_teaching_model_theme", "id", document["theme_id"]) + # # 训练开始前,更新训练状态 + # update_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 1 WHERE id = " + str(document["id"]) + # execute_sql(update_sql) + # document_name = document["document_name"] + "." + document["document_suffix"] + # logging.info("开始训练文档:" + document_name) + # workspace = theme["short_name"] + # docx_name = document_name + # docx_path = document["document_path"] + # logging.info(f"开始处理文档:{docx_name}, 还有%s个文档需要处理!", len(no_train_document_result) - 1) + # # 训练代码开始 + # try: + # rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace) + # # 获取docx文件的内容 + # content = get_docx_content_by_pandoc(docx_path) + # await rag.insert(input=content, file_paths=[docx_name]) + # finally: + # if rag: + # await rag.finalize_storages() + # # 训练结束,更新训练状态 + # update_sql: str = " UPDATE t_ai_teaching_model_document SET train_flag = 2 WHERE id = " + str(document["id"]) + # execute_sql(update_sql) + + # 添加适当的等待时间,避免频繁查询 + await asyncio.sleep(60) # 每分钟查询一次 diff --git a/dsAiTeachingModel/utils/Database.py b/dsAiTeachingModel/utils/Database.py index 85580029..4ac15243 100644 --- a/dsAiTeachingModel/utils/Database.py +++ b/dsAiTeachingModel/utils/Database.py @@ -204,4 +204,16 @@ async def delete_by_id(table_name, property_name, property_value): raise Exception(f"为表[{table_name}]删除数据失败: {e}") else: logging.error("参数不全") - return False \ No newline at end of file + return False + + +# 执行一个SQL语句 +async def execute_sql(sql): + logging.debug(sql) + try: + async with pool.acquire() as conn: + await conn.fetch(sql) + except Exception as e: + logging.error(f"数据库查询错误: {e}") + logging.error(f"执行的SQL语句: {sql}") + raise Exception(f"执行SQL失败: {e}") \ No newline at end of file diff --git a/dsAiTeachingModel/utils/DocxUtil.py b/dsAiTeachingModel/utils/DocxUtil.py index 82e26d2c..6c8051fc 100644 --- a/dsAiTeachingModel/utils/DocxUtil.py +++ b/dsAiTeachingModel/utils/DocxUtil.py @@ -1,8 +1,56 @@ +import logging import os import subprocess import uuid +from PIL import Image +import os + +# 在程序开始时添加以下配置 +logging.basicConfig( + level=logging.INFO, # 设置日志级别为INFO + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +# 或者如果你想更详细地控制日志输出 +logger = logging.getLogger('DocxUtil') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +logger.addHandler(handler) +logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) +def resize_images_in_directory(directory_path, max_width=640, max_height=480): + """ + 遍历目录下所有图片并缩放到指定尺寸 + :param directory_path: 图片目录路径 + :param max_width: 最大宽度 + :param max_height: 最大高度 + """ + # 支持的图片格式 + valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif') + + for root, _, files in os.walk(directory_path): + for filename in files: + if filename.lower().endswith(valid_extensions): + file_path = os.path.join(root, filename) + try: + with Image.open(file_path) as img: + # 计算缩放比例 + width, height = img.size + ratio = min(max_width / width, max_height / height) + # 如果图片已经小于目标尺寸,则跳过 + if ratio >= 1: + continue + # 计算新尺寸并缩放 + new_size = (int(width * ratio), int(height * ratio)) + resized_img = img.resize(new_size, Image.Resampling.LANCZOS) + + # 保存图片(覆盖原文件) + resized_img.save(file_path) + logger.info(f"已缩放: {file_path} -> {new_size}") + except Exception as e: + logger.error(f"处理 {file_path} 时出错: {str(e)}") def get_docx_content_by_pandoc(docx_file): # 最后拼接的内容 content = "" @@ -15,6 +63,9 @@ def get_docx_content_by_pandoc(docx_file): os.mkdir("./static/Images/" + file_name) subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown, '--extract-media=./static/Images/' + file_name]) + # 遍历目录 './static/Images/'+file_name 下所有的图片,缩小于640*480的尺寸上 + + resize_images_in_directory('./static/Images/' + file_name+'/media') # 读取然后修改内容,输出到新的文件 img_idx = 0 # 图片索引 with open(temp_markdown, 'r', encoding='utf-8') as f: @@ -23,8 +74,9 @@ def get_docx_content_by_pandoc(docx_file): if not line: continue # 跳过图片高度描述行 - if line.startswith('height=') and line.endswith('in"}'): + if line.startswith('height=') and (line.endswith('in"}') or line.endswith('in"')): continue + # height="1.91044072615923in" # 使用find()方法安全地检查图片模式 is_img = line.find("![](") >= 0 and ( line.find(".png") > 0 or diff --git a/dsAiTeachingModel/utils/LightRagUtil.py b/dsAiTeachingModel/utils/LightRagUtil.py index 528f5963..e791c4a8 100644 --- a/dsAiTeachingModel/utils/LightRagUtil.py +++ b/dsAiTeachingModel/utils/LightRagUtil.py @@ -1,9 +1,7 @@ import logging import logging.config import os - import numpy as np - from lightrag import LightRAG from lightrag.kg.shared_storage import initialize_pipeline_status from lightrag.llm.openai import openai_complete_if_cache, openai_embed @@ -25,7 +23,7 @@ def configure_logging(): log_dir = os.getenv("LOG_DIR", os.getcwd()) log_file_path = os.path.abspath( - os.path.join(log_dir, "./logs/lightrag.log") + os.path.join(log_dir, "./Logs/lightrag.log") ) print(f"\nLightRAG log file: {log_file_path}\n") @@ -97,10 +95,13 @@ async def embedding_func(texts: list[str]) -> np.ndarray: ) -async def initialize_rag(working_dir): +async def initialize_rag(working_dir, graph_storage=None): + if graph_storage is None: + graph_storage = 'NetworkXStorage' rag = LightRAG( working_dir=working_dir, llm_model_func=llm_model_func, + graph_storage=graph_storage, embedding_func=EmbeddingFunc( embedding_dim=EMBED_DIM, max_token_size=EMBED_MAX_TOKEN_SIZE, @@ -139,4 +140,40 @@ def create_embedding_func(): api_key=EMBED_API_KEY, base_url=EMBED_BASE_URL, ), - ) \ No newline at end of file + ) + + +# AGE +os.environ["AGE_GRAPH_NAME"] = AGE_GRAPH_NAME +os.environ["POSTGRES_HOST"] = POSTGRES_HOST +os.environ["POSTGRES_PORT"] = str(POSTGRES_PORT) +os.environ["POSTGRES_USER"] = POSTGRES_USER +os.environ["POSTGRES_PASSWORD"] = POSTGRES_PASSWORD +os.environ["POSTGRES_DATABASE"] = POSTGRES_DATABASE + + +async def initialize_pg_rag(WORKING_DIR, workspace='default'): + rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + llm_model_name=LLM_MODEL_NAME, + llm_model_max_async=4, + llm_model_max_token_size=32768, + enable_llm_cache_for_entity_extract=True, + embedding_func=EmbeddingFunc( + embedding_dim=EMBED_DIM, + max_token_size=EMBED_MAX_TOKEN_SIZE, + func=embedding_func + ), + kv_storage="PGKVStorage", + doc_status_storage="PGDocStatusStorage", + graph_storage="PGGraphStorage", + vector_storage="PGVectorStorage", + auto_manage_storages_states=False, + vector_db_storage_cls_kwargs={"workspace": workspace} + ) + + await rag.initialize_storages() + await initialize_pipeline_status() + + return rag From bbc9583d47735029dfa834ac8e342b51864405b7 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 10:27:06 +0800 Subject: [PATCH 16/46] 'commit' --- dsLightRag/Test/cookies.txt | 1 - dsLightRag/Test/out.pdf | Bin 19544 -> 0 bytes dsLightRag/{Test => WxGzh}/T1_Login.py | 9 +++++++-- .../{Test => WxGzh}/T2_GetArticleList.py | 15 +++++++++++++++ dsLightRag/{Test => WxGzh}/T3_GetArticle.py | 0 dsLightRag/WxGzh/__init__.py | 0 dsLightRag/{Test => WxGzh}/article_urls.txt | 0 dsLightRag/WxGzh/cookies.txt | 17 +++++++++++++++++ 8 files changed, 39 insertions(+), 3 deletions(-) delete mode 100644 dsLightRag/Test/cookies.txt delete mode 100644 dsLightRag/Test/out.pdf rename dsLightRag/{Test => WxGzh}/T1_Login.py (89%) rename dsLightRag/{Test => WxGzh}/T2_GetArticleList.py (89%) rename dsLightRag/{Test => WxGzh}/T3_GetArticle.py (100%) create mode 100644 dsLightRag/WxGzh/__init__.py rename dsLightRag/{Test => WxGzh}/article_urls.txt (100%) create mode 100644 dsLightRag/WxGzh/cookies.txt diff --git a/dsLightRag/Test/cookies.txt b/dsLightRag/Test/cookies.txt deleted file mode 100644 index 4999d643..00000000 --- a/dsLightRag/Test/cookies.txt +++ /dev/null @@ -1 +0,0 @@ -{"_clsk": "1v8cz8t|1752541383487|1|1|mp.weixin.qq.com/weheat-agent/payload/record", "xid": "fff1911b542cde79c5c47a38cb3929c8", "data_bizuin": "3514353238", "slave_user": "gh_4f88a4e194da", "slave_sid": "cDlUaWlaek5RZHV6SUIyVWNNZlJGYTJQdHY5YzUyN29LMG94RlptUV9lbkVDUWxmaTBURFE5YWNKeVRkYlZSdU9VRnNjWXRKN2xfZ2pZd0JWal82aVpsRDhqUnJXQkdYMml4SlhrdGtGY2k2MG95YTlQVEFVanpIR01oZ3p4dldiME9hRE1zcGxZV0FlNTVV", "rand_info": "CAESIPFuk5/nui6QoQ6zEO2B5RfaUmjuQjTJOQVg9mBuI/XG", "data_ticket": "AIy4PwNlFMRBDHcZ7jcXDXf/8fFLl5NS25Nj3tYuDL8H4W8EiURU4G9Dakn7aSUC", "bizuin": "3514353238", "mm_lang": "zh_CN", "slave_bizuin": "3514353238", "uuid": "91eaae9bc5e4f725e03ee2b7e75c8a2c", "ua_id": "bbkG1LsuVI1DszGdAAAAADm2HzejXloc87mSyGEMpdY=", "wxuin": "52541365079710", "_clck": "1l32fbr|1|fxm|0"} \ No newline at end of file diff --git a/dsLightRag/Test/out.pdf b/dsLightRag/Test/out.pdf deleted file mode 100644 index 4aec06ad8c5bbf081626dcc93f2bfe0877c5ab8b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 19544 zcmd6P2{ct*__vgJRfbGu$dwSno$poV;hM>mDKbmuIdhq(gb*^6nauML8VnH$WeCZX z$`pwxeCJ%0(*OJSertVez1F(UJFbf$<_qVGt<9#_$TM zfB-3Le>9+R-rmO8$;f0&MHwgvAwU9`u)qhiQ=z$ywS<8q(1yfWI24Y8VqhpJKOBL; za6_>iP$&l{sgMwI>+E|WIVqpm0ofGH}vtZC&O{wAgM%3Ec#u4ZaCqqa4xxBfx1u373iIF2j z2Zn@gR|$b&FeqN22)&@7)Q1dorcmW&*MVFa3j z@TsfFkb?gpr>__|8#ox*o7*~`b+$1#zIYmXk@FmWG=DmZ+GF&212bTF&Kua9SUW=C zz-1MnS8c2vfl~(v41^{iP%&|^ak4iu0VE(K014h#;1nD};8elH*xW$O=IWNYD1HrAEN$#nZ4Hb}@IIXvSG9#~R|J0j2?D?rRa>(ift0<0D_G@E5aQLP zR1F~8R&7BO0|7AuShaPEg@C9~!E5f|4CqR*A{^i-z78B3g8orRQ_1iOIA~jies&(- zSm}RvjnD%go%-ju?g6?gCj7#tw^sX zHpMDfFu*7R=GSP>5}&caU1B}$f7OD{?esCeB?>o^JDF_#_gJoPtjqSEy?>eURzO*K zv5DXbb5lK;v?TALi*55O_HV4b+6*jfZ@l2b>gvH8%upgK<2Ew-%_H{x!RK5#dJU86 z$*oS*Qtk6;L%&sV(Qy&gDfIA@4xX;KbHh7`&usC+1ZVlle99yA{lu@9RSyIT$i7^8 zk{@^ba8oMX=a}2LXB*nI-?Yrvk_eb459iMmPSi5?yAODP5auU% z?;i?8a9ARuL_ghvW8CUQ^GqYb>ZD;22)EGPc&WSCHYx|uzA_LybMU2_G?LyVtQk)<5$3p^yV6jyl0$|$5zUR54n zX-l3T$>O;t!glI-cNx8J4%?2G*l4Z^5SJYkeF4K%a zxb!AVJs)SyaNmqkY58)Zd)>ox`MZj;UT$AXb_=JXMub|ESvZ-_<-8eHrk19%oQ)+9 zZlirZl`MC(g7k<>K6IbwMd=gPaAa+=E#H}QaI08lQ}&WgVQ^hQtv-;otLy#n0j}f} z-S@?75AwZ7#>d5kxPFpMQs2 z0b1O(^x&!G|IP3^K(KbC`C6CCt> zhrf0v_%_Y_Zxg)7gIwUJhJ@+shoML! zdxwbXfP?I&ZlG#fqJHiWWSye?Kd$5v!4SQC+r&?x}ncv!kO( zu{jNr2|@y&Ad&DtfMjR2g`rS8Ac5~)>He=Ef$v!~fTbFbitR-M{$<|6?G* zJF&GG{8$C>p6}SZ4HEcYt^zj{i%AJ#hlfAY#QRTY+;Fj)BJ=S*e-<0$n|L>fgO=EP z78)O}Wf51f4BKV9NQ<;NBWYrL+KI;1n%*Nd!k&;EM@R(05%TjY z{UmOwkII*bT-+&dzEV5Ps%fRZD&itm=#XnZaQ}_PyhN6bcDq?W@23kDoKnVz8G7gI zQXar2w2F;viOLM6HxGOfv&<(KiCBRFWCgXhfUzWvI+ce*#xh*)A9c` zp!|VN@SgA3y91QJWD@`=WRD>f!}HwMXwEuc6@62v9hBuTS@WZp5=PprcAr*=o}JfS zddf6kCs5T_FizP?Zt3}8{JahGb-ykoU2M-8B6G>6%UbGTPY(P;b%4Z`J#Iv@q2qn< zQA~AVm9X)GnnDxwx&dn&aUO!{c683P%!F#@A?>15pe@!WZhaN)#q-YGCLDtE2d$b=vKzz^GmV!9vt~B>S;?4Tt4ge{S zb>Fx@n9!>Yml6;zIQ5tn-aYQoT9sl78i5odI^n$Y@zEuleVJ{ zsNU(LpKo4nSTrK%^Jf={-HQ~wU!bqBT}b(toPyZHDTrS=1@X6>g4f&W`TrVH{=g}C z&v)$ILCXINrw|oT#~qhvlqd1^wS%I%;(DbSzk8DJ=9C_gQ^ae=&(BAQkXlS!)Kn~` zE~BDJQ&oeJQ5m;Wuq?Ji8{`b0y$`0x3~3B8YYM2V!sPX1XeRfY#D&Y{8ux|1;hxvL zJyI6OSEMtVZ%L&nn=fQ(~V@Sa}0ot~UQ}CYe*t?As}2EpVGcF|)#5LCaftCr%?ttcXI4 z^iXjG74<@+{^FIF*-~QKN~ z?r9v0*crr=C0>68QKjfAUe8Cv^+7GBxdD`Cbd-hlb>BNF#>aw`LIY3=D#f7oq0aD6 zN=b?gKVY$Z(P)EmR>Ldo6QQ@Ui0@vY;N1dU-F2IS{Fju1+(RkIUnvFox0Hg{+v)iK z8c_Z~DR|F!?A-y%UsK92i_dHP;={7F_yp1T`g)I~|GxMXe_6OLUgvq>XlB;d;=|2_ zdj>2%IO#Zu$Wz3l@4RbPIKJqrR?nt&me8y#Tmh7a8I*&r9S$y0F z*w28APx7+YV)N-7|Jmp}hIphL{&diXPbBOQNcjV$p#CMLp!QG->Q_oZ{Vk>7^>%vx zKZX>%6QFIoCU=FsV(60{=Re;=pT^s2TDQzOG-iSp%nD5 zl!E?SO2O;x^!$GeDR?J9+jdb3-t!%Mw~>PWYf1qZAL^lz!mFh(`aJrPNeB`$v8Qh# zWGbSO7Y>tcc#c>g)aVqX?Olc@b?am}9taeZeYo!FDo!YAmG;iRe11sW7Pms#O~`vkvl$d)Ktmu4ZoOmuCrz>)3E-rm%mFq zPLI=0N){;@zxQd;SVtF&=%CaVmVBz!IA4$Uo}IdNa+FstKB&edy-J=2L% zdTx{MANNnM-3eJeJV^A$_ztNygVBu)aOt7oC;J%e)A*hBb>a88w?v+%BA$B*1^t6i zem-==fCRDQ{-sMAa-xdj(n>0q;E=5W{~5l*fW)y?@@JSi#@m!lk>}ceMN%7T2&1c> z;((#5Sp|e?bDzsT#W0mSCTs38iqV%@Znmh(Ufb`)e8Zmr0|cF+DhD_BagstHJCQ@o z*31M15+)9g5Wd}(fW^dZoPbap6byWABOI@>V-7y_pycFeX>JV!8ToML_CO_ggMnaz zyupu?UFsn3?fAVFe-$@yG_bTW1E2Yu0kv#%#gAIIO~hYS0D^Ep?9{@!upC;n}`1Cusbn6~)R=w%{fZFd*zngbG{dOG6S=^@s+yk0upPh7@ zt9!qs5Q?9OZRpA1GW6?D?(-ZrnmST#5esbZn7h0tTS)jk8$`VJPDH$0psTy?0Wmw= zBp?9c+X5sQUIwNEJ|=HtzZGs)G_V4Yrz|ERDk{pYU~Xh@<6vXzsAV8+Vy-TuVq)fG zX#keAG%#}jV&YpmV))294-AgvMZsVg7#6|^Ya7Q7YHN*1Hr;jyhu1d z8jAs*Ao9ro)4|+G)Y=TlD1ic1i8>g8L4GuXpBH$-1_N%z@TkWNd0;>-7$^n`Ja7b} z?gqAiJ!Y4I84H}LI+|ErfFSvKu}~yXHJC~PdIpTC2^@k2=f~~|gBSb?0~io5#{y3x zfT_F{8pm$m!~Acs$af|7P*Y$2B)C!+-fB1lsRJ)J=T_G1 z<)1%rn6I5(a+srl_+cY!gGHW3KX@89N&Ka!$c#zMTtVFB3*WboF031ZEaDCp_uIJc z+jB%yjbX^XwU_sgre5G{a;B^By~1#=CNZ$K(a@cv;*$C0g42VgQxDVpzTP&hEh)~s z?|&+a-m<}G_F-m8b1p;ViGTVxd?@;=B`A0w;Zh-rA;i=;M!Uw1bo2BCYg?7`bp9ea zL7OgAHSnc;KSsuJUy+hU!u_q-qnArLZ%g+Dzvhg{4VDgAa4{_CzA=wG?KQ_DaaCz3 z@MSIu?22Q8eiF{*CR7yOW%+^AZloi%>9HsyooVR#X0EYd<}Rw#SLU|amCsf0sS*8i z_Qt^2mmPx!p2i97?2B!eVR`YoW1jxmB)yBr1l^t(wb!^J zrfcSfYItqz*-KKEUW-vr(TqQZj7c6`vAdn3ohHRQy^k$*^io}!F-`uA!N&c#dMSaH z@1vjZbUu=j=kqwyGqA5!6T)d^bcy@U+>6e8i1+<~W?DX1>>H45WwCazU>SuWHy zosJ35jdsgmX?$z6RBUUly##OYaVmcK(Dd=fDl}NIb}Fmkw$PU|9xVBHv?MM?EajBR zP0>0Utib3}=uUiCc36q7NO%9fWV0@2bZJ~TQ3S=&>o3_N)S_fcq79_(e<$Z4Q(Hu_J z{xpd^$@+=uB*a^%Y&kIRaZ^!YQaQ@G{zIqwy1E3ga5vr0*1b)47|zAZ$+`F&6*cJzK<*Lz;eP9_G@+<^8hMPNi;6sE$JK5WuFSBW~%9iUTbcwOP z4)aIGn-t{O6}7Nzohw&!;tWw-NQ@VSUW1lwq1dU!8v-tG-?2VUL^(9K-p@MSQB@P( zcsD7Yhd$|nFJ-(b4)^w>?7D5OQ@kxhfjm85%_=*^OXLk<}~ z%6)1(I#+OKK4m|bsc97|bZB&Z758>kC791`-8d?M@ohMR3{I-!#)D3tuh*U!B1zK* z8izSure(!PWIAXC)or_`6-cQox*2Wsbw`|wNBRRGhtzy;FHv|FgeFI$5zFKW{jFp! zno1tL&OQVtidUvSMJ7ewx=2jZLd{>zwK@5`p0wh-z}Hl<4l?n2n-W)kZGX&Kw^9Y} zl=CGkx3S>Nkw&)ecZr0r>CcvzoLqS_$&hA9a5+j+x41M^(1FeIB_&CYl4(nopWONJ z&)!)JtDzr5-RXKe>9ZH8A6tz)aaurzziKlgvI;A_iIMe_Yel{`3{XOaBCGH1`%1^< z9h#NDa;O}$IXTZv|88o2o{sk8ypnC!J#?u(a!rGLOd;Uz`mq;oRb&Tr9Goh&Z7ybp%boVZF}%JGF8i^ z(B@6qD1Yn|R~qx$RK_zEuU#{mRR19%=Wb-^RcyjwJeqoAdTZUg_@*sQn?jF$2^?hQ z(ICl9B`s*PPyF<9@!=z&15QOchCy9j zBvihCT^`jtIMBueWB16Pm6NKv>$IQ2Q%EITmdo9i9;YO&^dfP!Oz|~NxN5^!tE!41 zv!tQE>(%T;adv&jm9KUtw#Oc@uV3uW551ZZFH{CUeV``;bHPoF!QYD>i8V)+jSDd3x$)ob0Sz45IVu`G+kB_>L zoO0sc!1u9^{zNUOp3KUg!V-sZI7gOLS(Ac-r0#LkY2|R*PO2kUunVv3UYEnCU<=~8 z`%<3Vb)_978!nbF>^JtbIPBT%ynfE^)>8MG8aajO8vWJ>u6a`Q=-XZ=t;DCqJuI)k zzZnG^3~m(Qjgc2gc1u--y-v!kOcH*da#}n5CSSni+_Bb+5swNiS$vu(d-zoq>MONw zb=EI1d%X0T2+H8kgSe@<>m}WOt6G#Tc)zydo6HM8zY;z&CWCq6fJdJ=F3_rb5)nHU zI~*0iKU#yDzTfPm!zwjD>jxtWbdD~#PFQuwgC3H^WKqRuDQ@5G$8N#T?$;m?LF$@P zaB1_&>E$OCJXEX*?m6!VJZ=}iT&mb7@#r?<+!KapCleCpP`d8wS?<#+7n-}Ms`>1M zXUw=IYFhM@F;NnfhdR|52f{T{ZlW$bYD*TWWRTztVQCdl_w~1EyjN1e=G(>yJveZ% znE6tHR>q04?3}~tT3Mr%UmH6En~HGD<59{Ms_)EB_-8M3Ka(ZzxT5*GV=iAef_hk! zvql8#Ua5Pg!*96rmgEOvQr}fmmYk6~7q|Yocj`doiCcV$(I9VE>T4QG-~50-=q_}g)0b`*{!)~S&&nwh%-<4R~t}& zY=iP+qG#VEdwu%u^zaSIllN>Nbm^bVPG9(d^DfKdI78odp*z0Yi~f4``lZ56$m6=t z_ck)Dn!k?RHN7guWL%n6GZ4F)=gKQ7 zPE%77u^lA-r^MF<>;7nhB%y@$Z7VY7Cim2Wc^0$s_*>WMFj%3_^ zzBx8eye9Vx9%AoWT7EX0;^A{ixU!f$O=L8|Je7;4DzU+xa9!OEfeK|;@mz0mzZvAC zI&+mLKcP5awu++-N$!!3b|&K24Qi4O{tCB~tV<_U|Il$cTDs!AeQvS&(-RHN&8*#e zqHGsRIQpk%N<-(b(|H}$SO|(T_@kvI8XCeR4eKNtHr>GyCEm=LRty8v?#u`KR|zDFS!`*J6^0I3x|u{73LFnXnVs%K zhY3D{U-I;v@o@4xk@xY;vNp9(S$*6+58kz0VprIQIrl`}+Df7Kqq&A&dWflFZq_u> zuH>oaH<=cLWKVjIKha$N+&c{2ko}BWera2ixZc<=DVwC)?C$gRaT;^!wEISvDLR&^ zVJZrOD~`8O3ai>@mhO4!v0!kXW{6n+!XwpiiS-GyFczZ-m$>sRvlrjd&!tEBhA)bn z@l`_+6cgVc5^(9&tbO$v^2>KbR_SnWoS?5@z6%v<2$}3w756>Y?DvAWzrSF@@~V$# zWV18k6>JUr02$7i5mQ_wK$gosC1cUXUYv57&^Wy)Fuihgoxg5I96c`PI7F6L?tbXw ztpstZu@KceJ!aA9n0!m4<`2w$9zs^(wgTy{MrIjmuRhr}g|kWAc=hmf24((3I*aSr zFxT;}b7r>)Rs5pk8QG%WRS;zF3mn0H>-+q@*_z$+(ThN0@v8i&$1{}0Lbur+h(CW1 z&7Vm_AAY^>lNR^)=ES!f3}0%`Rai~;9}#>UMAodA*fl*-lAiivZJ%xsO_yXZf!i{r z;B#IdQQ7gU5;(ap6Iw}X3A$9nvE)bm)zux$b*)}SvZ|MBGFrSPLf5~dKkf0FE1!=m z*KOIHB8;cFuWsvB*@>BrQM_DI7C8Pblhtc_lt zU@vX5Tq@2AUnZXlE{`|!Jd!di9G%FTUL7B zpguj%QfyN*{o&&}QI2Qhgp3A5LKYMFMZwTaYAxS8?q6sWXjiq{b2_Er-zqLy z$%K5cx1J6u7xy-O_RmYZZf}}r8k*$1mw2hQ$w~dk{23gywcppa?=R?TJwyFw+`@RE zo{meI^~ifCxZROFoii)}6wfYet)Ay$3TcIDiJCYEXg=_mEobaAocZ$V>{1v!^BN92&XNZ2Ud%Kx^$RCB27|Mf+Fo(PRT-?czkWUR;8Rf0rw2pt0-`6P zqu1J&kkU+-#9Jnsn_60wIzmH<BtcDnrLRO z1_rJY5ghn>gMPA8=ia^zAD8R~=4&b5m$TPn)R^es67RD=_mJG_9*vO(<02*Z;ZVjB zg*2zQ(1QuMw8%T{&Bx-s-z45xHmq_QEoh?RC^EbM3^HM_B-b~WR2n)u^(L?NTEI%% zxRXuvlm5n5uglzf930#WZqOYxnR1*f%p;V3SJv^E>5gs7JwNWWcE^~m`p9lC z%B&3d^}CKi>=J0ZjJw?V1M1ob$I7V4pY!g!eEICvyn++@!wH?Y6{Cd$C(_s2>|`iz zztF_+Y9S>a<(6xZzkDgQ=IG8~SwnXbGT9hPVO;l87iu-;SA7W4L(q+I#myNWQch?J z4IgA)wWhghIVSpG1Tr}&#?P!yzgCDiZ^2U;GFUG^G&j(&fAiYC?v58ldg6}0Lek1y zf|-3?weL!9zRiI#s9AqTWEPRXd!8|MDs(?P($d?|c&xy)?BfjuhYCTy2SS{IlcfrQ zH>`rYGy#D^n_*zcxQS9CCj6GhZx|@Y=ICay<=1lG=G3FBER&roTeib5xRb^(u@w zMr|Ovq5NNIo;p=8tjfs0L?DROgE#9}vu(pwGs-`_P$HLF=VVcrj>#GpozBtcBcE{@ zUQ~TZwxssGylv2OG~wN>(e32In~zT2J{CTxS4vc_S1*fU>ApRaz@x1s@8=}DCamhC=jQXJFzKGgU(ftrAy-)L6-OV2d z0?iVlZq#w;>0;pKtE*|3%oIq|2ajS3@;l^{*Ar4Iy~U#EBcAY``v*3j^f1HlMoPgj z;`F!Cg0`X{yBzh__;U9vBkqOHodrggbhY1?ot_?;@;_@MVWtPv40#F;(TteS(bAHvym*}3oO`)15N(qQ}I555DDz;LMK&W(QVu%Kf@L9fCMPZ9!99X-ZS*G`cg4 zNt~GTqI(@qvn0W=C0?fmjZ$(VJrQX!Tx>{G#wXP=CGj@{^q`@-uaVoq^B{Tg6;W)A!>Ok2R|x+LF!&QAAg& z(H|zVZJM)b{k~u?2rjN?u-1ylEHrz331=Xu%JlG*teJbs>D5_sZS&wlC5F1&cP5wH zvBXS&kuWaors6c&=$GVnmlMg=n;9qg$X(8tCkhQ4zH&1QxZ~?LsgY2qXI1ah#iQ&d zJpST!Y@K_)-H_CUfyYI?^6XEl^ClQP1HG0vtqu4#9x(kA{HRCSA^u1S*R$KbYLnuj zqgJu1vEGkp8Ozk@#CW=WwVLi;B5R#F&~C;ZTlzG+=vL$r-h`L!0gw^b_+p!Of5aCr z-M1SG(+=~Mz2%2_IIRTN*$WeN8q7_m@695X!$tFDpT0bOu&&g23Uk{gQJvA!wDvRk zrBt8sd4P%HyZj&qdBu)KOV{utdiG9Px7&q=Q z|Co2?_D59ik<@2JJHMz~;Qo($!a^;yyU-(q23~E!bl_m1r zro1EP6`>%)&3r$K;~J2;>pbDy#7p-lyo8UtL@g-83z}Cj(mwl%j~*~34v!^8Fja}* z(&7i~>>eH?I)3;ZQ&rS~#|W~Ek{2EdGgeXcM&DwpN~pS3?`8H;{#&mL%ZF?_=b@M| zN@l0G){jXS5X5te_DM5lwkJexAIfU_5EoWU_TFWxs%OM_$hT)~DLKr7a>a2?thw%B z*h*~Qs(Q2M!KIO=soWU2Q1ne@_tvV-Yp^E=--<4Ng&Zfpy>Kl2+`C<@50cFQIcs z-sblZ2vq?#%#J&t&DpKeBcme^Hnz!Ns_t2}vnWh3L`j`3Y6w;vb79D#K0%R8tw*g0 z5#VE_BD>|Q$_}(2M{hCm1W zh!v)}t&@MoLQ(YF?AZ{(9f+#sZ~?v z=bw`dvfb+IQ+w2+Rny)iAj(Z(QDmhx($R#Q5|BCLhdD$w9lyylI6+ULI`0b6Dx)Vq8p`E_>n7$)2v$ z7FPntEhiW0J%cPOUe4n7p?510OidX&&sE>vWZW=SVp(=)vlDz z4^=)(qJG)x3|D%mBH55M+*<5PCcFBcG(A%_R(F!wKDpC8qL>ovbxNo@i;G!b-KD@< z_{k=>jZMDZfQ_$JiSTp;n}(eZb1LVlqhnZIbJpfAHcOUw zEuBf)#B^}iD%_vZOC4CS%4!RDX{+er?3~D|W0U)K*2CC$rfc(})GTF5D@(l2{gnZ! zB;RKj-M_j%Gf^Bg`L@|YEV0}(A_Q^Zyk(VI=y_ace`!MQ;Nmg&=wn|`e4yOlc-by0 zj)i^9WyXpdHTvC>5u*_+K~*o)>|#~YIePz1Y;e1mhd5#-*D4;BGdX_ckb=d-1;{7;vajrkytY;Y>+Q~bX?pU=f%e7Uz(#HZ2NV28W4`|}+zTkEx;g-1RvBwk8(`lt1V|q+GY9r>xe(jg*Xp zf&s9-dPkp>PXpYqt%CwWvlzguKYif9R_@>BpiGw(*wGE_fdj9R0vo?{kU%CP_yGlQ zAc(x9v|A3!cAtaaVc0Fb9o-$h-;cVaySI~L!Q_suATwkqsEh%2>jPb|z3UZpS;pEN zNJRl&ieRK_;s`M{w>Gxl-pCFy1p3#S6xb7MY!2-7-a5noH?lIY#rN007!Y-F)xBc@ zF*Y#;Dgt_ZE98$KN1mO&=K*wnmcXYA0JZ=cuq|W2UE&Vl4taY6YhZ)E0Qj~5A&3AD z_`_$={B|9_rDbUPYw>L@`~hDF45$MNMF3Ul2=JpYIN%hGhNFN3QW7o!9I(Ju{25df zyoQ4zfdi0iAPyWb{3uEMH3@$38XN_dL}SFzzyT-%s^f4_un7zXe?UrNK|K_lUko^) zQ8=(pj2IGBM@sU8Rim+RPyve*#{dTu0*Nnzf`OJ{C9t4nfJNZ7Ewh2jpkRAYC>-n$ z6b=oZp%5rg4=4iPI)Hj8NesRP3>3R1+;$!>L`iN}y?t$49rOsOWvfnp&FQ)16x4B(4Y{7z_-qiMgW!JcY*E<0FXR{2hxw>H*&HE->`s38y*zkU<1icKLBZK zV+-;R{Ja^}Vk5KLBcV?)N;?wSd!`3x?#;nR#LZk{QW-HwUuxl%$42A&Gy|yPf z@EQ>b96-q?SAkhCV+ACEiESUncXP*8dtjPDffrPeg7aeQ4}wMk7=oBWe#(G!9`Fxh zy)A=bfSj4%WKbv!OsoB+9DqzPDf>4W8r-7#iwuTAf~n-c$N*h{TYr}!!0!329O4h< zkl=oe-^%fWS#Q6}U}z+ek@l+$iUBvV{3hcE$KzKS3=QTW{wCuGlU09}!D0M+Y=L30 zd)k2gp$#}3wnrC^f&uw{zqRoPn}PS({NB${IFkPlx?n!+uk|6IdwoM7pnJv_j^*Ej z0|f9=nmzR)!2RpLwSflri2o+TqW0<{_x3&F5BNi1!7RaF>qEl8OuXM@NbprIzse9m z;@IypB(U#)j|>TJLj6@2i2?U!|0d%H)3|?=0n=)a3I#6ph}C4KzQv3Gr9^FtEMYKx6h|91XnvX|FE-9$W#_ZI8_u3`pd^wTpo5 z8D9)?FD9`t=$4U}bLlT^Zy{vXccD%}78 diff --git a/dsLightRag/Test/T1_Login.py b/dsLightRag/WxGzh/T1_Login.py similarity index 89% rename from dsLightRag/Test/T1_Login.py rename to dsLightRag/WxGzh/T1_Login.py index e313e8ab..83eb3e0d 100644 --- a/dsLightRag/Test/T1_Login.py +++ b/dsLightRag/WxGzh/T1_Login.py @@ -7,6 +7,8 @@ import json import logging +from torch.distributed.elastic.timer import expires + """ # 查看selenium版本 pip show selenium @@ -55,18 +57,21 @@ if __name__ == '__main__': driver.get('https://mp.weixin.qq.com/') # 获取cookies cookie_items = driver.get_cookies() + expiry=-1 # 获取到的cookies是列表形式,将cookies转成json形式并存入本地名为cookie的文本中 for cookie_item in cookie_items: cookies[cookie_item['name']] = cookie_item['value'] + if('expiry' in cookie_item and cookie_item['expiry'] > expiry): + expiry = cookie_item['expiry'] if "slave_sid" not in cookies: logging.info("登录公众号失败,获取cookie失败") exit() - # cookies = json.dumps(post) # 注释掉这一行 # 将cookies写入文件 + cookies["expiry"] = expiry with open('cookies.txt', mode='w', encoding="utf-8") as f: - f.write(json.dumps(cookies)) + f.write(json.dumps(cookies, indent=4, ensure_ascii=False)) # 关闭浏览器 driver.quit() # 输出提示 diff --git a/dsLightRag/Test/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py similarity index 89% rename from dsLightRag/Test/T2_GetArticleList.py rename to dsLightRag/WxGzh/T2_GetArticleList.py index bbf4318f..f06849b4 100644 --- a/dsLightRag/Test/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -37,6 +37,21 @@ if __name__ == '__main__': content = f.read() # 使用json还原为json对象 cookies = json.loads(content) + # "expiry": 1787106233 + # 检查是否有过期时间 + expiry=cookies["expiry"] + if expiry: + # 换算出过期时间 + expiry_time = time.localtime(expiry) + expiry_date = time.strftime("%Y-%m-%d %H:%M:%S", expiry_time) + print("cookies的过期时间一般是4天,cookies过期时间:", expiry_date) + # 获取当前时间戳 + current_timestamp = time.time() + # 检查是否已过期 + if current_timestamp > expiry: + print("Cookie已过期") + exit() + options = Options() options.add_argument('-headless') # 无头参数,调试时可以注释掉 # 设置headers - 使用微信内置浏览器的User-Agent diff --git a/dsLightRag/Test/T3_GetArticle.py b/dsLightRag/WxGzh/T3_GetArticle.py similarity index 100% rename from dsLightRag/Test/T3_GetArticle.py rename to dsLightRag/WxGzh/T3_GetArticle.py diff --git a/dsLightRag/WxGzh/__init__.py b/dsLightRag/WxGzh/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsLightRag/Test/article_urls.txt b/dsLightRag/WxGzh/article_urls.txt similarity index 100% rename from dsLightRag/Test/article_urls.txt rename to dsLightRag/WxGzh/article_urls.txt diff --git a/dsLightRag/WxGzh/cookies.txt b/dsLightRag/WxGzh/cookies.txt new file mode 100644 index 00000000..7183301e --- /dev/null +++ b/dsLightRag/WxGzh/cookies.txt @@ -0,0 +1,17 @@ +{ + "_clsk": "2gtve8|1752546228205|1|1|mp.weixin.qq.com/weheat-agent/payload/record", + "xid": "16332bed01be1055e236ad45b33af8df", + "data_bizuin": "3514353238", + "slave_user": "gh_4f88a4e194da", + "slave_sid": "QzBRX1FWTXNMaEdJYnc4ODBaM3FJU3RRbjVJNFE2N2IzMXFyVGlRQ0V5YklvNGFOc3NBWHdjV2J5OVg5U0JBVXdfdGhSU3lObXRheG1TdFUyXzVFcTFYS3E1NTh2aTlnSlBOOUluMUljUnBkYktjeUJDM216WVJNYzJKQkx2eW9Ib1duUk1yWXI3RndTa2dK", + "rand_info": "CAESIFwUSYus3XR5tFa1+b5ytJeuGAQS02d07zNBJNfi+Ftk", + "data_ticket": "9gQ088/vC7+jqxfFxBKS2aRx/JjmzJt+8HyuDLJtQBgpVej1hfSG1A0FQKWBbHQh", + "bizuin": "3514353238", + "mm_lang": "zh_CN", + "slave_bizuin": "3514353238", + "uuid": "8c5dc8e06af66d00a4b8e8596c8662eb", + "ua_id": "y1HZNMSzYCWuaUJDAAAAAApPVJ0a_arX_A5zqoUh6P8=", + "wxuin": "52546211515015", + "_clck": "msq32d|1|fxm|0", + "expiry": 1787106233 +} \ No newline at end of file From 36f57caaee2e54f7894bf4301b2b0f206429756f Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 10:34:58 +0800 Subject: [PATCH 17/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index f06849b4..0c40c584 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -39,7 +39,7 @@ if __name__ == '__main__': cookies = json.loads(content) # "expiry": 1787106233 # 检查是否有过期时间 - expiry=cookies["expiry"] + expiry = cookies["expiry"] if expiry: # 换算出过期时间 expiry_time = time.localtime(expiry) @@ -51,6 +51,8 @@ if __name__ == '__main__': if current_timestamp > expiry: print("Cookie已过期") exit() + # 移除expiry属性 + del cookies["expiry"] options = Options() options.add_argument('-headless') # 无头参数,调试时可以注释掉 @@ -80,7 +82,8 @@ if __name__ == '__main__': logging.info("微信token:" + token) article_urls = [] - gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}] + gzlist = [{"account_name": "长春教育八卦阵", "account_id": "jybg100"}, + {"account_name": "致知物理", "account_id": "zhizhiphysics"}] for item in gzlist: account_name = item["account_name"] account_id = item["account_id"] @@ -147,7 +150,7 @@ if __name__ == '__main__': # 将返回的地址写入到文件 with open('article_urls.txt', 'w', encoding='utf-8') as f: for record in article_urls: - f.write(record['title']+" "+record['publish_time']+" "+record['url'] + '\n') + f.write(record['title'] + " " + record['publish_time'] + " " + record['url'] + '\n') # 关闭浏览器 driver.quit() From bf4ce692d7a629668d7a0ddfa37c128ba92b4c9e Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 10:35:45 +0800 Subject: [PATCH 18/46] 'commit' --- dsLightRag/WxGzh/article_urls.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dsLightRag/WxGzh/article_urls.txt b/dsLightRag/WxGzh/article_urls.txt index c8d57b34..06ea3b09 100644 --- a/dsLightRag/WxGzh/article_urls.txt +++ b/dsLightRag/WxGzh/article_urls.txt @@ -9,3 +9,9 @@ 长春市第十九中学2025年职称评聘拟通过人员名单的公示!有你认识的老师吗? 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd 高分喜报频传!长春这所小学靠啥成为“学霸制造机”? 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd 蝉联冠军!吉大尚德游泳队斩获骄人成绩! 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd +明日(16日)公布高中一批次录取结果,查询通道在此,请收好! 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=1&sn=431c1f89b968ddff2165466ce20b2976&chksm=feb6a485c9c12d930da2340a813d24d5dd168688af162b4e7bdcd42d5c31d832ef9dc915b1dc#rd +长春2024-2025九上试卷合集(赠答案) 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=2&sn=ffd3261777f03cdfcd78e7a899f25778&chksm=feb6a485c9c12d93a16db0ce8d9850cc843b22442b5d49850f93c99efbbc160855ff1a2314cf#rd +网传各初中2025中考最高分准吗? 2025-07-13 09:22:08 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546662&idx=1&sn=5c8a0cc82f0aab69a600d06b6e63a57f&chksm=feb6a48bc9c12d9da8ed3b2a19d12fa275f83796201448996bdfa2204bf41f7d826430ae30be#rd +长春市2025年中考各批次录取最低控制线确定 2025-07-12 10:05:49 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546644&idx=1&sn=ea3371033b95e7203e881947c980a8a4&chksm=feb6a4b9c9c12daf64cbd87239cd2fdc22a93e6e0d2555ce3c5b66bc0e96028bfc3565021201#rd +长春市2025年中考成绩将于7月12日12时公布 2025-07-11 15:13:54 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546642&idx=1&sn=718e0fa8463273260dae093e0686b7e0&chksm=feb6a4bfc9c12da9ab59ff8d7da3caefbb44d8195bb694b31cd5b3cab8fc6505b20c61a0ae62#rd +长春2024-2025九上试卷合集(赠答案) 2025-07-09 10:56:48 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546544&idx=1&sn=d07bf1b38403c0578ad67ae007ce6159&chksm=feb6a51dc9c12c0b9b90a2131a9ba913b92ed2eab3dcaa78fadccb6b231e4f5cb4247750f910#rd From 352d2d71a4f582c119001bdf47fc75c47366fde3 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:00:49 +0800 Subject: [PATCH 19/46] 'commit' --- .../Config/__pycache__/Config.cpython-310.pyc | Bin 911 -> 911 bytes .../PostgreSQLUtil.cpython-310.pyc | Bin 1565 -> 1565 bytes dsLightRag/WxGzh/T2_GetArticleList.py | 21 ++++++++++++++++-- dsLightRag/WxGzh/article_urls.txt | 12 +++++----- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc index 76e8086d4dbc8bb523dc4cfcd66e1f5ab1971f11..d08d90dfe3ea0cfe56c5b74d7d02171c7f2b11d0 100644 GIT binary patch delta 220 zcmeBY?`P-E=jG*M0D|S-B^$Z#GS){i7bT|qX^KR#Ir>D#J96>IX0U4z*tW?tnWPz`CU0d*Wn%|= Lq=D#J9){%d?22$zl*C+yq}}5>n#CSUnf@=WD#D72uwR4gbO!-6T)}# z^fkIA0@dso5g+33?dlgF>>1^Hi__25-^2@Oo98V)C^OhK2yENrnM~4*w3R?< ziZ{SNIK(~3H8|b_$Ux!-_y-|#LxWv|q6FcJ0~~{c!~KI?q9%84j53qESSB;dO_pPo4Y Date: Tue, 15 Jul 2025 11:04:40 +0800 Subject: [PATCH 20/46] 'commit' --- dsLightRag/WxGzh/T3_GetArticle.py | 41 --------------------------- dsLightRag/WxGzh/Util/WxGzhUtil.py | 45 ++++++++++++++++++++++++++++++ dsLightRag/WxGzh/Util/__init__.py | 0 3 files changed, 45 insertions(+), 41 deletions(-) delete mode 100644 dsLightRag/WxGzh/T3_GetArticle.py create mode 100644 dsLightRag/WxGzh/Util/WxGzhUtil.py create mode 100644 dsLightRag/WxGzh/Util/__init__.py diff --git a/dsLightRag/WxGzh/T3_GetArticle.py b/dsLightRag/WxGzh/T3_GetArticle.py deleted file mode 100644 index 2f9c9e27..00000000 --- a/dsLightRag/WxGzh/T3_GetArticle.py +++ /dev/null @@ -1,41 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService -from selenium.webdriver.common.by import By - -url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' - -options = Options() -options.add_argument('-headless') # 无头参数,调试时可以注释掉 -service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") -driver = webdriver.Chrome(service=service, options=options) -driver.get(url) -# 可以只要txt -html_content = driver.find_element(By.CLASS_NAME, "rich_media").text -# 第一行是标题,分离出来 -title = html_content.split('\n')[0] -print(title) - -# 按行遍历html_content,当发现空行时,删除空行前面的内容,只保留后面的内容 -lines = html_content.split('\n') -content_after_empty_line = "" -found_empty_line = False - -for line in lines: - if not found_empty_line and line.strip() == "": - # 找到第一个空行 - found_empty_line = True - continue - - if found_empty_line: - # 空行后的内容添加到结果中 - content_after_empty_line += line + "\n" - -# 如果没有找到空行,保留原始内容 -if not found_empty_line: - content_after_empty_line = html_content - -content_after_empty_line = content_after_empty_line.replace("\n\n", "\n") -print(content_after_empty_line) -# 关闭浏览器 -driver.quit() diff --git a/dsLightRag/WxGzh/Util/WxGzhUtil.py b/dsLightRag/WxGzh/Util/WxGzhUtil.py new file mode 100644 index 00000000..6b6d55b1 --- /dev/null +++ b/dsLightRag/WxGzh/Util/WxGzhUtil.py @@ -0,0 +1,45 @@ +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By + +def get_article_content(url): + """ + 获取微信公众号文章内容 + :param url: 文章URL + :return: 文章内容文本 + """ + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service, options=options) + + try: + driver.get(url) + html_content = driver.find_element(By.CLASS_NAME, "rich_media").text + + # 处理内容,提取空行后的文本 + lines = html_content.split('\n') + content_after_empty_line = "" + found_empty_line = False + + for line in lines: + if not found_empty_line and line.strip() == "": + found_empty_line = True + continue + + if found_empty_line: + content_after_empty_line += line + "\n" + + if not found_empty_line: + content_after_empty_line = html_content + + return content_after_empty_line.replace("\n\n", "\n") + finally: + driver.quit() + +if __name__ == '__main__': + # 示例用法 + url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' + content = get_article_content(url) + print(content) diff --git a/dsLightRag/WxGzh/Util/__init__.py b/dsLightRag/WxGzh/Util/__init__.py new file mode 100644 index 00000000..e69de29b From 2ad3154fe87ddd18612f75a77ff603c68439edfd Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:09:16 +0800 Subject: [PATCH 21/46] 'commit' --- dsLightRag/Util/WxGzhUtil.py | 100 ++++++++++++++++++ .../__pycache__/WxGzhUtil.cpython-310.pyc | Bin 0 -> 2483 bytes dsLightRag/WxGzh/T2_GetArticleList.py | 15 ++- dsLightRag/WxGzh/Util/WxGzhUtil.py | 45 -------- dsLightRag/WxGzh/Util/__init__.py | 0 dsLightRag/WxGzh/article_urls.txt | 17 +-- 6 files changed, 122 insertions(+), 55 deletions(-) create mode 100644 dsLightRag/Util/WxGzhUtil.py create mode 100644 dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc delete mode 100644 dsLightRag/WxGzh/Util/WxGzhUtil.py delete mode 100644 dsLightRag/WxGzh/Util/__init__.py diff --git a/dsLightRag/Util/WxGzhUtil.py b/dsLightRag/Util/WxGzhUtil.py new file mode 100644 index 00000000..d4bf9138 --- /dev/null +++ b/dsLightRag/Util/WxGzhUtil.py @@ -0,0 +1,100 @@ +import datetime +import random +import requests +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service as ChromeService +from selenium.webdriver.common.by import By + +def init_wechat_browser(): + """初始化微信爬虫浏览器实例""" + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + return webdriver.Chrome(service=service, options=options) + +def get_wechat_articles(account_name, account_id, token, cookies, header): + """获取指定公众号的文章列表""" + article_urls = [] + + # 搜索微信公众号的接口地址 + search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' + query_id = { + 'action': 'search_biz', + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'query': account_name, + 'begin': '0', + 'count': '5' + } + + # 完整实现搜索和获取文章逻辑 + search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) + lists = search_response.json().get('list')[0] + fakeid = lists.get('fakeid') + + # 微信公众号文章接口 + appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' + query_id_data = { + 'token': token, + 'lang': 'zh_CN', + 'f': 'json', + 'ajax': '1', + 'random': random.random(), + 'action': 'list_ex', + 'begin': '0', + 'count': '5', + 'query': '', + 'fakeid': fakeid, + 'type': '9' + } + + query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) + fakeid_list = query_fakeid_response.json().get('app_msg_list') + + for item in fakeid_list: + article_urls.append({ + 'title': item.get('title'), + 'url': item.get('link'), + 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') + }) + + return article_urls + +def get_article_content(url): + """ + 获取微信公众号文章内容 + :param url: 文章URL + :return: 文章内容文本 + """ + options = Options() + options.add_argument('-headless') + service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") + driver = webdriver.Chrome(service=service, options=options) + + try: + driver.get(url) + html_content = driver.find_element(By.CLASS_NAME, "rich_media").text + + # 处理内容,提取空行后的文本 + lines = html_content.split('\n') + content_after_empty_line = "" + found_empty_line = False + + for line in lines: + if not found_empty_line and line.strip() == "": + found_empty_line = True + continue + + if found_empty_line: + content_after_empty_line += line + "\n" + + if not found_empty_line: + content_after_empty_line = html_content + + return content_after_empty_line.replace("\n\n", "\n") + finally: + driver.quit() \ No newline at end of file diff --git a/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9742d49875cc692f0c4f8f3f838f7f92bca1af9 GIT binary patch literal 2483 zcmai0>x&#k5TEXO?(E|(m&Ew?l$^Ng-KkN;MU8rDMDmcF5?oG(x#_;$x!ZYMPp|i2 z76e~`81pS+h_d@hL&AL%5hQ-{pXjeR(CvkbgN><_eNg*c=HW0kn13}cO;MXg@>8Ftc+*O4e~&}G8HcPRtGK9TY; zi8IMAQub~bP+;*(TY*#`Khj+Tehcu+9s^$^!rjU43~ zG0tMx*6bbx_2KC+hF_f-esX?zb?4d-gBz!>-2D9V^>5DI{Oa`Z^RI_HUtYU<=2mI8 zO?}d#S$6B-LSyMv7?WfpTUy-8IE@ZHu@tmf64Ao*Il4(NAu$0%(B=MehkB{c+n2Dg zv*HxOPDUp)nGP<9icg5=vsO2vF&EVZWo>k1CJ!jAVy=!ET$#qoKrU%)5{6rkz1&zL z*{OuB!SDu4R%yV2I~umy{G{JndW(mhrBjy=4X`p{xX^ zG6#r-UF4z~VpAXlCXn$XcayuTNDE+ETF)e?XAzu3go!~+uJyG6#9m%EaW0m6Hb<*C zw{kr`uafYW8w}FDc_;>N_h#S$^i9TQtg%hZj02Et%=+vlVs1ua<8H+ zmX*bdvRKWlBa2gDqn6iR*50|NH<6otBcA}vdk093`v!=ct7hNgwvzV`(EE@feTUoz zo*cNS%WPRcmYa|px2xQ^`RTXAbLX#r^6_xz!tlc@*S`O3c<$R97e2Ut{^J{$`@_?p z-5h+q{YabhG;7SyN9o)K4L8GhZhd_&NTT_m70xb)@q9*o7PObc?Zd(?cn<_&ZnwRK zV`8B19vtkh?Il!2Wv;|#&Q32Vh)qaoRT=zm@J(=^IjhlMRa z_UNY21bU`kW_N-ac9$e~OLC7S_eyeyB-4`IFUft9JP1S>d@H4p>1pbLMNis;6NbD) zrDyS)D0frhbL#OhqT73B-kzO_W@pIs%xjIAV~v@`y3QVzcJwamh)UXB1`BPaTra7u zg~KSsB;%RTTa>f?Qrm+4XiS77O-sw7x&ly@T*mz<6?zDWb09D)a;`oJ9tJ9wUhIRm zJC;IHocRG|wTLa~G9Cs1aPTSHy<6#^qi)9wiI^DOIHOq##f%D59z839;ui6h-;3(_ z6Ayg##r{GpZpz#Jx;&at@iRgwKdK58WRG-iFC2ah?va0)gl_ga5Ca=J#uyc^H6DMP zs!jpw|6CidjP!n;s@0H#4dlRkCvtx`EcObR3uHQtA6Y_V1yS{32N+d}QI7smiLt80 zI7hM)_w?=Oa=mZnn#3P)fX`Kjm;=08>KVDwcZmhKHi-?D#_8F)3HaU*b;QmZP)o`K zjo53LH3(Fj_1WBJFG0O=h;=}FTI*SICOeWFJuA2R6=*xw0H2b(R0kT;7I(P~tzPa@ zaN3=QrvQG@5q5z;Lj6Y{4tKs&S`CG)X}G+`bg}r>$)m*zqrA)F@$^uA11i@qUQz4J zf}NNxiHsPN9l7bUFlc+wMnYd8m&soC>YLlxb?Y_NCRkOf)+CWVflWyA1dwKB1sak^ zJ5=@$CTn)nT{!yU;-Yu_#bd7k^ynslElWG095@S8VKbU``~bkW-VHfGXkJ>|Z7#wp znpv~b=26EBl9;$OSC)ktHcU68K2M*v2p@_q7 zRP@gK$0C&H(Z7qiqG8XC`f$;O|3h!oxS9N;p>Cjg_rDoUq9}>ymbcUus0OM~qTKvT i2qYSr!m9IZl#p(RJ}+^U?E~Uy5@MCn>tM&O9Qp%Tx!MB& literal 0 HcmV?d00001 diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index a9b9ae09..8aff8387 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -14,6 +14,7 @@ import re import requests import asyncio from Util.PostgreSQLUtil import init_postgres_pool +from Util.WxGzhUtil import init_wechat_browser, get_wechat_articles async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" @@ -78,7 +79,9 @@ if __name__ == '__main__': } service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) + driver = webdriver.Chrome(service=service, options=options) # 删除这行 + # 使用统一的初始化方式 + driver = init_wechat_browser() # 方法3:使用requests库发送请求获取重定向URL url = 'https://mp.weixin.qq.com' @@ -93,15 +96,20 @@ if __name__ == '__main__': logging.info("微信token:" + token) article_urls = [] - # 替换硬编码的gzlist + # 初始化浏览器 + driver = init_wechat_browser() + + # 获取公众号列表 loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: gzlist = loop.run_until_complete(get_wechat_sources()) finally: loop.close() - + + # 爬取文章 for item in gzlist: + article_urls = get_wechat_articles(item["account_name"], item["account_id"], token, cookies, header) account_name = item["account_name"] account_id = item["account_id"] # 搜索微信公众号的接口地址 @@ -171,4 +179,3 @@ if __name__ == '__main__': # 关闭浏览器 driver.quit() - print("所有文章爬取完成!") diff --git a/dsLightRag/WxGzh/Util/WxGzhUtil.py b/dsLightRag/WxGzh/Util/WxGzhUtil.py deleted file mode 100644 index 6b6d55b1..00000000 --- a/dsLightRag/WxGzh/Util/WxGzhUtil.py +++ /dev/null @@ -1,45 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service as ChromeService -from selenium.webdriver.common.by import By - -def get_article_content(url): - """ - 获取微信公众号文章内容 - :param url: 文章URL - :return: 文章内容文本 - """ - options = Options() - options.add_argument('-headless') - service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) - - try: - driver.get(url) - html_content = driver.find_element(By.CLASS_NAME, "rich_media").text - - # 处理内容,提取空行后的文本 - lines = html_content.split('\n') - content_after_empty_line = "" - found_empty_line = False - - for line in lines: - if not found_empty_line and line.strip() == "": - found_empty_line = True - continue - - if found_empty_line: - content_after_empty_line += line + "\n" - - if not found_empty_line: - content_after_empty_line = html_content - - return content_after_empty_line.replace("\n\n", "\n") - finally: - driver.quit() - -if __name__ == '__main__': - # 示例用法 - url = 'http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd' - content = get_article_content(url) - print(content) diff --git a/dsLightRag/WxGzh/Util/__init__.py b/dsLightRag/WxGzh/Util/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dsLightRag/WxGzh/article_urls.txt b/dsLightRag/WxGzh/article_urls.txt index 7e5eccc0..dc2077eb 100644 --- a/dsLightRag/WxGzh/article_urls.txt +++ b/dsLightRag/WxGzh/article_urls.txt @@ -1,9 +1,14 @@ -明日(16日)公布高中一批次录取结果,查询通道在此,请收好! 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=1&sn=431c1f89b968ddff2165466ce20b2976&chksm=feb6a485c9c12d930da2340a813d24d5dd168688af162b4e7bdcd42d5c31d832ef9dc915b1dc#rd -长春2024-2025九上试卷合集(赠答案) 2025-07-15 08:38:04 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546664&idx=2&sn=ffd3261777f03cdfcd78e7a899f25778&chksm=feb6a485c9c12d93a16db0ce8d9850cc843b22442b5d49850f93c99efbbc160855ff1a2314cf#rd -网传各初中2025中考最高分准吗? 2025-07-13 09:22:08 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546662&idx=1&sn=5c8a0cc82f0aab69a600d06b6e63a57f&chksm=feb6a48bc9c12d9da8ed3b2a19d12fa275f83796201448996bdfa2204bf41f7d826430ae30be#rd -长春市2025年中考各批次录取最低控制线确定 2025-07-12 10:05:49 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546644&idx=1&sn=ea3371033b95e7203e881947c980a8a4&chksm=feb6a4b9c9c12daf64cbd87239cd2fdc22a93e6e0d2555ce3c5b66bc0e96028bfc3565021201#rd -长春市2025年中考成绩将于7月12日12时公布 2025-07-11 15:13:54 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546642&idx=1&sn=718e0fa8463273260dae093e0686b7e0&chksm=feb6a4bfc9c12da9ab59ff8d7da3caefbb44d8195bb694b31cd5b3cab8fc6505b20c61a0ae62#rd -长春2024-2025九上试卷合集(赠答案) 2025-07-09 10:56:48 http://mp.weixin.qq.com/s?__biz=MzU5OTQ0MzEzOA==&mid=2247546544&idx=1&sn=d07bf1b38403c0578ad67ae007ce6159&chksm=feb6a51dc9c12c0b9b90a2131a9ba913b92ed2eab3dcaa78fadccb6b231e4f5cb4247750f910#rd +长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd +独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd +长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd +喜报!长春外国语学校女子篮球队夺得冠军! 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=2&sn=31651043acb6ecbf4232e92e635196b6&chksm=84e1ab1db396220b0810c3bdf332128b110d1902658f2556eaeff67cec084a8a068a5ae9a275#rd +“趣闯盛夏·探无界”!探秘一实验银河小学夏令营 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=3&sn=8edf6ce8cebdaad55343b39639876c27&chksm=84e1ab1db396220b26b172b3b565f919f7ded4c2a5b78227294ea29a558a7666c33b8c1de660#rd +刚刚!2025年长春中考各批次控制线公布! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=1&sn=282e5e824410a9a92a83dd800cb58a7c&chksm=84e1aba6b39622b03fe6422032474c9696f83541d9ff9b8b6a9f0f099ce459da430f720d05e4#rd +重磅消息!师大附属实验学校(经开)校长有新任命! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=2&sn=9449c87935faf86ddcc5a674ea888913&chksm=84e1aba6b39622b03fd8413ff1e74b61f662ec8deb3887c2c5b8e5ad15470b15ae14b21e94ea#rd +市教育局最新发布!长春2025年中考成绩将于7月12日公布! 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=1&sn=a5b4104d2fe74ace32ab31faf5f1c44c&chksm=84e1abb2b39622a40b0e0969e84fb00c753cc8ffefb8624726afa2a7352ea725c7f967bf25f5#rd +长春市第十九中学2025年职称评聘拟通过人员名单的公示!有你认识的老师吗? 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd +高分喜报频传!长春这所小学靠啥成为“学霸制造机”? 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd +蝉联冠军!吉大尚德游泳队斩获骄人成绩! 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd 长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd 独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd 长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd From 24de098979caeea9e6c0add92c234cbbc2126229 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:15:40 +0800 Subject: [PATCH 22/46] 'commit' --- dsLightRag/Util/WxGzhUtil.py | 53 ------------------ .../__pycache__/WxGzhUtil.cpython-310.pyc | Bin 2483 -> 1364 bytes dsLightRag/WxGzh/T2_GetArticleList.py | 37 +++++------- 3 files changed, 13 insertions(+), 77 deletions(-) diff --git a/dsLightRag/Util/WxGzhUtil.py b/dsLightRag/Util/WxGzhUtil.py index d4bf9138..07abec69 100644 --- a/dsLightRag/Util/WxGzhUtil.py +++ b/dsLightRag/Util/WxGzhUtil.py @@ -1,6 +1,3 @@ -import datetime -import random -import requests from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService @@ -13,56 +10,6 @@ def init_wechat_browser(): service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") return webdriver.Chrome(service=service, options=options) -def get_wechat_articles(account_name, account_id, token, cookies, header): - """获取指定公众号的文章列表""" - article_urls = [] - - # 搜索微信公众号的接口地址 - search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' - query_id = { - 'action': 'search_biz', - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'query': account_name, - 'begin': '0', - 'count': '5' - } - - # 完整实现搜索和获取文章逻辑 - search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id) - lists = search_response.json().get('list')[0] - fakeid = lists.get('fakeid') - - # 微信公众号文章接口 - appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?' - query_id_data = { - 'token': token, - 'lang': 'zh_CN', - 'f': 'json', - 'ajax': '1', - 'random': random.random(), - 'action': 'list_ex', - 'begin': '0', - 'count': '5', - 'query': '', - 'fakeid': fakeid, - 'type': '9' - } - - query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data) - fakeid_list = query_fakeid_response.json().get('app_msg_list') - - for item in fakeid_list: - article_urls.append({ - 'title': item.get('title'), - 'url': item.get('link'), - 'publish_time': datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') - }) - - return article_urls def get_article_content(url): """ diff --git a/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/WxGzhUtil.cpython-310.pyc index b9742d49875cc692f0c4f8f3f838f7f92bca1af9..5075ff6ce5e093e8119550083361d00de35b186a 100644 GIT binary patch delta 198 zcmdlie1(fIpO=@50SNXUFU_!Jo5&~Q?*indFr+Z%Fyt~uF*1VKOgT)s%u&o>HggV3 zE^8DUn9Y*Ip34!%0c5kLu(dEmai*}RZ~$qp6wY7zB)dYBeB0LN{j3CIt#KQpqwXP?V delta 1323 zcmaJ=L66%+6!zF-$99}#n>Nsvwv>ugjbJw{g#&x3(6$mnYI~@JL?o~r&urq|*zt}h z>^8EM14ZqDvY?e(soFz)Ljob89(v)*Kj47oOxkkdgg7Db#z~5};E})g-preCzW2ud z#eRIeI;>P|1s?J9sQ-uaP4&!#KtvRR21;jcQ3n;V+EKx(11-cIjO3a&&_knR6h0o5 zLbGEPzCN(Sa;FTwK}vU&jzdghfvpgml)+XBx}~(7-{rwKT1Z%Xv`bj<8D%Z>8w0c$f#^LYSMIv?)PNZ^4N^#s}y*xUB@g~7at3`3#sjwrh= z^e*iMkn0ae5pQXtru`cGTVRyX+(r6sU@?hY_VY|y@Y#VEVN(}abZGlw&Y}M1vL6a zJKe~GbbA>EWM1`!f_C0s$Ynh62h;^o$&34y3eDnz+<=JM;?@~W;$f6fq08cvd0T8R zeqXj&5{Sxjj|*x&(U^zx+j3uQFE1vF7UF=@@NgYB$R73k9(Nal68Xm_9|B+%YpSZE z`7oCDuX?2pKl{IH%aujmpY>`LnJPvmjMK>e8yoD}V@v%MVRlJT4*S{#oxKhGhX>kK zowcR+tCpe7!;kRg#!U!YHTDje`FF7{*T2U1jcao1=HWU0;>i=>nu_Fi7Q Date: Tue, 15 Jul 2025 11:16:53 +0800 Subject: [PATCH 23/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 153e43a6..65af9f41 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -27,6 +27,8 @@ async def get_wechat_sources(): return [dict(row) for row in rows] finally: await pool.close() + + """ # 查看selenium版本 pip show selenium @@ -100,7 +102,7 @@ if __name__ == '__main__': article_urls = [] # 初始化浏览器 driver = init_wechat_browser() - + # 获取公众号列表 loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) @@ -108,7 +110,7 @@ if __name__ == '__main__': gzlist = loop.run_until_complete(get_wechat_sources()) finally: loop.close() - + # 爬取文章 for item in gzlist: account_name = item["account_name"] @@ -159,12 +161,14 @@ if __name__ == '__main__': article_url = item.get('link') article_title = item.get('title') publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') - + # 直接获取并显示文章内容 + if '试卷' in article_title: # 过滤掉试卷 + continue print(f"正在处理文章: {article_title} ({publish_time})") content = get_article_content(article_url) print(f"文章内容预览: {content[:200]}...") - + time.sleep(1) # 关闭浏览器 driver.quit() From 7101b19cb7fea5bbf75ce13dc95880661ec4ccb3 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:28:01 +0800 Subject: [PATCH 24/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 33 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 65af9f41..0bb8944c 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -47,6 +47,20 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService +async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, account_id): + try: + async with pool.acquire() as conn: + # 确保account_id是整数 + account_id_int = int(account_id) if account_id else 0 + await conn.execute(''' + INSERT INTO t_wechat_articles + (title, source, url, publish_time, content, source_id) + VALUES ($1, $2, $3, $4, $5, $6) + ''', article_title, account_name, article_url, + publish_time, content, account_id_int) # 修改为整数类型 + except Exception as e: + logging.error(f"保存文章失败: {e}") + if __name__ == '__main__': # 从文件cookies.txt中获取 with open('cookies.txt', 'r', encoding='utf-8') as f: @@ -157,17 +171,26 @@ if __name__ == '__main__': fakeid_list = query_fakeid_response.json().get('app_msg_list') for item in fakeid_list: - # 采集item示例 article_url = item.get('link') article_title = item.get('title') - publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))).strftime('%Y-%m-%d %H:%M:%S') + publish_time = datetime.datetime.fromtimestamp(int(item.get("update_time"))) - # 直接获取并显示文章内容 - if '试卷' in article_title: # 过滤掉试卷 + if '试卷' in article_title: # 过滤掉试卷 continue + print(f"正在处理文章: {article_title} ({publish_time})") content = get_article_content(article_url) - print(f"文章内容预览: {content[:200]}...") + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + pool = loop.run_until_complete(init_postgres_pool()) + loop.run_until_complete( + save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, + account_id)) + finally: + loop.run_until_complete(pool.close()) + loop.close() time.sleep(1) # 关闭浏览器 From ed4aa1bd69a80592b284258656a03bfb4b0b5946 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:33:27 +0800 Subject: [PATCH 25/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 0bb8944c..207f5bbd 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -23,7 +23,7 @@ async def get_wechat_sources(): try: pool = await init_postgres_pool() async with pool.acquire() as conn: - rows = await conn.fetch('SELECT account_id, account_name FROM t_wechat_source') + rows = await conn.fetch('SELECT * FROM t_wechat_source') return [dict(row) for row in rows] finally: await pool.close() @@ -43,24 +43,24 @@ https://googlechromelabs.github.io/chrome-for-testing/ https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.94/win64/chromedriver-win64.zip """ import time -from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService -async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, account_id): + +async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id): try: async with pool.acquire() as conn: - # 确保account_id是整数 - account_id_int = int(account_id) if account_id else 0 + # 更安全的account_id转换逻辑 await conn.execute(''' - INSERT INTO t_wechat_articles - (title, source, url, publish_time, content, source_id) - VALUES ($1, $2, $3, $4, $5, $6) - ''', article_title, account_name, article_url, - publish_time, content, account_id_int) # 修改为整数类型 + INSERT INTO t_wechat_articles + (title, source, url, publish_time, content, source_id) + VALUES ($1, $2, $3, $4, $5, $6) + ''', article_title, account_name, article_url, + publish_time, content, id) except Exception as e: logging.error(f"保存文章失败: {e}") + if __name__ == '__main__': # 从文件cookies.txt中获取 with open('cookies.txt', 'r', encoding='utf-8') as f: @@ -97,7 +97,6 @@ if __name__ == '__main__': } service = ChromeService(executable_path=r"C:\Windows\System32\chromedriver.exe") - driver = webdriver.Chrome(service=service, options=options) # 删除这行 # 使用统一的初始化方式 driver = init_wechat_browser() @@ -114,8 +113,6 @@ if __name__ == '__main__': logging.info("微信token:" + token) article_urls = [] - # 初始化浏览器 - driver = init_wechat_browser() # 获取公众号列表 loop = asyncio.new_event_loop() @@ -129,6 +126,7 @@ if __name__ == '__main__': for item in gzlist: account_name = item["account_name"] account_id = item["account_id"] + id = item["id"] # 搜索微信公众号的接口地址 search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?' # 搜索微信公众号接口需要传入的参数,有三个变量:微信公众号token、随机数random、搜索的微信公众号名字 @@ -187,7 +185,7 @@ if __name__ == '__main__': pool = loop.run_until_complete(init_postgres_pool()) loop.run_until_complete( save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, - account_id)) + id)) finally: loop.run_until_complete(pool.close()) loop.close() From 0686c3332cada42ae652a5cf36677c3a1d5aa41b Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:36:28 +0800 Subject: [PATCH 26/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 30 +++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 207f5bbd..99ec2cfa 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -47,16 +47,34 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service as ChromeService +async def is_article_exist(pool, article_url): + """检查文章URL是否已存在数据库中""" + try: + async with pool.acquire() as conn: + row = await conn.fetchrow(''' + SELECT 1 FROM t_wechat_articles + WHERE url = $1 LIMIT 1 + ''', article_url) + return row is not None + except Exception as e: + logging.error(f"检查文章存在性失败: {e}") + return False # 出错时默认返回False,避免影响正常流程 + + async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id): + # 先检查文章是否已存在 + if await is_article_exist(pool, article_url): + logging.info(f"文章已存在,跳过保存: {article_url}") + return + try: async with pool.acquire() as conn: - # 更安全的account_id转换逻辑 await conn.execute(''' - INSERT INTO t_wechat_articles - (title, source, url, publish_time, content, source_id) - VALUES ($1, $2, $3, $4, $5, $6) - ''', article_title, account_name, article_url, - publish_time, content, id) + INSERT INTO t_wechat_articles + (title, source, url, publish_time, content, source_id) + VALUES ($1, $2, $3, $4, $5, $6) + ''', article_title, account_name, article_url, + publish_time, content, id) except Exception as e: logging.error(f"保存文章失败: {e}") From a4c4bd7475d72ce432811bc5f526ab8528a96fa4 Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Tue, 15 Jul 2025 11:37:29 +0800 Subject: [PATCH 27/46] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/controller/UserController.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/dsAiTeachingModel/api/controller/UserController.py b/dsAiTeachingModel/api/controller/UserController.py index e23d8f5f..bff5dd85 100644 --- a/dsAiTeachingModel/api/controller/UserController.py +++ b/dsAiTeachingModel/api/controller/UserController.py @@ -3,6 +3,7 @@ import re from fastapi import APIRouter, Request, Response, Depends from auth.dependencies import * +from utils.CommonUtil import md5_encrypt from utils.Database import * from utils.ParseRequest import * @@ -29,4 +30,21 @@ async def modify_telephone(request: Request): # 【Base-User-2】维护用户密码 -# @router.post("/modifyPassword") +@router.post("/modifyPassword") +async def modify_password(request: Request): + person_id = await get_request_str_param(request, "person_id", True, True) + old_password = await get_request_str_param(request, "old_password", True, True) + password = await get_request_str_param(request, "password", True, True) + # 校验旧密码是否正确 + select_password_sql: str = "select pwdmd5 from t_sys_loginperson where person_id = '" + person_id + "' and b_use = 1" + userlist = await find_by_sql(select_password_sql, ()) + if len(userlist) == 0: + return {"success": False, "message": "用户不存在"} + else: + if userlist[0]["pwdmd5"] != md5_encrypt(old_password): + return {"success": False, "message": "旧密码错误"} + else: + update_password_sql: str = "update t_sys_loginperson set original_pwd = '" + password + "',pwdmd5 = '" + md5_encrypt(password) + "' where person_id = '" + person_id + "'" + await execute_sql(update_password_sql) + return {"success": True, "message": "修改成功"} + From e25942c28f11c7314a73c6ddc511950c9ec28668 Mon Sep 17 00:00:00 2001 From: "Kalman.CHENG" <123204464@qq.com> Date: Tue, 15 Jul 2025 11:47:24 +0800 Subject: [PATCH 28/46] =?UTF-8?q?=E6=95=99=E8=82=B2=E5=9E=82=E7=9B=B4?= =?UTF-8?q?=E9=A2=86=E5=9F=9F=E5=A4=A7=E6=A8=A1=E5=9E=8B=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=20modify=20by=20Kalman.CHENG=20=E2=98=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dsAiTeachingModel/api/controller/UserController.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsAiTeachingModel/api/controller/UserController.py b/dsAiTeachingModel/api/controller/UserController.py index bff5dd85..2b1d1a3f 100644 --- a/dsAiTeachingModel/api/controller/UserController.py +++ b/dsAiTeachingModel/api/controller/UserController.py @@ -21,7 +21,7 @@ async def modify_telephone(request: Request): # 校验手机号码是否已被注册 select_telephone_sql: str = "select * from t_sys_loginperson where b_use = 1 and telephone = '" + telephone + "' and person_id <> '" + person_id + "'" userlist = await find_by_sql(select_telephone_sql, ()) - if len(userlist) > 0: + if userlist is not None: return {"success": False, "message": "手机号码已被注册"} else: update_telephone_sql: str = "update t_sys_loginperson set telephone = '" + telephone + "' where person_id = '" + person_id + "'" From 870e3540a0150410d04e7fabb85d2226222e629c Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:47:47 +0800 Subject: [PATCH 29/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 99ec2cfa..43845916 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -18,6 +18,19 @@ from Util.PostgreSQLUtil import init_postgres_pool from Util.WxGzhUtil import init_wechat_browser, get_article_content +# 在程序开始时添加以下配置 +logging.basicConfig( + level=logging.INFO, # 设置日志级别为INFO + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +# 或者如果你想更详细地控制日志输出 +logger = logging.getLogger('WeiXinGongZhongHao') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +logger.addHandler(handler) + async def get_wechat_sources(): """从t_wechat_source表获取微信公众号列表""" try: From 933b88853b529a6d6a3e6661889be700eec5877e Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:48:43 +0800 Subject: [PATCH 30/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 43845916..001c2ec3 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -105,12 +105,12 @@ if __name__ == '__main__': # 换算出过期时间 expiry_time = time.localtime(expiry) expiry_date = time.strftime("%Y-%m-%d %H:%M:%S", expiry_time) - print("cookies的过期时间一般是4天,cookies过期时间:", expiry_date) + logger.info("cookies的过期时间一般是4天,cookies过期时间:", expiry_date) # 获取当前时间戳 current_timestamp = time.time() # 检查是否已过期 if current_timestamp > expiry: - print("Cookie已过期") + logger.error("Cookie已过期") exit() # 移除expiry属性 del cookies["expiry"] @@ -136,12 +136,11 @@ if __name__ == '__main__': response = requests.get(url=url, allow_redirects=False, cookies=cookies) if 'Location' in response.headers: redirect_url = response.headers.get("Location") - print("重定向URL:", redirect_url) + logger.info("重定向URL:", redirect_url) token_match = re.findall(r'token=(\d+)', redirect_url) if token_match: token = token_match[0] - print("获取到的token:", token) - logging.info("微信token:" + token) + logger.info("获取到的token:", token) article_urls = [] @@ -207,7 +206,7 @@ if __name__ == '__main__': if '试卷' in article_title: # 过滤掉试卷 continue - print(f"正在处理文章: {article_title} ({publish_time})") + logger.info(f"正在处理文章: {article_title} ({publish_time})") content = get_article_content(article_url) loop = asyncio.new_event_loop() From d41d7c19a47462d053adf49147aeb0b487cf16ad Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:50:05 +0800 Subject: [PATCH 31/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index d958d01d..187f3dbd 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -106,7 +106,7 @@ if __name__ == '__main__': # 换算出过期时间 expiry_time = time.localtime(expiry) expiry_date = time.strftime("%Y-%m-%d %H:%M:%S", expiry_time) - logger.info(f"cookies的过期时间一般是4天,cookies过期时间:%s" % expiry_date) + # 获取当前时间戳 current_timestamp = time.time() # 检查是否已过期 @@ -115,7 +115,7 @@ if __name__ == '__main__': exit() # 移除expiry属性 del cookies["expiry"] - + logger.info(f"cookies的过期时间一般是4天,cookies过期时间:%s" % expiry_date) options = Options() options.add_argument('-headless') # 无头参数,调试时可以注释掉 # 设置headers - 使用微信内置浏览器的User-Agent From b7334a0d07d323c23bdc4978eb5cdeb7f5cbe327 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:51:20 +0800 Subject: [PATCH 32/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 187f3dbd..470abf3b 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -137,11 +137,11 @@ if __name__ == '__main__': response = requests.get(url=url, allow_redirects=False, cookies=cookies) if 'Location' in response.headers: redirect_url = response.headers.get("Location") - logger.info("重定向URL:", redirect_url) + logger.info(f"重定向URL:%s"%redirect_url) token_match = re.findall(r'token=(\d+)', redirect_url) if token_match: token = token_match[0] - logger.info("获取到的token:", token) + logger.info(f"获取到的token:%s"%token) article_urls = [] From 7f3ea89e3c9bc08549d9b4d0a6b252191a831bab Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:53:03 +0800 Subject: [PATCH 33/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index 470abf3b..a287c2f6 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -17,18 +17,15 @@ import requests from Util.PostgreSQLUtil import init_postgres_pool from Util.WxGzhUtil import init_wechat_browser, get_article_content -# 在程序开始时添加以下配置 -logging.basicConfig( - level=logging.INFO, # 设置日志级别为INFO - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) - -# 或者如果你想更详细地控制日志输出 +# 删除重复的日志配置,只保留以下内容 logger = logging.getLogger('WeiXinGongZhongHao') logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) -logger.addHandler(handler) + +# 确保只添加一个handler +if not logger.handlers: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + logger.addHandler(handler) async def get_wechat_sources(): From 5c10a56d315b4bde75df5cbe1a203958accd9f2d Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:53:55 +0800 Subject: [PATCH 34/46] 'commit' --- dsLightRag/WxGzh/T2_GetArticleList.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_GetArticleList.py index a287c2f6..a6e073bb 100644 --- a/dsLightRag/WxGzh/T2_GetArticleList.py +++ b/dsLightRag/WxGzh/T2_GetArticleList.py @@ -75,7 +75,7 @@ async def is_article_exist(pool, article_url): async def save_article_to_db(pool, article_title, account_name, article_url, publish_time, content, id): # 先检查文章是否已存在 if await is_article_exist(pool, article_url): - logging.info(f"文章已存在,跳过保存: {article_url}") + logger.info(f"文章已存在,跳过保存: {article_url}") return try: From 2785f4afb740ee2e73521343ad7ddbb7760a60cd Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:55:40 +0800 Subject: [PATCH 35/46] 'commit' --- dsLightRag/WxGzh/{T1_Login.py => T1_LoginGetCookie.py} | 0 dsLightRag/WxGzh/{T2_GetArticleList.py => T2_CollectArticle.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename dsLightRag/WxGzh/{T1_Login.py => T1_LoginGetCookie.py} (100%) rename dsLightRag/WxGzh/{T2_GetArticleList.py => T2_CollectArticle.py} (100%) diff --git a/dsLightRag/WxGzh/T1_Login.py b/dsLightRag/WxGzh/T1_LoginGetCookie.py similarity index 100% rename from dsLightRag/WxGzh/T1_Login.py rename to dsLightRag/WxGzh/T1_LoginGetCookie.py diff --git a/dsLightRag/WxGzh/T2_GetArticleList.py b/dsLightRag/WxGzh/T2_CollectArticle.py similarity index 100% rename from dsLightRag/WxGzh/T2_GetArticleList.py rename to dsLightRag/WxGzh/T2_CollectArticle.py From 67c1296dc1729aa72e89dbc42e78945c208e5f48 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:55:49 +0800 Subject: [PATCH 36/46] 'commit' --- dsLightRag/WxGzh/article_urls.txt | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 dsLightRag/WxGzh/article_urls.txt diff --git a/dsLightRag/WxGzh/article_urls.txt b/dsLightRag/WxGzh/article_urls.txt deleted file mode 100644 index dc2077eb..00000000 --- a/dsLightRag/WxGzh/article_urls.txt +++ /dev/null @@ -1,22 +0,0 @@ -长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd -独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd -长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd -喜报!长春外国语学校女子篮球队夺得冠军! 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=2&sn=31651043acb6ecbf4232e92e635196b6&chksm=84e1ab1db396220b0810c3bdf332128b110d1902658f2556eaeff67cec084a8a068a5ae9a275#rd -“趣闯盛夏·探无界”!探秘一实验银河小学夏令营 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=3&sn=8edf6ce8cebdaad55343b39639876c27&chksm=84e1ab1db396220b26b172b3b565f919f7ded4c2a5b78227294ea29a558a7666c33b8c1de660#rd -刚刚!2025年长春中考各批次控制线公布! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=1&sn=282e5e824410a9a92a83dd800cb58a7c&chksm=84e1aba6b39622b03fe6422032474c9696f83541d9ff9b8b6a9f0f099ce459da430f720d05e4#rd -重磅消息!师大附属实验学校(经开)校长有新任命! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=2&sn=9449c87935faf86ddcc5a674ea888913&chksm=84e1aba6b39622b03fd8413ff1e74b61f662ec8deb3887c2c5b8e5ad15470b15ae14b21e94ea#rd -市教育局最新发布!长春2025年中考成绩将于7月12日公布! 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=1&sn=a5b4104d2fe74ace32ab31faf5f1c44c&chksm=84e1abb2b39622a40b0e0969e84fb00c753cc8ffefb8624726afa2a7352ea725c7f967bf25f5#rd -长春市第十九中学2025年职称评聘拟通过人员名单的公示!有你认识的老师吗? 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd -高分喜报频传!长春这所小学靠啥成为“学霸制造机”? 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd -蝉联冠军!吉大尚德游泳队斩获骄人成绩! 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd -长春中考上演“神仙打架”!省二力旺等五校过半考生超700分! 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=1&sn=41f941e481be6a7ccd26ad734c8d7a13&chksm=84e1ab0cb396221a59bce5c4ee842c9326968daf4aea1c7d24e55ed8879789c3ef34a7ce5ed1#rd -独家专访赫行学校2025年中考“双黄蛋”!学霸靠啥杀出重围? 2025-07-14 18:36:34 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526302&idx=2&sn=c7733f7c2c6331e51e55af695f99a43e&chksm=84e1ab0cb396221a7d185dcb99acc9dce45cc5c66c3eef42680a215b710bb9bfa9fd10da4419#rd -长春40所学校中考成绩曝光!700+成批涌现!谁是最大黑马? 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=1&sn=1f314640ae6eec236b0e16271bd44362&chksm=84e1ab1db396220b73ae08898a026d887436501a6c42abe01d7fa4aef9063533fad89720d3b8#rd -喜报!长春外国语学校女子篮球队夺得冠军! 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=2&sn=31651043acb6ecbf4232e92e635196b6&chksm=84e1ab1db396220b0810c3bdf332128b110d1902658f2556eaeff67cec084a8a068a5ae9a275#rd -“趣闯盛夏·探无界”!探秘一实验银河小学夏令营 2025-07-13 18:48:27 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526287&idx=3&sn=8edf6ce8cebdaad55343b39639876c27&chksm=84e1ab1db396220b26b172b3b565f919f7ded4c2a5b78227294ea29a558a7666c33b8c1de660#rd -刚刚!2025年长春中考各批次控制线公布! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=1&sn=282e5e824410a9a92a83dd800cb58a7c&chksm=84e1aba6b39622b03fe6422032474c9696f83541d9ff9b8b6a9f0f099ce459da430f720d05e4#rd -重磅消息!师大附属实验学校(经开)校长有新任命! 2025-07-12 10:04:32 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526196&idx=2&sn=9449c87935faf86ddcc5a674ea888913&chksm=84e1aba6b39622b03fd8413ff1e74b61f662ec8deb3887c2c5b8e5ad15470b15ae14b21e94ea#rd -市教育局最新发布!长春2025年中考成绩将于7月12日公布! 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=1&sn=a5b4104d2fe74ace32ab31faf5f1c44c&chksm=84e1abb2b39622a40b0e0969e84fb00c753cc8ffefb8624726afa2a7352ea725c7f967bf25f5#rd -长春市第十九中学2025年职称评聘拟通过人员名单的公示!有你认识的老师吗? 2025-07-11 15:22:13 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526176&idx=2&sn=693d13964e4be18718c0eb4fd13ba68f&chksm=84e1abb2b39622a442a9f7cea8ddc72820050b2896968f2d0ae283c7caca2dbe014a721feb2e#rd -高分喜报频传!长春这所小学靠啥成为“学霸制造机”? 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=1&sn=a0af7f484d6a3300a9b7f3d787a2594d&chksm=84e1ab9cb396228a56420696eb09071ff829d58e8e31bd652f849f3cbd0ee276b0baad7a1e89#rd -蝉联冠军!吉大尚德游泳队斩获骄人成绩! 2025-07-10 19:00:00 http://mp.weixin.qq.com/s?__biz=MzA3MjQ1Mjg2MQ==&mid=2651526158&idx=2&sn=cabb682e99978bf2a58ff0e9e06dc53d&chksm=84e1ab9cb396228a5cd457cd7ee0728491e6b3dc34fbde02240624364cfa8a9e2c533052d2b4#rd From 73033e0333f772bd07e74693307f8cc00aef8fe0 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 11:57:09 +0800 Subject: [PATCH 37/46] 'commit' --- dsLightRag/Start.py | 7 +-- dsLightRag/T1_Train.py | 6 --- dsLightRag/WxGzh/T3_TrainIntoKG.py | 68 ++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 12 deletions(-) create mode 100644 dsLightRag/WxGzh/T3_TrainIntoKG.py diff --git a/dsLightRag/Start.py b/dsLightRag/Start.py index 584315f5..a6b09126 100644 --- a/dsLightRag/Start.py +++ b/dsLightRag/Start.py @@ -17,13 +17,8 @@ from starlette.staticfiles import StaticFiles from Util.LightRagUtil import * from Util.PostgreSQLUtil import init_postgres_pool -# 在程序开始时添加以下配置 -logging.basicConfig( - level=logging.INFO, # 设置日志级别为INFO - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -# 或者如果你想更详细地控制日志输出 +# 想更详细地控制日志输出 logger = logging.getLogger('lightrag') logger.setLevel(logging.INFO) handler = logging.StreamHandler() diff --git a/dsLightRag/T1_Train.py b/dsLightRag/T1_Train.py index 1db08183..88080efa 100644 --- a/dsLightRag/T1_Train.py +++ b/dsLightRag/T1_Train.py @@ -4,12 +4,6 @@ import logging from Util.DocxUtil import get_docx_content_by_pandoc from Util.LightRagUtil import initialize_pg_rag -# 在程序开始时添加以下配置 -logging.basicConfig( - level=logging.INFO, # 设置日志级别为INFO - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) - # 或者如果你想更详细地控制日志输出 logger = logging.getLogger('lightrag') logger.setLevel(logging.INFO) diff --git a/dsLightRag/WxGzh/T3_TrainIntoKG.py b/dsLightRag/WxGzh/T3_TrainIntoKG.py new file mode 100644 index 00000000..c7de823c --- /dev/null +++ b/dsLightRag/WxGzh/T3_TrainIntoKG.py @@ -0,0 +1,68 @@ +import asyncio +import logging + +from Util.DocxUtil import get_docx_content_by_pandoc +from Util.LightRagUtil import initialize_pg_rag + + +logger = logging.getLogger('lightrag') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +logger.addHandler(handler) +logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) + +# 使用PG库后,这个是没有用的,但目前的项目代码要求必传,就写一个吧。 +WORKING_DIR = f"./output" + +#### 下面两个要注意写清楚内容 #### +# 1、工作空间【知识库名称】 +# 2、文档名称【不允许出现重复,因为后面需要以此为条件查询】 +tasks = [ + # { # 苏轼 + # "workspace": "SuShi", "docx_name": "苏轼.docx", + # }, + # { # 化学 + # "workspace": "Chemistry", "docx_name": "Chemistry.docx", + # }, + # { # 几何 + # "workspace": "JiHe", "docx_name": "JiHe.docx", + # }, + # { # 数学 + # "workspace": "Math", "docx_name": "Math.docx", + # }, + # { # 史记 + # "workspace": "ShiJi", "docx_name": "少年读史记张嘉骅.docx", + # }, + # { # 长春市一批次高中学校介绍 + # "workspace": "ChangChun", "docx_name": "长春市一批次高中学校介绍.docx", + # }, + # { # 2024长春43所高中录取分数线 + # "workspace": "ChangChun", "docx_name": "2024长春43所高中录取分数线.docx", + # }, + { # 长春市2025年中考各批次录取最低控制线 + "workspace": "ChangChun", "docx_name": "长春市2025年中考各批次录取最低控制线.docx", + } +] +for task in tasks: + task["docx_path"] = "./static/Txt/" + task["docx_name"] # 3、文档路径 python是按引用传递的& + + +async def main(): + for task in tasks: + workspace = task["workspace"] + docx_name = task["docx_name"] + docx_path = task["docx_path"] + logger.info(f"开始处理文档: {docx_name}" + ",共%s个文档,当前是第%s个。", len(tasks), tasks.index(task) + 1) + try: + rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace) + # 获取docx文件的内容 + content = get_docx_content_by_pandoc(docx_path) + await rag.ainsert(input=content, file_paths=[docx_name]) # 添加来源参数 + finally: + if rag: + await rag.finalize_storages() + + +if __name__ == "__main__": + asyncio.run(main()) From f639b4c9fc23720f155bc9f8e42612110f4a1b4c Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 13:02:12 +0800 Subject: [PATCH 38/46] 'commit' --- .../__pycache__/LightRagUtil.cpython-310.pyc | Bin 4511 -> 4511 bytes dsLightRag/WxGzh/T3_TrainIntoKG.py | 77 ++++++++---------- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc index 9bc1e54dac9dbe973c4131d5b80ceb60cf4263d8..d89e8018a6fc1ad3abe25bb0951aa47b9d96253a 100644 GIT binary patch delta 1003 zcmYL|OK%cU6o8om!<1nL+NOz!u@8bu)x4E9o>n_(D7h$WO7hITq_bhOySei$XejArjybhntdEvi5s3NLKgefLk!45zRc@VyMSJuHI(n!F_0xzM#4KVC5&DN7JSEvD zaT7`(lOpV=D6^S>%x$o2z!FnvUPLT8eH;gF3(FwJIm)xyo>MF#jc^5KNfns#SLx|h zG!ikFO5ZtYkM!nJ39FR1Rl8B|=;T}~g`IU|_EH|W;&g#+6S-P76vHZO)%NiE{>wNS zwZ3nb2>T|7Vfeq~SOMRK_;;I~ha$aYs{{h3?C5CLiQLpm@R%JXiKq?RQVi9EL3EUO zMRm6EQ7|@%1-0Lh$5Rfmiom3F4dDjF-7T?@T}v-!Qc7ZRjVfr!J8$+QxRg-Qr7M$( zbMU2Ml%g~aKhP_zG)(?-$uhU~3dOZ*dCi7V+i!K5HR}8uaN>m7p=oINLui6H3+b-g zlsaqTsp@sRcq@NjFhN5O)#x5cd%(4E4cZz`zz% delta 1003 zcmYL|$xjne7{EJBI~_WmX#q(jgvB%kiixsB5)UpwwXAIc%Nm_b+hIB|OL?6Z8!vJ+ z(S!N!qVdGh#GCQtf&W00C-G?f2RwN5eN(DU^XvPT_pPs|iPMDS%5o&YKG(aCK1iQ< zj>0)V$@qt#@1^oeE~Bo_ZmV-Uj~;B8fxz1Ug|*;IZUBA+A990`3{4LXvD;*rfr|U4 zr8}CfQ_Qi|7-w++&@j1S++hRX<1UQZSvGr#U6v52cn(LS<%Au@<$ed6kyP3Q~)$in_wD zqYD^UAi7s44&_!2%5Zehk5SQ2jw%XnOY7vc(6fUn}Ju!@Q-q&n|%n=mQ&z_ZR# zZV%pf9wcZit9!>TIo#gxT%sBJ8Bb;1F6!m7=@_gco5DmSd98psWrJwdidt)Vl0BwN ziKGLg7oZUtXEEPJh6N23ML3VFaR&5B-oz3*21yY0IYb_@hFC|$|Dg*{1^6uOu+k^x ze)uIRa6hVW705+hX$#Feh+V&ri|B&*n29Nl2r%1q71HuJcL1(zuq}U+pU>b15`$!9XySzCrz%Fs=}3%_l}vTW6M%evKCUeA9SH>1^>PKm;I zrH?KDx02k#xzNaAqdh3vU3N=Mz?5wrDWa3o&`WHXZ6)p3>Yl6Gn!^^+R^k=Se~m}6 z>UOnBJDe)<{|&ft3SRskYxrY)de?uDe9a@opI6W_ sYaR_dWsMN6*{aN7S4ahMfH=emVclt%#C3>;DjVS-TtpRd!j}5vFY3lC1ONa4 diff --git a/dsLightRag/WxGzh/T3_TrainIntoKG.py b/dsLightRag/WxGzh/T3_TrainIntoKG.py index c7de823c..86473413 100644 --- a/dsLightRag/WxGzh/T3_TrainIntoKG.py +++ b/dsLightRag/WxGzh/T3_TrainIntoKG.py @@ -3,7 +3,7 @@ import logging from Util.DocxUtil import get_docx_content_by_pandoc from Util.LightRagUtil import initialize_pg_rag - +from Util.PostgreSQLUtil import init_postgres_pool logger = logging.getLogger('lightrag') logger.setLevel(logging.INFO) @@ -15,54 +15,49 @@ logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) # 使用PG库后,这个是没有用的,但目前的项目代码要求必传,就写一个吧。 WORKING_DIR = f"./output" -#### 下面两个要注意写清楚内容 #### -# 1、工作空间【知识库名称】 -# 2、文档名称【不允许出现重复,因为后面需要以此为条件查询】 -tasks = [ - # { # 苏轼 - # "workspace": "SuShi", "docx_name": "苏轼.docx", - # }, - # { # 化学 - # "workspace": "Chemistry", "docx_name": "Chemistry.docx", - # }, - # { # 几何 - # "workspace": "JiHe", "docx_name": "JiHe.docx", - # }, - # { # 数学 - # "workspace": "Math", "docx_name": "Math.docx", - # }, - # { # 史记 - # "workspace": "ShiJi", "docx_name": "少年读史记张嘉骅.docx", - # }, - # { # 长春市一批次高中学校介绍 - # "workspace": "ChangChun", "docx_name": "长春市一批次高中学校介绍.docx", - # }, - # { # 2024长春43所高中录取分数线 - # "workspace": "ChangChun", "docx_name": "2024长春43所高中录取分数线.docx", - # }, - { # 长春市2025年中考各批次录取最低控制线 - "workspace": "ChangChun", "docx_name": "长春市2025年中考各批次录取最低控制线.docx", - } -] -for task in tasks: - task["docx_path"] = "./static/Txt/" + task["docx_name"] # 3、文档路径 python是按引用传递的& +async def get_unprocessed_articles(): + """从t_wechat_articles表获取未处理的文章""" + try: + pool = await init_postgres_pool() + async with pool.acquire() as conn: + rows = await conn.fetch(''' + SELECT id, source, title, content + FROM t_wechat_articles + WHERE is_finish = 0 + ''') + return [dict(row) for row in rows] + finally: + await pool.close() async def main(): - for task in tasks: - workspace = task["workspace"] - docx_name = task["docx_name"] - docx_path = task["docx_path"] - logger.info(f"开始处理文档: {docx_name}" + ",共%s个文档,当前是第%s个。", len(tasks), tasks.index(task) + 1) + # 获取未处理的文章 + articles = await get_unprocessed_articles() + logger.info(f"共获取到{len(articles)}篇未处理的文章") + + for article in articles: + workspace = 'ChangChun' + docx_name = f"{article['source']}_{article['title']}" # 组合来源和标题作为文档名 + content = article["content"] # 使用文章内容 + + logger.info(f"开始处理文档: {docx_name}") try: rag = await initialize_pg_rag(WORKING_DIR=WORKING_DIR, workspace=workspace) - # 获取docx文件的内容 - content = get_docx_content_by_pandoc(docx_path) - await rag.ainsert(input=content, file_paths=[docx_name]) # 添加来源参数 + await rag.ainsert(input=content, file_paths=[docx_name]) + + # 标记为已处理 + pool = await init_postgres_pool() + async with pool.acquire() as conn: + await conn.execute(''' + UPDATE t_wechat_articles + SET is_finish = 1 + WHERE id = $1 + ''', article["id"]) finally: if rag: await rag.finalize_storages() - + if pool: + await pool.close() if __name__ == "__main__": asyncio.run(main()) From bd7399b14727276673cdacd181a63c5a9775f0c9 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 13:05:20 +0800 Subject: [PATCH 39/46] 'commit' --- .../postgres_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dsLightRag/Doc/9、Postgresql支持工作空间的代码修改/postgres_impl.py b/dsLightRag/Doc/9、Postgresql支持工作空间的代码修改/postgres_impl.py index f02ad79f..d18c6cc0 100644 --- a/dsLightRag/Doc/9、Postgresql支持工作空间的代码修改/postgres_impl.py +++ b/dsLightRag/Doc/9、Postgresql支持工作空间的代码修改/postgres_impl.py @@ -965,8 +965,8 @@ class PGDocStatusStorage(DocStatusStorage): else: exist_keys = [] new_keys = set([s for s in keys if s not in exist_keys]) - print(f"keys: {keys}") - print(f"new_keys: {new_keys}") + #print(f"keys: {keys}") + #print(f"new_keys: {new_keys}") return new_keys except Exception as e: logger.error( From db7f327f7b6217f7aa6a8960ac2208a331bb9137 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 13:29:17 +0800 Subject: [PATCH 40/46] 'commit' --- dsLightRag/static/ChangChun.html | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dsLightRag/static/ChangChun.html b/dsLightRag/static/ChangChun.html index ac938d40..3a3b6666 100644 --- a/dsLightRag/static/ChangChun.html +++ b/dsLightRag/static/ChangChun.html @@ -200,13 +200,19 @@
2025年各批次最低分数线是多少?
-
+
介绍一下师大自由校区?
今年中考成绩690分,能上哪个高中呢?
+
+ 通达小学介绍 +
+
+ 师大附属实验学校的马校长 +
From 564a76f3f237091c6a6dcaf56173de8f3be9a90d Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Tue, 15 Jul 2025 13:31:00 +0800 Subject: [PATCH 41/46] 'commit' --- dsLightRag/static/ChangChun.html | 4 ++-- dsLightRag/static/ai.html | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dsLightRag/static/ChangChun.html b/dsLightRag/static/ChangChun.html index 3a3b6666..7510be6f 100644 --- a/dsLightRag/static/ChangChun.html +++ b/dsLightRag/static/ChangChun.html @@ -3,7 +3,7 @@ - 【长春市中考报考知识库】 + 【长春市教育信息资讯库】