From 68231fd5f8266f772d1df9785dc74ab36899b9f9 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 23 Jun 2025 20:11:10 +0800 Subject: [PATCH] 'commit' --- dsRag/Config/Config.py | 4 ++ dsRag/T7_Query.py | 68 ++++++++++++++++++++++ dsRag/T8_RAG.py | 100 ++++++++++++++++++++++++++++++++ dsRag/运行结果.txt | 129 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 301 insertions(+) create mode 100644 dsRag/T7_Query.py create mode 100644 dsRag/T8_RAG.py create mode 100644 dsRag/运行结果.txt diff --git a/dsRag/Config/Config.py b/dsRag/Config/Config.py index 7c522ab4..79a46e78 100644 --- a/dsRag/Config/Config.py +++ b/dsRag/Config/Config.py @@ -7,4 +7,8 @@ ES_CONFIG = { "default_index": "knowledge_base" } +# 词向量模型路径 WORD2VEC_MODEL_PATH = r"D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt" + +# DeepSeek API KEY +DEEPSEEK_API_KEY = 'sk-44ae895eeb614aa1a9c6460579e322f1' diff --git a/dsRag/T7_Query.py b/dsRag/T7_Query.py new file mode 100644 index 00000000..f2521eca --- /dev/null +++ b/dsRag/T7_Query.py @@ -0,0 +1,68 @@ +from elasticsearch import Elasticsearch +from Util.EmbeddingUtil import text_to_embedding +import random +import Config.Config as config + +# 初始化ES连接 +es = Elasticsearch( + hosts=config.ES_CONFIG['hosts'], + basic_auth=config.ES_CONFIG['basic_auth'], + verify_certs=config.ES_CONFIG['verify_certs'] +) + +def vector_search(text): + """向量相似度搜索""" + vector = text_to_embedding(text) + script_query = { + "script_score": { + "query": {"match_all": {}}, + "script": { + "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0", + "params": {"query_vector": vector} + } + } + } + response = es.search( + index='knowledge_base', + body={ + "size": 5, + "query": script_query, + "_source": ["text"] + } + ) + return [hit['_source']['text'] for hit in response['hits']['hits']] + +def text_search(text): + """文本精确搜索""" + response = es.search( + index='raw_texts', + body={ + "query": { + "match_phrase": { + "text": text + } + } + } + ) + return [hit['_source']['text'] for hit in response['hits']['hits']] + +def test_queries(file_path): + """从文本文件中随机选取5个句子进行测试""" + with open(file_path, 'r', encoding='utf-8') as f: + sentences = [line.strip() for line in f if line.strip()] + + test_samples = random.sample(sentences, min(5, len(sentences))) + + for sample in test_samples: + print(f"测试句子: {sample}") + print("向量搜索结果:") + for result in vector_search(sample): + print(f"- {result}") + + print("\n文本精确搜索结果:") + for result in text_search(sample): + print(f"- {result}") + print("="*50) + +if __name__ == "__main__": + test_queries("人口变化趋势对云南教育的影响.txt") \ No newline at end of file diff --git a/dsRag/T8_RAG.py b/dsRag/T8_RAG.py new file mode 100644 index 00000000..5b2b9b93 --- /dev/null +++ b/dsRag/T8_RAG.py @@ -0,0 +1,100 @@ +''' +pip install openai +''' +from elasticsearch import Elasticsearch +from Util.EmbeddingUtil import text_to_embedding +import Config.Config as config +from openai import OpenAI +import json + +# 初始化ES连接 +es = Elasticsearch( + hosts=config.ES_CONFIG['hosts'], + basic_auth=config.ES_CONFIG['basic_auth'], + verify_certs=config.ES_CONFIG['verify_certs'] +) + +# 初始化DeepSeek客户端 +client = OpenAI(api_key=config.DEEPSEEK_API_KEY) + +def search_related_data(query): + """搜索向量数据和原始相关数据""" + # 向量搜索 + vector = text_to_embedding(query) + vector_results = es.search( + index='knowledge_base', + body={ + "size": 5, + "query": { + "script_score": { + "query": {"match_all": {}}, + "script": { + "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0", + "params": {"query_vector": vector} + } + } + }, + "_source": ["text"] + } + ) + + # 文本精确搜索 + text_results = es.search( + index='raw_texts', + body={ + "query": { + "multi_match": { + "query": query, + "fields": ["text"], + "type": "best_fields" + } + }, + "size": 5 + } + ) + + # 合并结果 + all_results = [hit['_source']['text'] for hit in vector_results['hits']['hits']] + all_results.extend([hit['_source']['text'] for hit in text_results['hits']['hits']]) + + return "\n\n".join(all_results) + +def generate_report(query, context): + """使用DeepSeek生成报告""" + prompt = f"""根据以下关于'{query}'的相关信息,整理一份结构化的报告: +要求: +1. 分章节组织内容 +2. 包含关键数据和事实 +3. 语言简洁专业 + +相关信息: +{context}""" + + response = client.chat.completions.create( + model="deepseek-chat", + messages=[ + {"role": "system", "content": "你是一个专业的文档整理助手"}, + {"role": "user", "content": prompt} + ], + temperature=0.3 + ) + + return response.choices[0].message.content + +def process_query(query): + """处理用户查询并生成报告""" + print(f"正在搜索与'{query}'相关的数据...") + context = search_related_data(query) + print(f"找到{len(context.split('\n\n'))}条相关数据") + + print("正在生成报告...") + report = generate_report(query, context) + + return report + +if __name__ == "__main__": + #user_query = input("请输入您的查询要求:") + user_query = "整理云南省初中在校生情况文档" + report = process_query(user_query) + print("\n=== 生成的报告 ===\n") + print(report) \ No newline at end of file diff --git a/dsRag/运行结果.txt b/dsRag/运行结果.txt new file mode 100644 index 00000000..260ea4a6 --- /dev/null +++ b/dsRag/运行结果.txt @@ -0,0 +1,129 @@ +D:\anaconda3\envs\rag\python.exe D:\dsWork\dsProject\dsRag\T7_Query.py +D:\anaconda3\envs\rag\lib\site-packages\jieba\_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + import pkg_resources +2025-06-23 20:03:12,025 - INFO - loading projection weights from D:\Tencent_AILab_ChineseEmbedding\Tencent_AILab_ChineseEmbedding.txt +2025-06-23 20:03:13,349 - INFO - KeyedVectors lifecycle event {'msg': 'loaded (10000, 200) matrix of type float32 from D:\\Tencent_AILab_ChineseEmbedding\\Tencent_AILab_ChineseEmbedding.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-06-23T20:03:13.329238', 'gensim': '4.3.3', 'python': '3.10.18 | packaged by conda-forge | (main, Jun 4 2025, 14:42:04) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'load_word2vec_format'} +2025-06-23 20:03:13,349 - INFO - 模型加载成功,词向量维度: 200 +D:\anaconda3\envs\rag\lib\site-packages\elasticsearch\_sync\client\__init__.py:311: SecurityWarning: Connecting to 'https://10.10.14.206:9200' using TLS with verify_certs=False is insecure + _transport = transport_class( +Building prefix dict from the default dictionary ... +2025-06-23 20:03:13,622 - DEBUG - Building prefix dict from the default dictionary ... +Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache +2025-06-23 20:03:13,622 - DEBUG - Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache +测试句子: (万人) 教职工数需求 +向量搜索结果: +Loading model cost 0.655 seconds. +2025-06-23 20:03:14,277 - DEBUG - Loading model cost 0.655 seconds. +Prefix dict has been built successfully. +2025-06-23 20:03:14,278 - DEBUG - Prefix dict has been built successfully. +2025-06-23 20:03:14,278 - INFO - 文本: (万人) 教职工数需求, 分词结果: ['(', '万人', ')', '\t', '教职工', '数', '需求'] +2025-06-23 20:03:14,278 - INFO - 有效词向量数量: 5 +2025-06-23 20:03:14,278 - INFO - 生成的平均向量: [-0.0246012 -0.1393744 -0.0812634 0.22129479 -0.0203214 ]... +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +2025-06-23 20:03:14,384 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.104s] +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +- (万人) 教职工数需求 +- (万人) 教职工数需求 +- (万人) 专任教师数需求 +- (万人) 教职工数需求 +- (万人) 教职工需求 + +文本精确搜索结果: +2025-06-23 20:03:14,419 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.033s] +2025-06-23 20:03:14,420 - INFO - 文本: 表 6 高等教育规模及资源配置需求预测统计表, 分词结果: ['表', ' ', '6', ' ', '高等教育', '规模', '及', '资源配置', '需求预测', '统计表'] +2025-06-23 20:03:14,420 - INFO - 有效词向量数量: 4 +2025-06-23 20:03:14,421 - INFO - 生成的平均向量: [ 0.18066275 -0.12073625 -0.12693274 0.219709 0.00525575]... +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +- (万人) 教职工数需求 +- (万人) 教职工数需求 +- (万人) 教职工数需求 +================================================== +测试句子: 表 6 高等教育规模及资源配置需求预测统计表 +向量搜索结果: +- 表 6 高等教育规模及资源配置需求预测统计表 +- 表 1 学前教育规模及资源配置需求预测统计表 +- 表 5 中等职业教育规模及资源配置需求预测统计表 +- 表 4 普通高中教育规模及资源配置需求预测统计表 +- 表 3 初中教育规模及资源配置需求预测统计表 + +文本精确搜索结果: +- 表 6 高等教育规模及资源配置需求预测统计表 +================================================== +测试句子: 指标 +向量搜索结果: +2025-06-23 20:03:14,431 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.008s] +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +2025-06-23 20:03:14,440 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.008s] +2025-06-23 20:03:14,440 - INFO - 文本: 指标, 分词结果: ['指标'] +2025-06-23 20:03:14,441 - INFO - 有效词向量数量: 1 +2025-06-23 20:03:14,441 - INFO - 生成的平均向量: [-0.24731 -0.30286 -0.223835 0.217864 -0.162194]... +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +2025-06-23 20:03:14,450 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.007s] +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +2025-06-23 20:03:14,456 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.005s] +2025-06-23 20:03:14,457 - INFO - 文本: 二是保障随迁子女受教育权利。健全以居住证为主要依据的农业转移人口随迁子女入学保障政策,以公办学校为主将随迁子女纳入流入地义务教育保障范围,保障享受基本公共教育服务。, 分词结果: ['二是', '保障', '随迁', '子女', '受', '教育', '权利', '。', '健全', '以', '居住证', '为', '主要', '依据', '的', '农业', '转移', '人口', '随迁', '子女', '入学', '保障', '政策', ',', '以', '公办', '学校', '为主', '将', '随迁', '子女', '纳入', '流入地', '义务教育', '保障', '范围', ',', '保障', '享受', '基本', '公共', '教育', '服务', '。'] +2025-06-23 20:03:14,457 - INFO - 有效词向量数量: 32 +2025-06-23 20:03:14,458 - INFO - 生成的平均向量: [ 0.18174155 -0.13902284 -0.13658895 0.17560197 0.04838478]... +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +- 指标 +- 指标 +- 指标 +- 指标 +- 指标 + +文本精确搜索结果: +- 指标 +- 指标 +- 指标 +- 指标 +- 指标 +- 指标 +- 2001-2022年云南省人口指标统计表 +- 各学段数据及人口指标统计表 +- 1.2001-2022年云南省人口指标统计表 33 +- 2.各学段数据及人口指标统计表 34 +================================================== +测试句子: 二是保障随迁子女受教育权利。健全以居住证为主要依据的农业转移人口随迁子女入学保障政策,以公办学校为主将随迁子女纳入流入地义务教育保障范围,保障享受基本公共教育服务。 +向量搜索结果: +- 健全以居住证为主要依据的农业转移人口随迁子女入学保障政策,以公办学校为主将随迁子女纳入流入地义务教育保障范围,保障享受基本公共教育服务 +- 在学校布局、入学政策等方面守牢底线,既适应城乡人口变化趋势,又能提供公平的受教育机会,保障不同群体受教育权利 +- 健全随迁子女入学保障政策,保障各类特殊群体受教育权利 +- 健全随迁子女入学保障政策,保障各类特殊群体受教育权利 +- 健全随迁子女入学保障政策,保障各类特殊群体受教育权利 + +文本精确搜索结果: +2025-06-23 20:03:14,464 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.005s] +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +2025-06-23 20:03:14,475 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.010s] +2025-06-23 20:03:14,475 - INFO - 文本: (一)2012—2022年全省人口总体呈现“长增-突降”态势, 分词结果: ['(', '一', ')', '2012', '—', '2022', '年', '全省', '人口', '总体', '呈现', '“', '长增', '-', '突降', '”', '态势'] +2025-06-23 20:03:14,476 - INFO - 有效词向量数量: 14 +2025-06-23 20:03:14,476 - INFO - 生成的平均向量: [ 0.06827115 -0.212308 -0.07557671 0.21362822 0.12347364]... +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +2025-06-23 20:03:14,482 - INFO - POST https://10.10.14.206:9200/knowledge_base/_search [status:200 duration:0.005s] +D:\anaconda3\envs\rag\lib\site-packages\urllib3\connectionpool.py:1097: InsecureRequestWarning: Unverified HTTPS request is being made to host '10.10.14.206'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings + warnings.warn( +================================================== +测试句子: (一)2012—2022年全省人口总体呈现“长增-突降”态势 +向量搜索结果: +- (一)2012—2022年全省人口总体呈现“长增-突降”态势 +- (一)2012—2022年全省人口总体呈现“长增-突降”态势 2 +- 总的看,2012—2022年全省人口变化呈现先增、突降的态势(图1) +- 结合全国趋势看,预计我省2023—2035年新生人口数持续保持在较低水平(图2),从2024年起,全省人口总数呈缓慢下降趋势,与全国范围人口下降趋势基本同步(图1) +- 按照全省普通高中师生比标准(1:12.5)测算,2023—2035年全省普通高中教师需求量逐年增大,2033年达到峰值(10.31万人),与2022年实际专任教师数7.64万人比,存在2.67万人的缺口(图20) + +文本精确搜索结果: +- (一)2012—2022年全省人口总体呈现“长增-突降”态势 +- (一)2012—2022年全省人口总体呈现“长增-突降”态势 2 +================================================== +2025-06-23 20:03:14,488 - INFO - POST https://10.10.14.206:9200/raw_texts/_search [status:200 duration:0.005s] + +进程已结束,退出代码为 0