From 6441d596cf8637f579c6d0dd0b8e9a02f978360b Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Thu, 26 Jun 2025 10:09:55 +0800 Subject: [PATCH] 'commit' --- .../Config/__pycache__/Config.cpython-310.pyc | Bin 751 -> 751 bytes .../__pycache__/__init__.cpython-310.pyc | Bin 137 -> 137 bytes dsRag/Test/T12.py | 47 ++++++++++++++++++ dsRag/Test/T13.py | 26 ++++++++++ dsRag/Test/T14_DownloadHuggingFace.py | 27 ++++++++++ dsRag/Tools/T4_DeepSeekRag.py | 2 + .../__pycache__/EmbeddingUtil.cpython-310.pyc | Bin 1100 -> 1100 bytes .../__pycache__/SplitDocxUtil.cpython-310.pyc | Bin 1586 -> 1586 bytes .../Util/__pycache__/__init__.cpython-310.pyc | Bin 135 -> 135 bytes 9 files changed, 102 insertions(+) create mode 100644 dsRag/Test/T12.py create mode 100644 dsRag/Test/T13.py create mode 100644 dsRag/Test/T14_DownloadHuggingFace.py diff --git a/dsRag/Config/__pycache__/Config.cpython-310.pyc b/dsRag/Config/__pycache__/Config.cpython-310.pyc index d1978751c15144394676870ba39f8891be947818..765cd6e0f0dcd50d45ebd0d5844f8758ebde9c7e 100644 GIT binary patch delta 123 zcmaFQ`ks|LpO=@50SKNm%ajBF$yF-d delta 123 zcmaFQ`ks|LpO=@50SMe)Mo;9PXA;GiotIyplbVvA8lRL{oT|xsi!~#^xTH9W3&_n( zj!!Ht$%x`9OD)PwtBg-hEdt7HyvxbR7&ZAhW0T4)USEF~SD$#t0MB@D*T`F35U!u2 Suj@*NA~v8sMQoFMnUVk-!YK*> diff --git a/dsRag/Config/__pycache__/__init__.cpython-310.pyc b/dsRag/Config/__pycache__/__init__.cpython-310.pyc index 2278514cd50c4f589d57ed9e603ceb02eedfb9c9..b3f99ca71b3b92c5c36e61e071899a9af29e37da 100644 GIT binary patch delta 18 YcmeBV>}2H1=jG*M0D=QgCvw#T03*BvaR2}S delta 18 XcmeBV>}2H1=jG*M0D|>K6S?XEAe#fN diff --git a/dsRag/Test/T12.py b/dsRag/Test/T12.py new file mode 100644 index 00000000..bfbae62b --- /dev/null +++ b/dsRag/Test/T12.py @@ -0,0 +1,47 @@ +""" +pip install faiss-cpu +""" +import re +import faiss +from transformers import AutoTokenizer, AutoModel +from transformers import LlamaForCausalLM +from Util.SplitDocxUtil import SplitDocxUtil + +# 加载预训练模型和分词器 +tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') +#model = AutoModel.from_pretrained('bert-base-uncased') + +model = LlamaForCausalLM.from_pretrained("./path/to/local/directory", local_files_only=True) + +def split_into_blocks(text): + """使用正则表达式匹配问题和话题的标题及内容""" + pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)' + blocks = re.findall(pattern, text, re.DOTALL) + return [(i+1, title + content) for i, (title, content) in enumerate(blocks)] + +def encode_text(text): + inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length') + outputs = model(**inputs) + embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy() + return embeddings + +# 创建向量数据库 +dimension = 768 # BERT模型的维度 +index = faiss.IndexFlatL2(dimension) + +# 源文件 +input_file = '../Txt/小学数学(史校长).docx' +output_dir = '../Txt/processed_chunks' + +"""处理文档主函数""" +text = SplitDocxUtil.read_docx(input_file) + +blocks=split_into_blocks(text) + +# 将块编码并添加到向量数据库 +for block in blocks: + embedding = encode_text(block) + index.add(embedding) + +# 保存向量数据库 +#faiss.write_index(index, 'math_blocks.index') \ No newline at end of file diff --git a/dsRag/Test/T13.py b/dsRag/Test/T13.py new file mode 100644 index 00000000..e501d919 --- /dev/null +++ b/dsRag/Test/T13.py @@ -0,0 +1,26 @@ +import re +import faiss +from transformers import AutoTokenizer, AutoModel +# 加载预训练模型和分词器 +tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') +model = AutoModel.from_pretrained('bert-base-uncased') + +def encode_text(text): + inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length') + outputs = model(**inputs) + embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy() + return embeddings + +def search_similar_blocks(query, index, k=5): + query_embedding = encode_text(query) + D, I = index.search(query_embedding, k) + return I[0] + +# 示例:查询 +query = "小学数学中有哪些模型" +index = faiss.read_index('math_blocks.index') +similar_indices = search_similar_blocks(query, index) + +# 打印最相似的块 +for idx in similar_indices: + print(f"Similar Block {idx + 1}:\n{blocks[idx]}\n") \ No newline at end of file diff --git a/dsRag/Test/T14_DownloadHuggingFace.py b/dsRag/Test/T14_DownloadHuggingFace.py new file mode 100644 index 00000000..9a681637 --- /dev/null +++ b/dsRag/Test/T14_DownloadHuggingFace.py @@ -0,0 +1,27 @@ +""" +pip install huggingface_hub +pip install pysocks +pip install hf_xet +开VPN后,使用Python下载模型 +""" +import os +from transformers import AutoModel, AutoTokenizer + +# 设置环境变量 +os.environ['HTTP_PROXY'] = 'socks5://127.0.0.1:1080' +os.environ['HTTPS_PROXY'] = 'socks5://127.0.0.1:1080' + +# 配置代理 +proxies = { + 'http': 'socks5://127.0.0.1:1080', + 'https': 'socks5://127.0.0.1:1080' +} + +# 加载模型 +model_id = "google-bert/bert-base-uncased" +model = AutoModel.from_pretrained(model_id, proxies=proxies) +tokenizer = AutoTokenizer.from_pretrained(model_id, proxies=proxies) + +# 保存模型到本地 +model.save_pretrained("d:/Model/google-bert/bert-base-uncased") +tokenizer.save_pretrained("d:/Model/google-bert/bert-base-uncased") diff --git a/dsRag/Tools/T4_DeepSeekRag.py b/dsRag/Tools/T4_DeepSeekRag.py index b7372087..c47c3df6 100644 --- a/dsRag/Tools/T4_DeepSeekRag.py +++ b/dsRag/Tools/T4_DeepSeekRag.py @@ -100,9 +100,11 @@ def search_related_data(query): context = "" for hit in vector_results['hits']['hits']: context += f"向量相似度结果(score={hit['_score']}):\n{hit['_source']['text']}\n\n" + print(context) for hit in text_results['hits']['hits']: context += f"文本精确匹配结果(score={hit['_score']}):\n{hit['_source']['text']}\n\n" + print(context) return context diff --git a/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc index 038ba6a30edebf7c5acd7c47e9c01e8489627849..cd5032e9dc84b774ed747b451d8cd98952191ed7 100644 GIT binary patch delta 19 ZcmX@ZafX8{pO=@50SFE}-N@y{0su0R1nd9+ delta 19 ZcmX@ZafX8{pO=@50SI)1HgY+!001mN1I7RV diff --git a/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc b/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc index ddcc5089f3c26ac3c72d665bd12ec138fef0cde4..90e11e184e4b38782c92321ca1a5324e4a4e0907 100644 GIT binary patch delta 19 ZcmdnQvx$c*pO=@50SKN<-^iuG1^_OR1bzSj delta 19 ZcmdnQvx$c*pO=@50SF3uHgajO0RSta1OEU3 diff --git a/dsRag/Util/__pycache__/__init__.cpython-310.pyc b/dsRag/Util/__pycache__/__init__.cpython-310.pyc index 2089b6b502a2908d4f89653139dba6f8d101300d..79767003c9ac32bf4142003b854ef98e2eacfc07 100644 GIT binary patch delta 18 XcmZo?Y-i-k=jG*M0D@rdiCnb+9qj`o delta 18 XcmZo?Y-i-k=jG*M0D=$N6S-;uA$|kl