"""
dsRag/Test/T12.py — build a FAISS vector index over question/topic blocks
extracted from a .docx document.

pip install faiss-cpu
"""
import re

import faiss
from transformers import AutoTokenizer, AutoModel
from Util.SplitDocxUtil import SplitDocxUtil

# Load the pretrained encoder and its matching tokenizer.
# NOTE(fix): the original loaded LlamaForCausalLM from a placeholder path
# ("./path/to/local/directory") while keeping the BERT tokenizer, a 768-dim
# index, and `outputs.last_hidden_state` mean-pooling — all of which assume a
# BERT-style encoder. Restore AutoModel so tokenizer, hidden size, and output
# attributes are mutually consistent.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')


def split_into_blocks(text):
    """Split *text* into numbered ``(id, block_text)`` pairs.

    A block starts at a heading matching 问题N ("question N") or 话题N
    ("topic N") and runs until the next heading or the end of the text.
    """
    pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
    blocks = re.findall(pattern, text, re.DOTALL)
    return [(i + 1, title + content) for i, (title, content) in enumerate(blocks)]


def encode_text(text):
    """Encode *text* into a (1, 768) mean-pooled BERT embedding (numpy)."""
    inputs = tokenizer(text, return_tensors='pt', max_length=512,
                       truncation=True, padding='max_length')
    outputs = model(**inputs)
    # Mean-pool the last hidden state over the sequence dimension.
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings


# Vector store: flat L2 index over 768-dim BERT embeddings.
dimension = 768  # hidden size of bert-base-uncased
index = faiss.IndexFlatL2(dimension)

# Source document.
input_file = '../Txt/小学数学(史校长).docx'
output_dir = '../Txt/processed_chunks'

# Main flow: read the document, split it into blocks, embed each block,
# and add the embeddings to the index.
text = SplitDocxUtil.read_docx(input_file)

blocks = split_into_blocks(text)

for block in blocks:
    # NOTE(fix): each block is an (id, text) tuple — encode the text member,
    # not the tuple itself (the original passed the whole tuple).
    embedding = encode_text(block[1])
    index.add(embedding)

# Persist the vector index.
# NOTE(fix): this write was commented out in the original, yet T13.py below
# reads 'math_blocks.index' — the index must actually be persisted.
faiss.write_index(index, 'math_blocks.index')


# --- dsRag/Test/T13.py — query the FAISS index built by T12.py ---
import re
import faiss
from transformers import AutoTokenizer, AutoModel

# Load the same encoder/tokenizer pair that was used to build the index.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')


def encode_text(text):
    """Encode *text* into a (1, 768) mean-pooled BERT embedding (numpy)."""
    inputs = tokenizer(text, return_tensors='pt', max_length=512,
                       truncation=True, padding='max_length')
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()


def search_similar_blocks(query, index, k=5):
    """Return the indices of the *k* nearest blocks to *query* in *index*."""
    query_embedding = encode_text(query)
    D, I = index.search(query_embedding, k)
    return I[0]


# Example query.
query = "小学数学中有哪些模型"
index = faiss.read_index('math_blocks.index')
similar_indices = search_similar_blocks(query, index)

# NOTE(fix): the original printed `blocks[idx]`, but `blocks` is only defined
# in T12.py — a guaranteed NameError here. Print the matching indices instead;
# showing the block texts would require persisting them alongside the index.
for idx in similar_indices:
    print(f"Similar Block {idx + 1}")


# --- dsRag/Test/T14_DownloadHuggingFace.py — download a model via proxy ---
"""
pip install huggingface_hub
pip install pysocks
pip install hf_xet
Download the model through a local SOCKS5 proxy (VPN must be up).
"""
import os
from transformers import AutoModel, AutoTokenizer

# Route HTTP(S) traffic through the local SOCKS5 proxy.
os.environ['HTTP_PROXY'] = 'socks5://127.0.0.1:1080'
os.environ['HTTPS_PROXY'] = 'socks5://127.0.0.1:1080'

# Same proxy, passed explicitly to the HuggingFace downloaders.
proxies = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080',
}

# Download the model and tokenizer.
model_id = "google-bert/bert-base-uncased"
model = AutoModel.from_pretrained(model_id, proxies=proxies)
tokenizer = AutoTokenizer.from_pretrained(model_id, proxies=proxies)

# Save the model locally; the tokenizer save follows in the next chunk.
model.save_pretrained("d:/Model/google-bert/bert-base-uncased")
+tokenizer.save_pretrained("d:/Model/google-bert/bert-base-uncased") diff --git a/dsRag/Tools/T4_DeepSeekRag.py b/dsRag/Tools/T4_DeepSeekRag.py index b7372087..c47c3df6 100644 --- a/dsRag/Tools/T4_DeepSeekRag.py +++ b/dsRag/Tools/T4_DeepSeekRag.py @@ -100,9 +100,11 @@ def search_related_data(query): context = "" for hit in vector_results['hits']['hits']: context += f"向量相似度结果(score={hit['_score']}):\n{hit['_source']['text']}\n\n" + print(context) for hit in text_results['hits']['hits']: context += f"文本精确匹配结果(score={hit['_score']}):\n{hit['_source']['text']}\n\n" + print(context) return context diff --git a/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc index 038ba6a3..cd5032e9 100644 Binary files a/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc and b/dsRag/Util/__pycache__/EmbeddingUtil.cpython-310.pyc differ diff --git a/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc b/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc index ddcc5089..90e11e18 100644 Binary files a/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc and b/dsRag/Util/__pycache__/SplitDocxUtil.cpython-310.pyc differ diff --git a/dsRag/Util/__pycache__/__init__.cpython-310.pyc b/dsRag/Util/__pycache__/__init__.cpython-310.pyc index 2089b6b5..79767003 100644 Binary files a/dsRag/Util/__pycache__/__init__.cpython-310.pyc and b/dsRag/Util/__pycache__/__init__.cpython-310.pyc differ