parent
2fbffe2fd4
commit
04955cfb56
Binary file not shown.
Binary file not shown.
@ -1,48 +0,0 @@
|
||||
"""
|
||||
pip install pymilvus gensim
|
||||
"""
|
||||
|
||||
from pymilvus import FieldSchema, DataType, utility
|
||||
|
||||
from Config.Config import MS_HOST, MS_PORT, MS_MAX_CONNECTIONS, MS_COLLECTION_NAME, MS_DIMENSION
|
||||
from Backup.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
|
||||
|
||||
# 1. Manage Milvus connections through a connection pool.
# NOTE(review): MilvusConnectionPool is not among the names explicitly imported
# by this script — confirm which module is supposed to provide it.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
connection = None

try:
    # 2. Borrow one connection from the pool.
    connection = milvus_pool.get_connection()

    # 3. Set up the collection manager.
    collection_name = MS_COLLECTION_NAME
    collection_manager = MilvusCollectionManager(collection_name)

    # 4. Drop the collection first if it already exists, so it is rebuilt from scratch.
    if utility.has_collection(collection_name):
        print(f"集合 '{collection_name}' 已存在,正在删除...")
        utility.drop_collection(collection_name)
        print(f"集合 '{collection_name}' 已删除。")

    # 5. Define the fields and schema of the collection.
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        # JSON type so that a single record can carry several tags.
        FieldSchema(name="tags", dtype=DataType.JSON),
        FieldSchema(name="user_input", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="timestamp", dtype=DataType.VARCHAR, max_length=32),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=MS_DIMENSION),
    ]
    schema_description = "Chat records collection with tags , user_input, and timestamp"

    # 6. Create the collection.
    print(f"正在创建集合 '{collection_name}'...")
    collection_manager.create_collection(fields, schema_description)
    print(f"集合 '{collection_name}' 创建成功。")
finally:
    # 7./8. Always hand the connection back and shut the pool down, even when
    # one of the Milvus calls above raised.
    if connection is not None:
        milvus_pool.release_connection(connection)
    milvus_pool.close()
|
||||
|
||||
|
||||
|
@ -1,27 +0,0 @@
|
||||
from Backup.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
|
||||
from Config.Config import *
|
||||
|
||||
# 1. Manage Milvus connections through a connection pool.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
connection = None

try:
    # 2. Borrow one connection from the pool.
    connection = milvus_pool.get_connection()

    # 3. Set up the collection manager.
    collection_name = MS_COLLECTION_NAME
    collection_manager = MilvusCollectionManager(collection_name)

    # 4. Build the index on the vector field.
    index_params = {
        "index_type": "IVF_FLAT",      # IVF_FLAT index type
        "metric_type": "L2",           # L2 distance metric
        "params": {"nlist": 128},      # nlist parameter of IVF_FLAT
    }
    collection_manager.create_index("embedding", index_params)
    print(f"集合 '{collection_name}' 的 'embedding' 字段索引创建成功。")
finally:
    # 5./6. Always hand the connection back and shut the pool down, even when
    # index creation raised.
    if connection is not None:
        milvus_pool.release_connection(connection)
    milvus_pool.close()
|
@ -1,71 +0,0 @@
|
||||
import os
|
||||
|
||||
from Util.SplitDocxUtil import SplitDocxUtil
|
||||
|
||||
|
||||
def split_into_blocks(text):
    """Split ``text`` into numbered blocks, one per '问题X' / '话题X' heading.

    A heading is a line that starts with '问题' or '话题' and has a digit
    within its first five characters. Everything up to and including the
    first space of a heading is dropped; a heading without any space
    contributes no text of its own. Lines before the first heading and empty
    lines are ignored.

    Returns a list of ``(index, block_text)`` tuples with 1-based indices,
    where the lines of a block are joined with newlines.
    """
    finished = []
    pending = []
    started = False

    for raw in text.splitlines():
        is_heading = raw.startswith(('问题', '话题')) and any(ch.isdigit() for ch in raw[:5])
        if is_heading:
            if started:
                # Close the previous block before opening the next one.
                finished.append('\n'.join(pending))
                pending = []
            started = True
            # Strip the heading prefix (everything through the first space).
            _, separator, remainder = raw.partition(' ')
            raw = remainder if separator else ''

        if started and raw:  # keep non-empty lines only
            pending.append(raw)

    if pending:
        finished.append('\n'.join(pending))

    return list(enumerate(finished, start=1))
|
||||
|
||||
def process_document(input_path, output_dir):
    """Split a .docx document into blocks and save each block as a text file.

    The document text is read with ``SplitDocxUtil.read_docx``, split with
    ``split_into_blocks`` and every block is written to
    ``<output_dir>/<block number>.txt`` through ``save_to_txt``.

    Args:
        input_path: Path of the .docx file to read.
        output_dir: Directory receiving the block files. It is created when
            missing; regular files already in it are deleted first.

    Returns:
        True when at least one block file was saved, False when the document
        could not be read or nothing was saved.
    """
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # Make sure the output directory exists and holds no stale files.
    if os.path.exists(output_dir):
        for entry in os.listdir(output_dir):
            entry_path = os.path.join(output_dir, entry)
            # os.remove cannot delete directories; skip them instead of crashing.
            if os.path.isfile(entry_path):
                os.remove(entry_path)
    os.makedirs(output_dir, exist_ok=True)

    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    saved_count = 0
    for chunk_num, chunk in chunks:
        chunk = chunk.strip()  # drop surrounding whitespace
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        if save_to_txt(chunk, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0
|
||||
|
||||
# 保留原有的save_to_txt函数
|
||||
def save_to_txt(content, file_path, mode='w'):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: File open mode, ``'w'`` by default.

    Returns:
        True on success; False when any exception occurred (the error is
        printed instead of being raised).
    """
    try:
        with open(file_path, mode, encoding='utf-8') as handle:
            handle.write(content)
    except Exception as err:
        print(f"保存文件{file_path}时出错: {str(err)}")
        return False
    else:
        return True
|
||||
|
||||
if __name__ == "__main__":
    # Document to split. Alternative inputs kept for reference:
    #input_file = '../static/Txt/小学数学知识点_MATH_2.docx'
    #input_file = '../static/Txt/高中文言文_CHINESE_1.docx'
    input_file = '../../static/Txt/小学数学教学中的若干问题_MATH_1.docx'

    # Directory that receives one .txt file per block.
    output_dir = '../Txt/processed_chunks'

    process_document(input_file, output_dir)
|
@ -1,72 +0,0 @@
|
||||
from Config.Config import *
|
||||
from Backup.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
|
||||
from gensim.models import KeyedVectors
|
||||
import jieba
|
||||
import os
|
||||
import time
|
||||
|
||||
# Tags attached to every record inserted by this script.
selectedTags = ["CHINESE_DATA_1", "高中语文文言文"]

# 1. Load the pre-trained Word2Vec vectors (text format, not binary).
model_path = MS_MODEL_PATH
model = KeyedVectors.load_word2vec_format(
    model_path,
    binary=False,
    limit=MS_MODEL_LIMIT,
)
print(f"模型加载成功,词向量维度: {model.vector_size}")
|
||||
|
||||
|
||||
# 功能:将文本转换为嵌入向量
|
||||
def text_to_embedding(text):
    """Turn ``text`` into one embedding by averaging its word vectors.

    The text is tokenised with ``jieba.lcut``; tokens missing from the
    module-level ``model`` are ignored. When no token is found in the model,
    a zero vector of length ``model.vector_size`` is returned as a plain list.
    """
    vectors = []
    for token in jieba.lcut(text):
        if token in model:
            vectors.append(model[token])

    if not vectors:
        return [0.0] * model.vector_size
    return sum(vectors) / len(vectors)
|
||||
|
||||
|
||||
# 2. Manage Milvus connections through a connection pool.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
connection = None

try:
    connection = milvus_pool.get_connection()

    # 3. Set up the collection manager.
    collection_name = MS_COLLECTION_NAME
    collection_manager = MilvusCollectionManager(collection_name)

    # 4. Process every .txt file in the Txt/processed_chunks directory located
    #    next to this script's parent directory.
    txt_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'Txt', 'processed_chunks')

    for filename in os.listdir(txt_dir):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(txt_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            # Only the first line is used to compute the vector.
            first_line = f.readline().strip()
            # The full content is stored alongside for later retrieval.
            full_content = first_line + '\n' + f.read()

        if not first_line:
            print(f"跳过空文件: {filename}")
            continue

        print(f"正在处理文件: {filename}")

        # 5. Current timestamp and the tag payload (tag list plus full content).
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        tags = {"tags": selectedTags, "full_content": full_content}

        # 6. Convert the first line into an embedding vector.
        embedding = text_to_embedding(first_line)

        # 7. Insert the record, one single-row column per field.
        entities = [
            [tags],        # tags
            [first_line],  # user_input
            [timestamp],   # timestamp
            [embedding],   # embedding
        ]
        collection_manager.insert_data(entities)
        print(f"文件 {filename} 数据插入成功")
finally:
    # 8. Always hand the connection back and shut the pool down, even when
    # reading a file or inserting a record raised.
    if connection is not None:
        milvus_pool.release_connection(connection)
    milvus_pool.close()

print("所有文件处理完成")
|
@ -1,52 +0,0 @@
|
||||
from Backup.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
|
||||
from Config.Config import *
|
||||
|
||||
# 1. Manage Milvus connections through a connection pool.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
connection = None

try:
    # 2. Borrow one connection from the pool.
    connection = milvus_pool.get_connection()

    # 3. Set up the collection manager.
    collection_name = MS_COLLECTION_NAME
    collection_manager = MilvusCollectionManager(collection_name)

    # 4. Load the collection into memory.
    collection_manager.load_collection()
    print(f"集合 '{collection_name}' 已加载到内存。")

    # 5. The tag to look for is set directly in code; change it to
    #    "MATH_DATA_1" or any other tag as needed.
    query_tag = "MATH_DATA_2"
    expr = f"array_contains(tags['tags'], '{query_tag}')"
    print(f"查询表达式: {expr}")

    # 6. Run the query and print every matching record.
    try:
        results = collection_manager.collection.query(
            expr=expr,
            output_fields=["id", "tags", "user_input", "timestamp", "embedding"],
            limit=1000
        )
        print(f"查询标签 '{query_tag}' 结果:")
        if results:
            for result in results:
                # A malformed record must not stop the remaining ones from printing.
                try:
                    print(f"ID: {result['id']}")
                    print(f"标签: {result['tags']}")
                    print(f"用户问题: {result['user_input']}")
                    print(f"时间: {result['timestamp']}")
                    print(f"向量: {result['embedding'][:5]}...")
                    print("-" * 40)
                except Exception as e:
                    print(f"处理结果失败: {e}")
        else:
            print(f"未找到标签为 '{query_tag}' 的数据。")
    except Exception as e:
        print(f"查询失败: {e}")
finally:
    # 7./8. Always hand the connection back and shut the pool down, even when
    # setting up or loading the collection raised.
    if connection is not None:
        milvus_pool.release_connection(connection)
    milvus_pool.close()
|
@ -1,92 +0,0 @@
|
||||
import time
|
||||
import jieba # 导入 jieba 分词库
|
||||
from Backup.Milvus.Utils.MilvusCollectionManager import MilvusCollectionManager
|
||||
from Config.Config import *
|
||||
from gensim.models import KeyedVectors
|
||||
|
||||
# 1. Load the pre-trained Word2Vec vectors (text format, not binary) from the
#    path configured as MS_MODEL_PATH.
model_path = MS_MODEL_PATH
model = KeyedVectors.load_word2vec_format(
    model_path,
    binary=False,
    limit=MS_MODEL_LIMIT,
)
print(f"模型加载成功,词向量维度: {model.vector_size}")
|
||||
|
||||
|
||||
# 将文本转换为嵌入向量
|
||||
def text_to_embedding(text):
    """Turn ``text`` into one embedding by averaging its word vectors.

    The text is tokenised with ``jieba.lcut``; tokens missing from the
    module-level ``model`` are ignored. Progress is printed along the way.
    When no token is found in the model, a zero vector of length
    ``model.vector_size`` is returned as a plain list.
    """
    tokens = jieba.lcut(text)  # jieba word segmentation
    print(f"文本: {text}, 分词结果: {tokens}")

    vectors = []
    for token in tokens:
        if token in model:
            vectors.append(model[token])
    print(f"有效词向量数量: {len(vectors)}")

    if not vectors:
        print("未找到有效词,返回零向量")
        return [0.0] * model.vector_size

    mean_vector = sum(vectors) / len(vectors)
    print(f"生成的平均向量: {mean_vector[:5]}...")  # first 5 dimensions only
    return mean_vector
|
||||
|
||||
|
||||
# 2. Manage Milvus connections through a connection pool.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
connection = None

try:
    # 3. Borrow one connection from the pool.
    connection = milvus_pool.get_connection()

    # 4. Set up the collection manager.
    collection_name = MS_COLLECTION_NAME
    collection_manager = MilvusCollectionManager(collection_name)

    # 5. Load the collection into memory.
    collection_manager.load_collection()
    print(f"集合 '{collection_name}' 已加载到内存。")

    # 6. The sentence to search for.
    input_text = "小学数学中有哪些模型?"

    # 7. Convert the sentence into an embedding vector.
    current_embedding = text_to_embedding(input_text)

    # 8. Search for the stored records closest to the sentence.
    start_time = time.time()
    search_params = {
        "metric_type": "L2",                # L2 distance metric
        "params": {"nprobe": MS_NPROBE},    # nprobe parameter of IVF_FLAT
    }
    # The filter expression decides which documents take part in the search.
    # Tags are matched with array_contains, so a record may carry several tags.
    expr = "array_contains(tags['tags'], 'MATH_DATA_1')"
    results = collection_manager.search(
        current_embedding,
        search_params,
        expr=expr,  # tag filter built with array_contains
        limit=5
    )
    end_time = time.time()

    # 9. Print the search results.
    print("最相关的历史对话:")
    if results:
        for hits in results:
            for hit in hits:
                try:
                    # Fetch the non-vector fields of the hit by its id.
                    record = collection_manager.query_by_id(hit.id)
                    print(f"ID: {hit.id}")
                    print(f"标签: {record['tags']}")
                    print(f"用户问题: {record['user_input']}")
                    print(f"时间: {record['timestamp']}")
                    print(f"距离: {hit.distance}")
                    print("-" * 40)  # separator line
                except Exception as e:
                    print(f"查询失败: {e}")
    else:
        print("未找到相关历史对话,请检查查询参数或数据。")

    # 10. Print how long the search call took.
    print(f"查询耗时: {end_time - start_time:.4f} 秒")
finally:
    # 11./12. Always hand the connection back and shut the pool down, even
    # when loading the collection or searching raised.
    if connection is not None:
        milvus_pool.release_connection(connection)
    milvus_pool.close()
|
Loading…
Reference in new issue