diff --git a/dsRag/Milvus/X1_create_collection.py b/dsRag/Milvus/X1_create_collection.py
index 6a56a86f..f1a33d1d 100644
--- a/dsRag/Milvus/X1_create_collection.py
+++ b/dsRag/Milvus/X1_create_collection.py
@@ -26,11 +26,11 @@ if utility.has_collection(collection_name):
 
 # 5. 定义集合的字段和模式
 fields = [
-    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), # 主键字段,自动生成 ID
-    FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=64), # 文档 ID
-    FieldSchema(name="user_input", dtype=DataType.VARCHAR, max_length=65535), # 用户问题
-    FieldSchema(name="timestamp", dtype=DataType.VARCHAR, max_length=32), # 时间
-    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=MS_DIMENSION) # 向量字段,维度为 200
+    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+    FieldSchema(name="tags", dtype=DataType.JSON),  # JSON field so one record can carry several tags
+    FieldSchema(name="user_input", dtype=DataType.VARCHAR, max_length=65535),
+    FieldSchema(name="timestamp", dtype=DataType.VARCHAR, max_length=32),
+    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=MS_DIMENSION)
 ]
 
 schema_description = "Chat records collection with document_id , user_input, and timestamp"
diff --git a/dsRag/Milvus/X4_InsertMathData.py b/dsRag/Milvus/X4_InsertMathData.py
index 0397debe..63033729 100644
--- a/dsRag/Milvus/X4_InsertMathData.py
+++ b/dsRag/Milvus/X4_InsertMathData.py
@@ -46,14 +46,14 @@ for filename in os.listdir(txt_dir):
 
         # 5. 获取当前时间和会话ID
         timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-        document_id = "MATH_DATA_1" # 史校长的这本书定义为 MATH_DATA_1
+        tags = {"tags": ["MATH_DATA_1", "小学数学"]}  # Plain Python dict; Milvus converts it to JSON automatically
 
         # 6. 将文本转换为嵌入向量
         embedding = text_to_embedding(content)
 
         # 7. 插入数据
         entities = [
-            [document_id], # document_id
+            [tags], # tags
             [content], # user_input
             [timestamp], # timestamp
             [embedding] # embedding
diff --git a/dsRag/Milvus/X5_select_all_data.py b/dsRag/Milvus/X5_select_all_data.py
index 60cdfaa6..afaa5c39 100644
--- a/dsRag/Milvus/X5_select_all_data.py
+++ b/dsRag/Milvus/X5_select_all_data.py
@@ -21,7 +21,7 @@ try:
     # 使用 Milvus 的 query 方法查询所有数据
     results = collection_manager.collection.query(
         expr="", # 空表达式表示查询所有数据
-        output_fields=["id", "document_id", "user_input", "timestamp", "embedding"], # 指定返回的字段
+        output_fields=["id", "tags", "user_input", "timestamp", "embedding"],  # Fields to return
         limit=1000 # 设置最大返回记录数
     )
     print("查询结果:")
@@ -29,13 +29,13 @@ try:
     for result in results:
         try:
             # 获取字段值
-            document_id = result["document_id"]
+            tags = result["tags"]
             user_input = result["user_input"]
             timestamp = result["timestamp"]
             embedding = result["embedding"]
             # 打印结果
             print(f"ID: {result['id']}")
-            print(f"文档 ID: {document_id}")
+            print(f"标签: {tags}")
             print(f"用户问题: {user_input}")
             print(f"时间: {timestamp}")
             print(f"向量: {embedding[:5]}...") # 只打印前 5 维向量
diff --git a/dsRag/Milvus/X6_search_near_data.py b/dsRag/Milvus/X6_search_near_data.py
index be6465f9..0546c5cd 100644
--- a/dsRag/Milvus/X6_search_near_data.py
+++ b/dsRag/Milvus/X6_search_near_data.py
@@ -55,7 +55,7 @@ search_params = {
 # 哪些文档查询,哪些不查询,我说了算!
 # 这样的话,我就可以打多个标签了!
 # expr = "document_id in ['MATH_DATA_1', 'MATH_DATA_2']"
-expr = "document_id in ['MATH_DATA_1']"
+expr = "json_contains(tags['tags'], 'MATH_DATA_1')"  # tags['tags'] is a JSON array: filter by membership, not equality
 results = collection_manager.search(
     current_embedding,
     search_params,
@@ -74,7 +74,7 @@ if results:
             # 查询非向量字段
             record = collection_manager.query_by_id(hit.id)
             print(f"ID: {hit.id}")
-            print(f"文档 ID: {record['document_id']}")
+            print(f"标签: {record['tags']}")
             print(f"用户问题: {record['user_input']}")
             print(f"时间: {record['timestamp']}")
             print(f"距离: {hit.distance}")