main
HuangHai 4 weeks ago
parent 15b6c65325
commit 39e1f9d4f7

@ -26,11 +26,11 @@ if utility.has_collection(collection_name):
# 5. 定义集合的字段和模式 # 5. 定义集合的字段和模式
fields = [ fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), # 主键字段,自动生成 ID FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=64), # 文档 ID FieldSchema(name="tags", dtype=DataType.JSON), # 改为JSON类型存储多个标签
FieldSchema(name="user_input", dtype=DataType.VARCHAR, max_length=65535), # 用户问题 FieldSchema(name="user_input", dtype=DataType.VARCHAR, max_length=65535),
FieldSchema(name="timestamp", dtype=DataType.VARCHAR, max_length=32), # 时间 FieldSchema(name="timestamp", dtype=DataType.VARCHAR, max_length=32),
FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=MS_DIMENSION) # 向量字段,维度由 MS_DIMENSION 决定 FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=MS_DIMENSION)
] ]
schema_description = "Chat records collection with document_id , user_input, and timestamp" schema_description = "Chat records collection with document_id , user_input, and timestamp"

@ -46,14 +46,14 @@ for filename in os.listdir(txt_dir):
# 5. 获取当前时间和会话ID # 5. 获取当前时间和会话ID
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
document_id = "MATH_DATA_1" # 史校长的这本书定义为 MATH_DATA_1 tags = {"tags": ["MATH_DATA_1", "小学数学"]} # 直接使用Python字典,Milvus会自动转换为JSON
# 6. 将文本转换为嵌入向量 # 6. 将文本转换为嵌入向量
embedding = text_to_embedding(content) embedding = text_to_embedding(content)
# 7. 插入数据 # 7. 插入数据
entities = [ entities = [
[document_id], # document_id [tags], # tags
[content], # user_input [content], # user_input
[timestamp], # timestamp [timestamp], # timestamp
[embedding] # embedding [embedding] # embedding

@ -21,7 +21,7 @@ try:
# 使用 Milvus 的 query 方法查询所有数据 # 使用 Milvus 的 query 方法查询所有数据
results = collection_manager.collection.query( results = collection_manager.collection.query(
expr="", # 空表达式表示查询所有数据 expr="", # 空表达式表示查询所有数据
output_fields=["id", "document_id", "user_input", "timestamp", "embedding"], # 指定返回的字段 output_fields=["id", "tags", "user_input", "timestamp", "embedding"], # 指定返回的字段
limit=1000 # 设置最大返回记录数 limit=1000 # 设置最大返回记录数
) )
print("查询结果:") print("查询结果:")
@ -29,13 +29,13 @@ try:
for result in results: for result in results:
try: try:
# 获取字段值 # 获取字段值
document_id = result["document_id"] tags = result["tags"]
user_input = result["user_input"] user_input = result["user_input"]
timestamp = result["timestamp"] timestamp = result["timestamp"]
embedding = result["embedding"] embedding = result["embedding"]
# 打印结果 # 打印结果
print(f"ID: {result['id']}") print(f"ID: {result['id']}")
print(f"文档 ID: {document_id}") print(f"标签: {tags}")
print(f"用户问题: {user_input}") print(f"用户问题: {user_input}")
print(f"时间: {timestamp}") print(f"时间: {timestamp}")
print(f"向量: {embedding[:5]}...") # 只打印前 5 维向量 print(f"向量: {embedding[:5]}...") # 只打印前 5 维向量

@ -55,7 +55,7 @@ search_params = {
# 哪些文档查询,哪些不查询,我说了算! # 哪些文档查询,哪些不查询,我说了算!
# 这样的话,我就可以打多个标签了! # 这样的话,我就可以打多个标签了!
# expr = "document_id in ['MATH_DATA_1', 'MATH_DATA_2']" # expr = "document_id in ['MATH_DATA_1', 'MATH_DATA_2']"
expr = "document_id in ['MATH_DATA_1']" expr = "tags['tags'] == 'MATH_DATA_1'"
results = collection_manager.search( results = collection_manager.search(
current_embedding, current_embedding,
search_params, search_params,
@ -74,7 +74,7 @@ if results:
# 查询非向量字段 # 查询非向量字段
record = collection_manager.query_by_id(hit.id) record = collection_manager.query_by_id(hit.id)
print(f"ID: {hit.id}") print(f"ID: {hit.id}")
print(f"文档 ID: {record['document_id']}") print(f"标签: {record['tags']}")
print(f"用户问题: {record['user_input']}") print(f"用户问题: {record['user_input']}")
print(f"时间: {record['timestamp']}") print(f"时间: {record['timestamp']}")
print(f"距离: {hit.distance}") print(f"距离: {hit.distance}")

Loading…
Cancel
Save