You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

35 lines
1009 B

1 month ago
"""
pip install faiss-cpu
"""
import re
from Util.SplitDocxUtil import SplitDocxUtil
1 month ago
# Source file: path of the .docx document that is passed to
# SplitDocxUtil.read_docx further down in this script.
input_file = '../Txt/小学数学(史校长).docx'
1 month ago
def split_into_blocks(text: str) -> list:
    """Split *text* into numbered blocks, one per question/topic heading.

    A block starts at a heading of the form ``问题<digits>`` or
    ``话题<digits>`` and runs up to (but not including) the next such
    heading, or to the end of the text. Any text that precedes the first
    heading is not part of any block.

    Args:
        text: The full document text to split.

    Returns:
        A list of ``(index, block_text)`` tuples. ``index`` is 1-based and
        follows the order of appearance; ``block_text`` is the heading
        followed by its content. The list is empty when ``text`` contains
        no heading.

    Note:
        The content group requires at least one character, so a heading
        immediately followed by another heading is merged into one block.
        ``$`` also matches just before a single trailing newline, so that
        final newline is not included in the last block.
    """
    # [\s\S] already matches newlines, so no re.DOTALL flag is required.
    pattern = r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)'
    matches = re.findall(pattern, text)
    return [
        (index, title + content)
        for index, (title, content) in enumerate(matches, start=1)
    ]
1 month ago
"""处理文档主函数"""
text = SplitDocxUtil.read_docx(input_file)
1 month ago
# 切开块
1 month ago
blocks = split_into_blocks(text)
1 month ago
# 将块编码并添加到向量数据库
for block in blocks:
1 month ago
if (len(block[1]) > 10):
print(block[1])
print("\r\n")
1 month ago
# Initialize the Milvus connection pool.
# NOTE(review): MilvusConnectionPool, MilvusCollectionManager and the MS_*
# settings are not imported in the visible part of this file — confirm where
# they are expected to come from.
milvus_pool = MilvusConnectionPool(host=MS_HOST, port=MS_PORT, max_connections=MS_MAX_CONNECTIONS)
# Initialize the collection manager for the configured collection name.
collection_name = MS_COLLECTION_NAME
collection_manager = MilvusCollectionManager(collection_name)