"""Split a .docx transcript into numbered question/topic blocks and print them.

Setup note kept from the original header: ``pip install faiss-cpu``.
NOTE(review): nothing visible in this file uses faiss; confirm whether the
dependency is still needed.
"""
import re

# Source document, relative to the working directory the script is run from.
input_file = '../Txt/小学数学(史校长).docx'

# A block starts at a heading such as "问题12" or "话题3" and runs lazily up to
# (but not including) the next heading or the end of the text.  ``[\s\S]``
# already matches newlines, so no DOTALL flag is required.
_BLOCK_PATTERN = re.compile(r'(问题\d+|话题\d+)([\s\S]+?)(?=问题\d+|话题\d+|$)')


def split_into_blocks(text: str) -> list:
    """Split *text* into blocks introduced by "问题<n>" / "话题<n>" headings.

    Args:
        text: The full document text.

    Returns:
        A list of ``(index, block_text)`` tuples.  ``index`` counts the blocks
        from 1 in order of appearance (it is not the number in the heading),
        and ``block_text`` is the heading followed by its content.  Text
        before the first heading is discarded; an input without any heading
        yields an empty list.  Because ``$`` also matches just before a final
        newline, one trailing newline at the very end of *text* is not part
        of the last block.
    """
    return [
        (index, title + content)
        for index, (title, content) in enumerate(_BLOCK_PATTERN.findall(text), start=1)
    ]


def main():
    """Read the source document, split it into blocks and print the long ones."""
    # Imported here so that importing this module only for split_into_blocks()
    # does not require the project-local helper package.
    from Util.SplitDocxUtil import SplitDocxUtil

    # Presumably returns the document's text as a single str -- confirm
    # against SplitDocxUtil.
    text = SplitDocxUtil.read_docx(input_file)

    # Split the text into blocks.
    blocks = split_into_blocks(text)

    # NOTE: the original comment at this point said the blocks are encoded and
    # added to a vector database; that step is not implemented here -- the
    # blocks are only printed.
    for _, block_text in blocks:
        # Skip blocks of 10 characters or fewer (heading included).
        if len(block_text) > 10:
            print(block_text)
            print("\r\n")


if __name__ == "__main__":
    main()