From 2fbffe2fd469ab2353befd271e10a2f74460e497 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Sat, 28 Jun 2025 10:50:31 +0800 Subject: [PATCH] 'commit' --- dsRag/ElasticSearch/T2_SplitTxt.py | 200 +++++++++++++++++++++-------- dsRag/Test/Test_MatchImage.py | 185 -------------------------- 2 files changed, 143 insertions(+), 242 deletions(-) delete mode 100644 dsRag/Test/Test_MatchImage.py diff --git a/dsRag/ElasticSearch/T2_SplitTxt.py b/dsRag/ElasticSearch/T2_SplitTxt.py index 3476c8c4..b63a0854 100644 --- a/dsRag/ElasticSearch/T2_SplitTxt.py +++ b/dsRag/ElasticSearch/T2_SplitTxt.py @@ -1,6 +1,106 @@ +import re +import docx + import os +import shutil +import uuid +import zipfile + +from docx import Document +from docx.oxml.ns import nsmap + + +def extract_images_from_docx(docx_path, output_folder): + """ + 从docx提取图片并记录位置 + :param docx_path: Word文档路径 + :param output_folder: 图片输出文件夹 + :return: 包含图片路径和位置的列表 + """ + # 创建一个List 记录每个图片的名称和序号 + image_data = [] + # 创建临时解压目录 + temp_dir = os.path.join(output_folder, "temp_docx") + os.makedirs(temp_dir, exist_ok=True) + + # 解压docx文件 + with zipfile.ZipFile(docx_path, 'r') as zip_ref: + zip_ref.extractall(temp_dir) + + # 读取主文档关系 + with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: + rels_content = rels_file.read() + + # 加载主文档 + doc = Document(docx_path) + img_counter = 1 + + # 遍历所有段落 + for para_idx, paragraph in enumerate(doc.paragraphs): + for run_idx, run in enumerate(paragraph.runs): + # 检查运行中的图形 + for element in run._element: + if element.tag.endswith('drawing'): + # 提取图片关系ID + blip = element.find('.//a:blip', namespaces=nsmap) + if blip is not None: + embed_id = blip.get('{%s}embed' % nsmap['r']) + + # 从关系文件中获取图片文件名 + rel_entry = f' 0 # 保留原有的save_to_txt函数 def save_to_txt(content, file_path, mode='w'): @@ -66,34 +142,44 @@ def save_to_txt(content, file_path, mode='w'): print(f"保存文件{file_path}时出错: {str(e)}") return False -def process_directory(input_dir, output_dir): - """处理目录下所有docx文件""" - if not os.path.exists(input_dir): - print(f"输入目录不存在: {input_dir}") - return False - - # 确保输出目录存在并清空目录(只需一次) - if os.path.exists(output_dir): - for file in os.listdir(output_dir): - os.remove(os.path.join(output_dir, file)) - os.makedirs(output_dir, exist_ok=True) - - docx_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.docx')] - if not docx_files: - print(f"目录中没有找到docx文件: {input_dir}") - return False - - success_count = 0 - for docx_file in docx_files: - input_path = os.path.join(input_dir, docx_file) - print(f"正在处理文件: {docx_file}") - if process_document(input_path, output_dir): - success_count += 1 - - print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件") - return success_count > 0 if __name__ == "__main__": - input_dir = '../static/Txt' - output_dir = '../Txt' - process_directory(input_dir, output_dir) + word_document_path = "/static/Test/带图的WORD文档_MATH_3.docx" + txt_output_dir = "/Txt/" + img_output_dir = "/static/Images/" + + # 提取图片 + listImage = extract_images_from_docx(word_document_path, img_output_dir) + # 读取内容 + res = read_word_content(word_document_path) + # 分块 + chunks = split_into_blocks(res) + saved_count = 0 + + # 使用原来的正则表达式 + pattern = re.compile(r'【图片\d+】') + # 初始化图片索引 + img_idx = 0 + + for x in chunks: + firstLine = x[1].split("\n")[0].strip() + content = x[1][len(firstLine):].strip() + + # 使用finditer查找所有匹配项 + # 使用闭包函数替换所有匹配项 + img_idx = [0] # 使用列表实现可变状态 + def replacer(match): + if img_idx[0] < len(listImage): + result = f"" + img_idx[0] += 1 + return result + return match.group() + + content = pattern.sub(replacer, content) + # 保存文本文件 + output_file = os.path.join(txt_output_dir, f"MATH_3_{x[0]}.txt") + full_content = f"{firstLine}\n{content}" + if save_to_txt(full_content, output_file, mode='w'): + saved_count += 1 + + print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}") diff --git a/dsRag/Test/Test_MatchImage.py b/dsRag/Test/Test_MatchImage.py deleted file mode 100644 index 9303f272..00000000 --- a/dsRag/Test/Test_MatchImage.py +++ /dev/null @@ -1,185 +0,0 @@ -import re -import docx - -import os -import shutil -import uuid -import zipfile - -from docx import Document -from docx.oxml.ns import nsmap - - -def extract_images_from_docx(docx_path, output_folder): - """ - 从docx提取图片并记录位置 - :param docx_path: Word文档路径 - :param output_folder: 图片输出文件夹 - :return: 包含图片路径和位置的列表 - """ - # 创建一个List 记录每个图片的名称和序号 - image_data = [] - # 创建临时解压目录 - temp_dir = os.path.join(output_folder, "temp_docx") - os.makedirs(temp_dir, exist_ok=True) - - # 解压docx文件 - with zipfile.ZipFile(docx_path, 'r') as zip_ref: - zip_ref.extractall(temp_dir) - - # 读取主文档关系 - with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file: - rels_content = rels_file.read() - - # 加载主文档 - doc = Document(docx_path) - img_counter = 1 - - # 遍历所有段落 - for para_idx, paragraph in enumerate(doc.paragraphs): - for run_idx, run in enumerate(paragraph.runs): - # 检查运行中的图形 - for element in run._element: - if element.tag.endswith('drawing'): - # 提取图片关系ID - blip = element.find('.//a:blip', namespaces=nsmap) - if blip is not None: - embed_id = blip.get('{%s}embed' % nsmap['r']) - - # 从关系文件中获取图片文件名 - rel_entry = f' 2 else line - elif line and line[0].isdigit(): - line = line[1:] if len(line) > 1 else line - line = line.strip() - if in_block and line: # 只添加非空行 - current_block.append(line) - - if current_block: - blocks.append('\n'.join(current_block)) - - return [(i + 1, block) for i, block in enumerate(blocks)] - - -# 保留原有的save_to_txt函数 -def save_to_txt(content, file_path, mode='w'): - """将内容保存到文本文件""" - try: - with open(file_path, mode, encoding='utf-8') as f: - f.write(content) - return True - except Exception as e: - print(f"保存文件{file_path}时出错: {str(e)}") - return False - - -if __name__ == "__main__": - word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx" - txt_output_dir = "d:\\dsWork\\dsProject\\dsRag\\Txt\\" - img_output_dir = "d:\\dsWork\\dsProject\\dsRag\\static\\Images\\" - - # 提取图片 - listImage = extract_images_from_docx(word_document_path, img_output_dir) - # 读取内容 - res = read_word_content(word_document_path) - # 分块 - chunks = split_into_blocks(res) - saved_count = 0 - - # 使用原来的正则表达式 - pattern = re.compile(r'【图片\d+】') - # 初始化图片索引 - img_idx = 0 - - for x in chunks: - firstLine = x[1].split("\n")[0].strip() - content = x[1][len(firstLine):].strip() - - # 使用finditer查找所有匹配项 - # 使用闭包函数替换所有匹配项 - img_idx = [0] # 使用列表实现可变状态 - def replacer(match): - if img_idx[0] < len(listImage): - result = f"" - img_idx[0] += 1 - return result - return match.group() - - content = pattern.sub(replacer, content) - # 保存文本文件 - output_file = os.path.join(txt_output_dir, f"MATH_3_{x[0]}.txt") - full_content = f"{firstLine}\n{content}" - if save_to_txt(full_content, output_file, mode='w'): - saved_count += 1 - - print(f"处理完成,共保存{saved_count}个文件到目录: {txt_output_dir}")