"""Split .docx documents into one text file per numbered question/topic block."""
import os

# Line prefixes that mark the start of a new block when followed by a number.
_HEADING_PREFIXES = ('问题', '话题')


def _is_heading(line):
    """Return True when *line* starts a new block.

    A heading starts with one of the heading prefixes and has at least one
    digit among its first five characters.
    """
    return line.startswith(_HEADING_PREFIXES) and any(c.isdigit() for c in line[:5])


def _strip_heading_prefix(line):
    """Drop every leading heading prefix and digit from *line*.

    The scan advances an index instead of re-slicing the string, so it always
    terminates, including when the heading consists of nothing but the prefix
    and its number (the result is then an empty string).
    """
    pos = 0
    end = len(line)
    while pos < end:
        if line.startswith(_HEADING_PREFIXES, pos):
            pos += 2  # both prefixes are two characters long
        elif line[pos].isdigit():
            pos += 1
        else:
            break
    return line[pos:]


def split_into_blocks(text):
    """Split *text* into blocks, one per numbered heading.

    The text is scanned line by line. A line that starts with '问题' or '话题'
    and has a digit within its first five characters opens a new block; the
    prefix and its number are removed but the rest of that line is kept.
    Lines before the first heading are ignored, and blank lines are dropped.

    Args:
        text: The full document text.

    Returns:
        A list of ``(block_number, block_text)`` tuples numbered from 1, with
        the lines of each block joined by ``'\\n'``.
    """
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if _is_heading(line):
            if in_block:
                # Close the previous block (kept even when it has no lines, so
                # numbering keeps following the order of headings).
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_prefix(line)
        line = line.strip()
        if in_block and line:  # only non-empty lines are collected
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(i + 1, block) for i, block in enumerate(blocks)]


def _output_prefix(input_path):
    """Build the output file prefix from the last two '_'-separated name parts.

    For example ``foo_MATH_1.docx`` gives ``MATH_1``. A name with fewer than
    two parts falls back to the whole stem instead of raising ``IndexError``.
    """
    stem = os.path.basename(input_path).split('.')[0]
    return '_'.join(stem.split('_')[-2:])


def process_document(input_path, output_dir):
    """Split one .docx file into blocks and save each block as a .txt file.

    Args:
        input_path: Path of the .docx file to read.
        output_dir: Existing directory that receives ``<prefix>_<n>.txt`` files.

    Returns:
        True when at least one block file was written, otherwise False.
    """
    # Imported here so the pure-text helpers in this module stay importable
    # without the project's docx reader.
    from Util.SplitDocxUtil import SplitDocxUtil

    # NOTE(review): read_docx presumably returns the document text as a str,
    # or a falsy value on failure -- confirm against SplitDocxUtil.
    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    # The output directory is cleared once by process_directory, not here.
    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    file_prefix = _output_prefix(input_path)
    saved_count = 0
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
        if save_to_txt(chunk.strip(), output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0


def save_to_txt(content, file_path, mode='w'):
    """Write *content* to *file_path* as UTF-8 text.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: Mode passed to ``open`` (``'w'`` overwrites, ``'a'`` appends).

    Returns:
        True on success; False (after printing the error) when the file
        cannot be opened or written.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content)
        return True
    except (OSError, ValueError, TypeError) as e:
        # OSError: filesystem problems; ValueError: bad mode or unencodable
        # text; TypeError: non-str content.
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False


def process_directory(input_dir, output_dir):
    """Process every .docx file in *input_dir*, writing blocks to *output_dir*.

    Regular files already present in *output_dir* are deleted once before
    processing starts. Sub-directories are left alone, and nothing is deleted
    when *output_dir* is the input directory itself.

    Args:
        input_dir: Directory that is searched (non-recursively) for .docx files.
        output_dir: Directory that receives the generated .txt files; created
            when missing.

    Returns:
        True when at least one document was processed successfully.
    """
    if not os.path.exists(input_dir):
        print(f"输入目录不存在: {input_dir}")
        return False

    # Ensure the output directory exists and clear it (done only once).
    if os.path.exists(output_dir) and not os.path.samefile(input_dir, output_dir):
        for name in os.listdir(output_dir):
            stale = os.path.join(output_dir, name)
            # os.remove cannot delete directories, so only files/links go.
            if os.path.isfile(stale) or os.path.islink(stale):
                os.remove(stale)
    os.makedirs(output_dir, exist_ok=True)

    docx_files = sorted(f for f in os.listdir(input_dir) if f.lower().endswith('.docx'))
    if not docx_files:
        print(f"目录中没有找到docx文件: {input_dir}")
        return False

    success_count = 0
    for docx_file in docx_files:
        input_path = os.path.join(input_dir, docx_file)
        print(f"正在处理文件: {docx_file}")
        if process_document(input_path, output_dir):
            success_count += 1

    print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件")
    return success_count > 0


if __name__ == "__main__":
    input_dir = '../static/Txt'
    output_dir = '../Txt'
    process_directory(input_dir, output_dir)
b/dsRag/static/Images/327c585f636f44a9a5a741d429344e55.png deleted file mode 100644 index 73c7f205..00000000 Binary files a/dsRag/static/Images/327c585f636f44a9a5a741d429344e55.png and /dev/null differ diff --git a/dsRag/static/Images/8a2ee23572fd4a0ebb90e2ad6334b85a.png b/dsRag/static/Images/8a2ee23572fd4a0ebb90e2ad6334b85a.png deleted file mode 100644 index 73c7f205..00000000 Binary files a/dsRag/static/Images/8a2ee23572fd4a0ebb90e2ad6334b85a.png and /dev/null differ