diff --git a/dsRag/Test/T1_extract_images_from_docx.py b/dsRag/Test/T1_extract_images_from_docx.py deleted file mode 100644 index 72c9a75a..00000000 --- a/dsRag/Test/T1_extract_images_from_docx.py +++ /dev/null @@ -1,8 +0,0 @@ -from Util.WordImageUtil import extract_images_from_docx - - -# 使用示例 -if __name__ == "__main__": - word_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx" - output_dir = "../static/Images" # 图片输出目录 - extract_images_from_docx(word_path, output_dir) diff --git a/dsRag/Test/T3.py b/dsRag/Test/T3.py deleted file mode 100644 index 3476c8c4..00000000 --- a/dsRag/Test/T3.py +++ /dev/null @@ -1,99 +0,0 @@ -import os - -from Util.SplitDocxUtil import SplitDocxUtil - - -def split_into_blocks(text): - """按行遍历文本,发现'问题X'或'话题X'时开始分割,只移除前缀但保留整行内容""" - blocks = [] - current_block = [] - in_block = False - - for line in text.splitlines(): - if line.startswith(('问题', '话题')) and any(c.isdigit() for c in line[:5]): - if in_block: - blocks.append('\n'.join(current_block)) - current_block = [] - in_block = True - # 循环移除问题和话题前缀后的数字 - while line and (line.startswith(('问题', '话题')) or (line and line and line[0].isdigit())): - if line.startswith(('问题', '话题')): - line = line[2:] if len(line) > 2 else line - elif line and line[0].isdigit(): - line = line[1:] if len(line) > 1 else line - line = line.strip() - if in_block and line: # 只添加非空行 - current_block.append(line) - - if current_block: - blocks.append('\n'.join(current_block)) - - return [(i+1, block) for i, block in enumerate(blocks)] - -def process_document(input_path, output_dir): - """处理文档主函数""" - text = SplitDocxUtil.read_docx(input_path) - if not text: - print("无法读取输入文件内容") - return False - - # 清空目录操作已移到process_directory函数中 - - chunks = split_into_blocks(text) - print(f"共分割出{len(chunks)}个段落块") - - saved_count = 0 - # 从输入文件名中提取MATH_1部分 - file_prefix = os.path.basename(input_path).split('.')[0].split('_')[-2] + '_' + os.path.basename(input_path).split('.')[0].split('_')[-1] - - for chunk_num, chunk in chunks: - chunk = chunk.strip() # 确保去除空白字符 - output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt") - if save_to_txt(chunk, output_file, mode='w'): - saved_count += 1 - - print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}") - return saved_count > 0 - -# 保留原有的save_to_txt函数 -def save_to_txt(content, file_path, mode='w'): - """将内容保存到文本文件""" - try: - with open(file_path, mode, encoding='utf-8') as f: - f.write(content) - return True - except Exception as e: - print(f"保存文件{file_path}时出错: {str(e)}") - return False - -def process_directory(input_dir, output_dir): - """处理目录下所有docx文件""" - if not os.path.exists(input_dir): - print(f"输入目录不存在: {input_dir}") - return False - - # 确保输出目录存在并清空目录(只需一次) - if os.path.exists(output_dir): - for file in os.listdir(output_dir): - os.remove(os.path.join(output_dir, file)) - os.makedirs(output_dir, exist_ok=True) - - docx_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.docx')] - if not docx_files: - print(f"目录中没有找到docx文件: {input_dir}") - return False - - success_count = 0 - for docx_file in docx_files: - input_path = os.path.join(input_dir, docx_file) - print(f"正在处理文件: {docx_file}") - if process_document(input_path, output_dir): - success_count += 1 - - print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件") - return success_count > 0 - -if __name__ == "__main__": - input_dir = '../static/Txt' - output_dir = '../Txt' - process_directory(input_dir, output_dir) diff --git a/dsRag/Test/T2_read_word_content.py b/dsRag/Test/Test_MatchImage.py similarity index 93% rename from dsRag/Test/T2_read_word_content.py rename to dsRag/Test/Test_MatchImage.py index 7b16d2bb..557d5e29 100644 --- a/dsRag/Test/T2_read_word_content.py +++ b/dsRag/Test/Test_MatchImage.py @@ -2,6 +2,8 @@ import os import docx +from Util.WordImageUtil import extract_images_from_docx + def read_word_content(docx_path): res = "" @@ -74,6 +76,7 @@ def save_to_txt(content, file_path, mode='w'): if __name__ == "__main__": word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx" output_dir="d:\\dsWork\\dsProject\\dsRag\\static\\Test\\" + extract_images_from_docx(word_document_path, output_dir) res = read_word_content(word_document_path) chunks = split_into_blocks(res) for x in chunks: diff --git a/dsRag/static/Test/1.png b/dsRag/static/Test/1.png new file mode 100644 index 00000000..238ad2cb Binary files /dev/null and b/dsRag/static/Test/1.png differ diff --git a/dsRag/static/Test/2.png b/dsRag/static/Test/2.png new file mode 100644 index 00000000..73c7f205 Binary files /dev/null and b/dsRag/static/Test/2.png differ