"""Split .docx documents into one text file per numbered question/topic block.

A block starts at a line beginning with "问题" or "话题" that has a digit
among its first five characters; it runs until the next such line.
"""
import os

# Markers that introduce a new block. Both are exactly two characters long,
# which _strip_header_prefix relies on.
_HEADER_PREFIXES = ('问题', '话题')


def _is_block_header(line):
    """Return True when *line* opens a new question/topic block."""
    return line.startswith(_HEADER_PREFIXES) and any(
        c.isdigit() for c in line[:5]
    )


def _strip_header_prefix(line):
    """Drop leading block markers and digits from a header line.

    Returns an empty string when nothing but the marker and its number is on
    the line. Slicing is allowed to reach the empty string so the loop always
    terminates (a header such as "问题1" used to spin forever because the
    last digit was never removed).
    """
    while line:
        if line.startswith(_HEADER_PREFIXES):
            line = line[2:]
        elif line[0].isdigit():
            line = line[1:]
        else:
            break
        line = line.strip()
    return line


def _file_prefix(input_path):
    """Build the output-name prefix from the last two '_'-separated parts.

    For ".../xxx_MATH_1.docx" this yields "MATH_1". Names with fewer than two
    parts fall back to the whole stem instead of raising IndexError.
    """
    stem = os.path.splitext(os.path.basename(input_path))[0]
    return '_'.join(stem.split('_')[-2:])


def split_into_blocks(text):
    """Split *text* into numbered blocks at question/topic header lines.

    Lines before the first header are ignored. The marker and number are
    removed from each header line and whatever remains of that line is kept
    as the first line of the block. Empty lines are skipped.

    Returns:
        A list of ``(index, block_text)`` tuples, with ``index`` starting at 1.
    """
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if _is_block_header(line):
            if in_block:
                # Close the block that was being collected.
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_header_prefix(line)
        if in_block and line:  # only keep non-empty lines
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return list(enumerate(blocks, start=1))


def save_to_txt(content, file_path, mode='w'):
    """Write *content* to *file_path* as UTF-8.

    Returns:
        True on success; False (after printing the error) when the file
        cannot be opened or written.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as f:
            f.write(content)
    except (OSError, TypeError, ValueError) as e:
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False
    return True


def process_document(input_path, output_dir):
    """Split one .docx file into blocks and save each block as a .txt file.

    Output files are named ``<prefix>_<n>.txt`` where the prefix comes from
    the input file name (see ``_file_prefix``). The output directory is not
    cleared here; ``process_directory`` does that once for the whole run.

    Returns:
        True when at least one block file was written, otherwise False.
    """
    # Imported here rather than at module level so the text-splitting helpers
    # stay importable without the project's docx utilities.
    from Util.SplitDocxUtil import SplitDocxUtil

    text = SplitDocxUtil.read_docx(input_path)
    if not text:
        print("无法读取输入文件内容")
        return False

    chunks = split_into_blocks(text)
    print(f"共分割出{len(chunks)}个段落块")

    file_prefix = _file_prefix(input_path)
    saved_count = 0
    for chunk_num, chunk in chunks:
        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
        # Strip so no block file starts or ends with stray whitespace.
        if save_to_txt(chunk.strip(), output_file, mode='w'):
            saved_count += 1

    print(f"处理完成,共保存{saved_count}个文件到目录: {output_dir}")
    return saved_count > 0


def process_directory(input_dir, output_dir):
    """Process every .docx file found directly inside *input_dir*.

    The output directory is created if needed and the files it already
    contains are removed once, before any document is processed.
    Sub-directories of the output directory are left untouched.

    Returns:
        True when at least one document produced output, otherwise False.
    """
    if not os.path.isdir(input_dir):
        print(f"输入目录不存在: {input_dir}")
        return False

    os.makedirs(output_dir, exist_ok=True)
    for name in os.listdir(output_dir):
        path = os.path.join(output_dir, name)
        # os.remove raises on directories, so only delete files and links.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)

    docx_files = sorted(
        f for f in os.listdir(input_dir)
        # "~$..." entries are lock files Word keeps next to open documents.
        if f.lower().endswith('.docx') and not f.startswith('~$')
    )
    if not docx_files:
        print(f"目录中没有找到docx文件: {input_dir}")
        return False

    success_count = 0
    for docx_file in docx_files:
        input_path = os.path.join(input_dir, docx_file)
        print(f"正在处理文件: {docx_file}")
        if process_document(input_path, output_dir):
            success_count += 1

    print(f"处理完成,共处理{success_count}/{len(docx_files)}个文件")
    return success_count > 0


if __name__ == "__main__":
    input_dir = '../static/Txt'
    output_dir = '../Txt'
    process_directory(input_dir, output_dir)
-This is a paragraph.
-