'commit'

4 weeks ago · ad174680e6
parent 9157880400
commit ad174680e6
5 changed files with 99 additions and 49 deletions
--- a/dsRag/Test/TestReadWordTextAndImage.py
+++ b/dsRag/Test/TestReadWordTextAndImage.py
@ -0,0 +1,99 @@
+import os
+
+from Util.SplitDocxUtil import SplitDocxUtil
+
+
+def split_into_blocks(text):
+    """Split text into numbered blocks at '问题N' / '话题N' heading lines.
+
+    A heading is a line that starts with '问题' or '话题' and has a digit
+    somewhere in its first five characters.  The heading's leading
+    '问题' / '话题' markers and digits are stripped; whatever remains of the
+    heading line is kept as the first line of the new block.  Lines that
+    appear before the first heading are discarded, as are empty lines.
+
+    Args:
+        text: The full document text.
+
+    Returns:
+        A list of ``(index, block_text)`` tuples, with ``index`` starting at 1
+        and ``block_text`` being the block's lines joined with newlines.
+    """
+    prefixes = ('问题', '话题')
+    blocks = []
+    current_block = []
+    in_block = False
+
+    for line in text.splitlines():
+        if line.startswith(prefixes) and any(c.isdigit() for c in line[:5]):
+            # A new heading closes the previous block.  Empty blocks are
+            # skipped here exactly as they are at the end of the text.
+            if current_block:
+                blocks.append('\n'.join(current_block))
+                current_block = []
+            in_block = True
+            # Peel off leading markers and digits one step at a time.  The
+            # slices are unconditional so the line always shrinks: a heading
+            # that is nothing but "问题1" ends up empty instead of looping
+            # forever on the last remaining character.
+            while line:
+                if line.startswith(prefixes):
+                    line = line[2:]
+                elif line[0].isdigit():
+                    line = line[1:]
+                else:
+                    break
+                line = line.strip()
+        if in_block and line:  # only keep non-empty lines
+            current_block.append(line)
+
+    if current_block:
+        blocks.append('\n'.join(current_block))
+
+    return [(i + 1, block) for i, block in enumerate(blocks)]
+
+def process_document(input_path, output_dir):
+    """Split one document into blocks and save each block as a .txt file.
+
+    The text is obtained from ``SplitDocxUtil.read_docx(input_path)`` and
+    split with ``split_into_blocks``.  Each block is written to
+    ``<output_dir>/<prefix>_<n>.txt``, where ``<prefix>`` is built from the
+    last two underscore-separated parts of the file name (before its first
+    dot) and ``<n>`` is the block number.
+
+    Args:
+        input_path: Path of the document to read.
+        output_dir: Directory that receives the .txt files.  It is not
+            created or cleared here; ``process_directory`` does that.
+
+    Returns:
+        True if at least one block file was saved, False otherwise
+        (including when no text could be read).
+    """
+    text = SplitDocxUtil.read_docx(input_path)
+    if not text:
+        print("无法读取输入文件内容")
+        return False
+
+    chunks = split_into_blocks(text)
+    print(f"共分割出{len(chunks)}个段落块")
+
+    # Take e.g. the "MATH_1" part of the input file name.  Joining the last
+    # (up to) two parts keeps that result and also works for names without
+    # any underscore, which previously raised IndexError.
+    stem = os.path.basename(input_path).split('.')[0]
+    file_prefix = '_'.join(stem.split('_')[-2:])
+
+    saved_count = 0
+    for chunk_num, chunk in chunks:
+        chunk = chunk.strip()  # drop surrounding whitespace before saving
+        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
+        if save_to_txt(chunk, output_file, mode='w'):
+            saved_count += 1
+
+    print(f"处理完成，共保存{saved_count}个文件到目录: {output_dir}")
+    return saved_count > 0
+
+# Kept from the earlier version of this script.
+def save_to_txt(content, file_path, mode='w'):
+    """Write ``content`` to ``file_path`` as UTF-8 text.
+
+    Args:
+        content: The text to write.
+        file_path: Destination file path.
+        mode: File mode passed to ``open`` (default ``'w'``).
+
+    Returns:
+        True if the write succeeded; False if any exception occurred, in
+        which case the error is printed rather than raised.
+    """
+    try:
+        with open(file_path, mode, encoding='utf-8') as out:
+            out.write(content)
+    except Exception as err:
+        print(f"保存文件{file_path}时出错: {str(err)}")
+        return False
+    else:
+        return True
+
+def process_directory(input_dir, output_dir):
+    """Process every .docx file in ``input_dir`` into ``output_dir``.
+
+    The output directory is cleared of existing files once, up front, and
+    created if missing; then ``process_document`` is called for each file
+    whose name ends with ``.docx`` (case-insensitive).
+
+    Args:
+        input_dir: Directory containing the .docx files.
+        output_dir: Directory that receives the generated files.  Must not
+            be the same directory as ``input_dir``.
+
+    Returns:
+        True if at least one document was processed successfully, False
+        otherwise (missing input directory, output equal to input, no .docx
+        files, or every document failed).
+    """
+    if not os.path.exists(input_dir):
+        print(f"输入目录不存在: {input_dir}")
+        return False
+
+    # Clearing the output directory would delete the inputs themselves if
+    # both paths name the same directory, so refuse that configuration.
+    if os.path.realpath(input_dir) == os.path.realpath(output_dir):
+        print(f"输出目录不能与输入目录相同: {output_dir}")
+        return False
+
+    # Make sure the output directory exists and is cleared (done only once).
+    if os.path.exists(output_dir):
+        for name in os.listdir(output_dir):
+            stale_path = os.path.join(output_dir, name)
+            # os.remove() raises on directories, so only files and symlinks
+            # are deleted; subdirectories are left in place.
+            if os.path.isfile(stale_path) or os.path.islink(stale_path):
+                os.remove(stale_path)
+    os.makedirs(output_dir, exist_ok=True)
+
+    docx_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.docx')]
+    if not docx_files:
+        print(f"目录中没有找到docx文件: {input_dir}")
+        return False
+
+    success_count = 0
+    for docx_file in docx_files:
+        input_path = os.path.join(input_dir, docx_file)
+        print(f"正在处理文件: {docx_file}")
+        if process_document(input_path, output_dir):
+            success_count += 1
+
+    print(f"处理完成，共处理{success_count}/{len(docx_files)}个文件")
+    return success_count > 0
+
+if __name__ == "__main__":
+    # Script entry point: convert the documents under ../static/Txt into
+    # per-block text files under ../Txt (paths relative to the working dir).
+    input_dir, output_dir = '../static/Txt', '../Txt'
+    process_directory(input_dir, output_dir)
--- a/dsRag/Test/TestWriteDoc.py
+++ b/dsRag/Test/TestWriteDoc.py
@ -1,44 +0,0 @@
-from bs4 import BeautifulSoup
-from docx import Document
-"""
-pip install python-docx html2text beautifulsoup4
-"""
-
-def html_to_word(html_content, word_path):
-    # 解析 HTML
-    soup = BeautifulSoup(html_content, 'html.parser')
-
-    # 创建 Word 文档
-    doc = Document()
-
-    # 遍历 HTML 的所有段落
-    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div']):
-        # 获取文本内容
-        text = element.get_text(strip=True)
-        if text:
-            # 添加到 Word 文档
-            doc.add_paragraph(text)
-
-    # 保存 Word 文档
-    doc.save(word_path)
-    print(f"HTML content saved to {word_path}")
-
-
-# 示例 HTML 内容
-html_content = """
-<!DOCTYPE html>
-<html>
-<head>
-    <title>Sample HTML</title>
-</head>
-<body>
-    <h1>Heading 1</h1>
-    <p>This is a paragraph.</p>
-    <h2>Heading 2</h2>
-    <div>Content inside a div.</div>
-</body>
-</html>
-"""
-
-# 调用函数
-html_to_word(html_content, "output.docx")
--- a/dsRag/Test/output.docx
+++ b/dsRag/Test/output.docx
--- a/dsRag/Test/question_1.txt
+++ b/dsRag/Test/question_1.txt
@ -1,4 +0,0 @@
-问题1 教学建议与意义
-在教学过程中，引导学生构建和理解模型，不仅能提升他们分析和解决问题的能力，还能激发他们发现问题和提出问题的意识。例如，在认识路程模型时，教师可通过生活化情境让学生理解速度的概念及其单位表示。
-模型思想是《义务教育数学课程标准》中强调的核心素养之一，它帮助学生建立从现实世界抽象出数学问题的能力，并通过数学语言进行描述和解释。
-因此，在“综合与实践”类教学内容中，应加强模型的应用训练，以培养学生应用数学知识解决实际问题的能力。
--- a/dsRag/Test/question_2.txt
+++ b/dsRag/Test/question_2.txt
@ -1 +0,0 @@
-问题2 我随便写点什么