'commit'

4 weeks ago · 35d113b86f
parent 57a2ed2788
commit 35d113b86f
9 changed files with 105 additions and 22 deletions
--- a/dsRag/Test/TestReadWordImage.py
+++ b/dsRag/Test/TestReadWordImage.py
@ -8,10 +8,4 @@ if __name__ == "__main__":
    word_path = "d:\\dsWork\\dsProject\\dsRag\\Test\\带图的WORD文档.docx"
    output_dir = "../static/Images"  # 图片输出目录
    os.makedirs(output_dir, exist_ok=True)
-    images = extract_images_from_docx(word_path, output_dir)
-
-    # 打印结果
-    for img in images:
-        print(f"图片保存至: {img['image_path']}")
-        loc = img['location']
-        print(f"位置信息: 段落 {loc['paragraph_index']}")
+    extract_images_from_docx(word_path, output_dir)
--- a/dsRag/Test/TestReadWordContent.py
+++ b/dsRag/Test/TestReadWordContent.py
@ -1,6 +1,6 @@
 import os
+
 import docx
-from docx.oxml.ns import nsmap


 def read_word_content(docx_path):
--- a/dsRag/Test/T3.py
+++ b/dsRag/Test/T3.py
@ -0,0 +1,99 @@
+import os
+
+from Util.SplitDocxUtil import SplitDocxUtil
+
+
+def split_into_blocks(text):
+    """Split text into numbered blocks, one per '问题N' / '话题N' header line.
+
+    A line opens a new block when it starts with '问题' or '话题' and has a
+    digit among its first five characters. The header's leading keyword(s),
+    digits and the whitespace around them are stripped; whatever is left of
+    the header line becomes the first line of the block. Lines that appear
+    before the first header, and empty lines, are discarded.
+
+    Args:
+        text: The full document text.
+
+    Returns:
+        A list of ``(index, block_text)`` tuples, with 1-based indices in
+        order of appearance. Blocks that end up with no content are omitted.
+    """
+    prefixes = ('问题', '话题')
+    blocks = []
+    current_block = []
+    in_block = False
+
+    for line in text.splitlines():
+        if line.startswith(prefixes) and any(c.isdigit() for c in line[:5]):
+            # A header may carry no text of its own, so the previous block
+            # can be empty here; only flush blocks that have content.
+            if in_block and current_block:
+                blocks.append('\n'.join(current_block))
+            current_block = []
+            in_block = True
+            # Peel the keyword and its number off the front of the header.
+            # The remainder is allowed to become empty: a bare header such
+            # as '问题1' must end the loop instead of spinning forever on
+            # its last character.
+            while line:
+                if line.startswith(prefixes):
+                    line = line[2:]
+                elif line[0].isdigit():
+                    line = line[1:]
+                else:
+                    break
+                line = line.strip()
+        if in_block and line:  # only keep non-empty lines
+            current_block.append(line)
+
+    if current_block:
+        blocks.append('\n'.join(current_block))
+
+    return [(i, block) for i, block in enumerate(blocks, start=1)]
+
+def process_document(input_path, output_dir):
+    """Split one document into blocks and save each block as a .txt file.
+
+    The text is read with ``SplitDocxUtil.read_docx`` and split with
+    ``split_into_blocks``. Every block is written to
+    ``<output_dir>/<prefix>_<n>.txt``, where ``<prefix>`` is the last two
+    underscore-separated fields of the input file's stem (e.g. ``MATH_1``)
+    and ``<n>`` is the block number.
+
+    Args:
+        input_path: Path of the document to process.
+        output_dir: Directory that receives the generated .txt files.
+
+    Returns:
+        True if at least one block file was saved, otherwise False
+        (including when no text could be read from the input).
+    """
+    text = SplitDocxUtil.read_docx(input_path)
+    if not text:
+        print("无法读取输入文件内容")
+        return False
+
+    # Clearing the output directory is handled by process_directory.
+
+    chunks = split_into_blocks(text)
+    print(f"共分割出{len(chunks)}个段落块")
+
+    # Take the trailing "MATH_1"-style part of the file name. splitext keeps
+    # stems that contain dots intact, and slicing the last two fields falls
+    # back to the whole stem when the name has no underscore instead of
+    # raising IndexError.
+    stem = os.path.splitext(os.path.basename(input_path))[0]
+    file_prefix = '_'.join(stem.split('_')[-2:])
+
+    saved_count = 0
+    for chunk_num, chunk in chunks:
+        chunk = chunk.strip()  # make sure surrounding whitespace is removed
+        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
+        if save_to_txt(chunk, output_file, mode='w'):
+            saved_count += 1
+
+    print(f"处理完成，共保存{saved_count}个文件到目录: {output_dir}")
+    return saved_count > 0
+
+# 保留原有的save_to_txt函数
+def save_to_txt(content, file_path, mode='w'):
+    """Write ``content`` to ``file_path`` as UTF-8 text.
+
+    Args:
+        content: The text to write.
+        file_path: Destination file path.
+        mode: File open mode passed to ``open`` (defaults to ``'w'``).
+
+    Returns:
+        True when the write succeeded; False when any exception occurred
+        (the error is printed rather than raised).
+    """
+    try:
+        with open(file_path, mode, encoding='utf-8') as handle:
+            handle.write(content)
+    except Exception as e:
+        # Best-effort save: report the failure and let the caller carry on.
+        print(f"保存文件{file_path}时出错: {str(e)}")
+        return False
+    else:
+        return True
+
+def process_directory(input_dir, output_dir):
+    """Process every .docx file found directly inside ``input_dir``.
+
+    Files already present in ``output_dir`` are deleted once up front, then
+    each .docx file is handed to ``process_document``.
+
+    Args:
+        input_dir: Directory that is scanned for .docx files
+            (case-insensitive extension match, not recursive).
+        output_dir: Directory that receives the generated files; it is
+            created when missing.
+
+    Returns:
+        True if at least one document was processed successfully, otherwise
+        False (also when ``input_dir`` is missing or holds no .docx files).
+    """
+    if not os.path.exists(input_dir):
+        print(f"输入目录不存在: {input_dir}")
+        return False
+
+    # Make sure the output directory exists and is emptied (only once).
+    if os.path.exists(output_dir):
+        for entry in os.listdir(output_dir):
+            entry_path = os.path.join(output_dir, entry)
+            # os.remove raises on directories, so only delete files and
+            # symlinks and leave any subdirectory untouched.
+            if os.path.isfile(entry_path) or os.path.islink(entry_path):
+                os.remove(entry_path)
+    os.makedirs(output_dir, exist_ok=True)
+
+    docx_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.docx')]
+    if not docx_files:
+        print(f"目录中没有找到docx文件: {input_dir}")
+        return False
+
+    success_count = 0
+    for docx_file in docx_files:
+        input_path = os.path.join(input_dir, docx_file)
+        print(f"正在处理文件: {docx_file}")
+        if process_document(input_path, output_dir):
+            success_count += 1
+
+    print(f"处理完成，共处理{success_count}/{len(docx_files)}个文件")
+    return success_count > 0
+
+if __name__ == "__main__":
+    # Script entry point: split every .docx under ../static/Txt into block
+    # files under ../Txt (both paths are relative to the working directory).
+    process_directory(input_dir='../static/Txt', output_dir='../Txt')
--- a/dsRag/static/Images/ccef361db7f6463cb6c79fa980081c56.png
+++ b/dsRag/static/Images/ccef361db7f6463cb6c79fa980081c56.png
--- a/dsRag/Util/WordImageUtil.py
+++ b/dsRag/Util/WordImageUtil.py
@ -28,9 +28,10 @@ def extract_images_from_docx(docx_path, output_folder):

    # 加载主文档
    doc = Document(docx_path)
-    image_data = []
    img_counter = 1

+    idx = 0
+
    # 遍历所有段落
    for para_idx, paragraph in enumerate(doc.paragraphs):
        for run_idx, run in enumerate(paragraph.runs):
@ -57,24 +58,13 @@ def extract_images_from_docx(docx_path, output_folder):
                                # 创建输出文件名
                                ext = os.path.splitext(src_path)[1]
                                # 名称为uuid
-                                img_name = f"{uuid.uuid4().hex}{ext}"
+                                idx = idx + 1
+                                img_name = f"{idx}{ext}"
                                dest_path = os.path.join(output_folder, img_name)
-
                                # 复制图片
                                shutil.copy(src_path, dest_path)

-                                # 记录位置信息
-                                location = {
-                                    "paragraph_index": para_idx
-                                }
-
-                                image_data.append({
-                                    "image_path": dest_path,
-                                    "location": location
-                                })
-
                                img_counter += 1

    # 清理临时目录
    shutil.rmtree(temp_dir)
-    return image_data
--- a/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc
+++ b/dsRag/Util/pycache/WordImageUtil.cpython-310.pyc
--- a/dsRag/static/Images/a61b80b17c914030a6e28f5846f40407.png
+++ b/dsRag/static/Images/a61b80b17c914030a6e28f5846f40407.png
--- a/dsRag/static/Images/327c585f636f44a9a5a741d429344e55.png
+++ b/dsRag/static/Images/327c585f636f44a9a5a741d429344e55.png
--- a/dsRag/static/Images/8a2ee23572fd4a0ebb90e2ad6334b85a.png
+++ b/dsRag/static/Images/8a2ee23572fd4a0ebb90e2ad6334b85a.png