'commit'

4 weeks ago · 2fbffe2fd4
parent e6fee1edb4
commit 2fbffe2fd4
2 changed files with 143 additions and 242 deletions
--- a/dsRag/ElasticSearch/T2_SplitTxt.py
+++ b/dsRag/ElasticSearch/T2_SplitTxt.py
@ -1,6 +1,106 @@
+import re
+import docx
+
 import os
+import shutil
+import uuid
+import zipfile
+
+from docx import Document
+from docx.oxml.ns import nsmap
+
+
+def extract_images_from_docx(docx_path, output_folder):
+    """
+    Copy the embedded images of a .docx file into ``output_folder``.
+
+    The document is walked paragraph by paragraph and run by run, so the
+    returned names follow document order. Every copied image is stored under
+    a fresh ``uuid4`` hex name that keeps the extension of the original file.
+
+    :param docx_path: path of the Word document to read
+    :param output_folder: folder receiving the images (created if missing)
+    :return: list of generated image file names, in document order
+    """
+    # Function-scope import: only this function parses the relationships part.
+    import xml.etree.ElementTree as ET
+
+    image_data = []
+    # The archive is unpacked below the image folder; creating this directory
+    # also creates output_folder itself.
+    temp_dir = os.path.join(output_folder, "temp_docx")
+    os.makedirs(temp_dir, exist_ok=True)
+
+    try:
+        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+
+        # Map relationship id -> target path. Parsing the XML (instead of
+        # scanning the raw text) honours the encoding declared by the file
+        # and does not depend on the order of the attributes.
+        rels_path = os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels')
+        rel_targets = {
+            rel.get('Id'): rel.get('Target')
+            for rel in ET.parse(rels_path).getroot()
+        }
+
+        doc = Document(docx_path)
+        for paragraph in doc.paragraphs:
+            for run in paragraph.runs:
+                # Only direct children of the run element are inspected.
+                for element in run._element:
+                    if not element.tag.endswith('drawing'):
+                        continue
+                    blip = element.find('.//a:blip', namespaces=nsmap)
+                    if blip is None:
+                        continue
+                    embed_id = blip.get('{%s}embed' % nsmap['r'])
+                    image_path = rel_targets.get(embed_id)
+                    if not image_path:
+                        continue
+
+                    # Dropping '..' and leading '/' keeps the source path
+                    # inside the extracted 'word' directory.
+                    src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))
+                    if not os.path.exists(src_path):
+                        continue
+
+                    ext = os.path.splitext(src_path)[1]
+                    img_name = f"{uuid.uuid4().hex}{ext}"
+                    image_data.append(img_name)
+                    shutil.copy(src_path, os.path.join(output_folder, img_name))
+    finally:
+        # Remove the extracted archive even when reading or copying failed.
+        shutil.rmtree(temp_dir)
+    return image_data
+
+def read_word_content(docx_path):
+    """
+    Flatten a Word document into text, one entry per paragraph.
+
+    A paragraph in which any run has a direct child element whose tag ends
+    with 'drawing' is replaced by the marker 【图片N】, where N counts such
+    paragraphs from 1; the text of that paragraph is not emitted. Any other
+    paragraph contributes its stripped text, and blank paragraphs are
+    skipped. Every entry is preceded by a newline, so a non-empty result
+    starts with a newline character.
+
+    :param docx_path: path of the Word document to read
+    :return: the flattened text; an empty string when the document cannot be
+        processed (the error is printed), so callers can always treat the
+        result as a string
+    """
+    parts = []
+    idx = 0
+    try:
+        doc = docx.Document(docx_path)
+
+        for paragraph in doc.paragraphs:
+            has_image = any(
+                element.tag.endswith('drawing')
+                for run in paragraph.runs
+                for element in run._element
+            )
 
-from Util.SplitDocxUtil import SplitDocxUtil
+            if has_image:
+                idx += 1
+                parts.append("【图片" + str(idx) + "】")
+            else:
+                text = paragraph.text.strip()
+                if text:
+                    parts.append(text)
+    except Exception as e:
+        print(f"处理Word文档时出错: {str(e)}")
+        return ""
+    # Joined once at the end instead of growing a string paragraph by paragraph.
+    return "".join("\n" + part for part in parts)


 def split_into_blocks(text):
@ -8,7 +108,7 @@ def split_into_blocks(text):
    blocks = []
    current_block = []
    in_block = False
-    
+
    for line in text.splitlines():
        if line.startswith(('问题', '话题')) and any(c.isdigit() for c in line[:5]):
            if in_block:
@ -24,36 +124,12 @@ def split_into_blocks(text):
                line = line.strip()
        if in_block and line:  # 只添加非空行
            current_block.append(line)
-    
+
    if current_block:
        blocks.append('\n'.join(current_block))
-    
-    return [(i+1, block) for i, block in enumerate(blocks)]
-
-def process_document(input_path, output_dir):
-    """处理文档主函数"""
-    text = SplitDocxUtil.read_docx(input_path)
-    if not text:
-        print("无法读取输入文件内容")
-        return False
-    
-    # 清空目录操作已移到process_directory函数中

-    chunks = split_into_blocks(text)
-    print(f"共分割出{len(chunks)}个段落块")
+    return [(i + 1, block) for i, block in enumerate(blocks)]

-    saved_count = 0
-    # 从输入文件名中提取MATH_1部分
-    file_prefix = os.path.basename(input_path).split('.')[0].split('_')[-2] + '_' + os.path.basename(input_path).split('.')[0].split('_')[-1]
-    
-    for chunk_num, chunk in chunks:
-        chunk = chunk.strip()  # 确保去除空白字符
-        output_file = os.path.join(output_dir, f"{file_prefix}_{chunk_num}.txt")
-        if save_to_txt(chunk, output_file, mode='w'):
-            saved_count += 1
-
-    print(f"处理完成，共保存{saved_count}个文件到目录: {output_dir}")
-    return saved_count > 0

 # 保留原有的save_to_txt函数
 def save_to_txt(content, file_path, mode='w'):
@ -66,34 +142,44 @@ def save_to_txt(content, file_path, mode='w'):
        print(f"保存文件{file_path}时出错: {str(e)}")
        return False

-def process_directory(input_dir, output_dir):
-    """处理目录下所有docx文件"""
-    if not os.path.exists(input_dir):
-        print(f"输入目录不存在: {input_dir}")
-        return False
-    
-    # 确保输出目录存在并清空目录（只需一次）
-    if os.path.exists(output_dir):
-        for file in os.listdir(output_dir):
-            os.remove(os.path.join(output_dir, file))
-    os.makedirs(output_dir, exist_ok=True)
-    
-    docx_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.docx')]
-    if not docx_files:
-        print(f"目录中没有找到docx文件: {input_dir}")
-        return False
-    
-    success_count = 0
-    for docx_file in docx_files:
-        input_path = os.path.join(input_dir, docx_file)
-        print(f"正在处理文件: {docx_file}")
-        if process_document(input_path, output_dir):
-            success_count += 1
-    
-    print(f"处理完成，共处理{success_count}/{len(docx_files)}个文件")
-    return success_count > 0

 if __name__ == "__main__":
-    input_dir = '../static/Txt'
-    output_dir = '../Txt'
-    process_directory(input_dir, output_dir)
+    word_document_path = "/static/Test/带图的WORD文档_MATH_3.docx"
+    txt_output_dir = "/Txt/"
+    img_output_dir = "/static/Images/"
+
+    # Extract the images; listImage[k] is the file copied for image k + 1.
+    listImage = extract_images_from_docx(word_document_path, img_output_dir)
+    # Flatten the document to text with 【图片N】 markers, then split it.
+    res = read_word_content(word_document_path)
+    chunks = split_into_blocks(res)
+    saved_count = 0
+
+    # Output prefix = last two '_'-separated parts of the document name,
+    # e.g. "带图的WORD文档_MATH_3.docx" -> "MATH_3".
+    doc_stem = os.path.splitext(os.path.basename(word_document_path))[0]
+    file_prefix = "_".join(doc_stem.split("_")[-2:])
+
+    # The captured number is the document-wide image index written by
+    # read_word_content, so it selects the image directly. A counter that
+    # restarts for each chunk would hand the first images to every chunk.
+    pattern = re.compile(r'【图片(\d+)】')
+
+    def replacer(match):
+        img_pos = int(match.group(1)) - 1
+        if 0 <= img_pos < len(listImage):
+            return f"<img src=\"./static/Images/{listImage[img_pos]}\">"
+        # No extracted image for this marker: leave the marker untouched.
+        return match.group()
+
+    # Make sure the target directory exists before writing the chunk files.
+    os.makedirs(txt_output_dir, exist_ok=True)
+
+    for chunk_num, block in chunks:
+        # The first line of a block is its heading; the rest is the body.
+        head, _, rest = block.partition("\n")
+        firstLine = head.strip()
+        content = pattern.sub(replacer, rest.strip())
+
+        output_file = os.path.join(txt_output_dir, f"{file_prefix}_{chunk_num}.txt")
+        full_content = f"{firstLine}\n{content}"
+        if save_to_txt(full_content, output_file, mode='w'):
+            saved_count += 1
+
+    print(f"处理完成，共保存{saved_count}个文件到目录: {txt_output_dir}")
--- a/dsRag/Test/Test_MatchImage.py
+++ b/dsRag/Test/Test_MatchImage.py
@ -1,185 +0,0 @@
-import re
-import docx
-
-import os
-import shutil
-import uuid
-import zipfile
-
-from docx import Document
-from docx.oxml.ns import nsmap
-
-
-def extract_images_from_docx(docx_path, output_folder):
-    """
-    从docx提取图片并记录位置
-    :param docx_path: Word文档路径
-    :param output_folder: 图片输出文件夹
-    :return: 包含图片路径和位置的列表
-    """
-    # 创建一个List<String> 记录每个图片的名称和序号
-    image_data = []
-    # 创建临时解压目录
-    temp_dir = os.path.join(output_folder, "temp_docx")
-    os.makedirs(temp_dir, exist_ok=True)
-
-    # 解压docx文件
-    with zipfile.ZipFile(docx_path, 'r') as zip_ref:
-        zip_ref.extractall(temp_dir)
-
-    # 读取主文档关系
-    with open(os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels'), 'r') as rels_file:
-        rels_content = rels_file.read()
-
-    # 加载主文档
-    doc = Document(docx_path)
-    img_counter = 1
-
-    # 遍历所有段落
-    for para_idx, paragraph in enumerate(doc.paragraphs):
-        for run_idx, run in enumerate(paragraph.runs):
-            # 检查运行中的图形
-            for element in run._element:
-                if element.tag.endswith('drawing'):
-                    # 提取图片关系ID
-                    blip = element.find('.//a:blip', namespaces=nsmap)
-                    if blip is not None:
-                        embed_id = blip.get('{%s}embed' % nsmap['r'])
-
-                        # 从关系文件中获取图片文件名
-                        rel_entry = f'<Relationship Id="{embed_id}"'
-                        if rel_entry in rels_content:
-                            start = rels_content.find(rel_entry)
-                            target_start = rels_content.find('Target="', start) + 8
-                            target_end = rels_content.find('"', target_start)
-                            image_path = rels_content[target_start:target_end]
-
-                            # 构建图片源路径
-                            src_path = os.path.join(temp_dir, 'word', image_path.replace('..', '').lstrip('/'))
-
-                            if os.path.exists(src_path):
-                                # 创建输出文件名
-                                ext = os.path.splitext(src_path)[1]
-                                # 名称为uuid
-                                fileName=uuid.uuid4().hex
-                                img_name = f"{fileName}{ext}"
-                                image_data.append(img_name)
-                                dest_path = os.path.join(output_folder, img_name)
-                                # 复制图片
-                                shutil.copy(src_path, dest_path)
-
-                                img_counter += 1
-
-    # 清理临时目录
-    shutil.rmtree(temp_dir)
-    return image_data
-
-def read_word_content(docx_path):
-    res = ""
-    idx = 0
-    """遍历Word文档的每个段落，输出文字或图片标识"""
-    try:
-        doc = docx.Document(docx_path)
-
-        for paragraph in doc.paragraphs:
-            has_image = False
-            # 检查段落中是否有图片
-            for run in paragraph.runs:
-                for element in run._element:
-                    if element.tag.endswith('drawing'):
-                        # 找到图片元素
-                        has_image = True
-                        break
-                if has_image:
-                    break
-
-            if has_image:
-                idx = idx + 1
-                res = res + "\n" + "【图片" + str(idx) + "】"
-            elif paragraph.text.strip():
-                res = res + "\n" + paragraph.text.strip()
-        return res
-    except Exception as e:
-        print(f"处理Word文档时出错: {str(e)}")
-
-
-def split_into_blocks(text):
-    """按行遍历文本，发现'问题X'或'话题X'时开始分割，只移除前缀但保留整行内容"""
-    blocks = []
-    current_block = []
-    in_block = False
-
-    for line in text.splitlines():
-        if line.startswith(('问题', '话题')) and any(c.isdigit() for c in line[:5]):
-            if in_block:
-                blocks.append('\n'.join(current_block))
-                current_block = []
-            in_block = True
-            # 循环移除问题和话题前缀后的数字
-            while line and (line.startswith(('问题', '话题')) or (line and line and line[0].isdigit())):
-                if line.startswith(('问题', '话题')):
-                    line = line[2:] if len(line) > 2 else line
-                elif line and line[0].isdigit():
-                    line = line[1:] if len(line) > 1 else line
-                line = line.strip()
-        if in_block and line:  # 只添加非空行
-            current_block.append(line)
-
-    if current_block:
-        blocks.append('\n'.join(current_block))
-
-    return [(i + 1, block) for i, block in enumerate(blocks)]
-
-
-# 保留原有的save_to_txt函数
-def save_to_txt(content, file_path, mode='w'):
-    """将内容保存到文本文件"""
-    try:
-        with open(file_path, mode, encoding='utf-8') as f:
-            f.write(content)
-        return True
-    except Exception as e:
-        print(f"保存文件{file_path}时出错: {str(e)}")
-        return False
-
-
-if __name__ == "__main__":
-    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
-    txt_output_dir = "d:\\dsWork\\dsProject\\dsRag\\Txt\\"
-    img_output_dir = "d:\\dsWork\\dsProject\\dsRag\\static\\Images\\"
-
-    # 提取图片
-    listImage = extract_images_from_docx(word_document_path, img_output_dir)
-    # 读取内容
-    res = read_word_content(word_document_path)
-    # 分块
-    chunks = split_into_blocks(res)
-    saved_count = 0
-
-    # 使用原来的正则表达式
-    pattern = re.compile(r'【图片\d+】')
-    # 初始化图片索引
-    img_idx = 0
-
-    for x in chunks:
-        firstLine = x[1].split("\n")[0].strip()
-        content = x[1][len(firstLine):].strip()
-
-        # 使用finditer查找所有匹配项
-        # 使用闭包函数替换所有匹配项
-        img_idx = [0]  # 使用列表实现可变状态
-        def replacer(match):
-            if img_idx[0] < len(listImage):
-                result = f"<img src=\"./static/Images/{listImage[img_idx[0]]}\">"
-                img_idx[0] += 1
-                return result
-            return match.group()
-            
-        content = pattern.sub(replacer, content)
-        # 保存文本文件
-        output_file = os.path.join(txt_output_dir, f"MATH_3_{x[0]}.txt")
-        full_content = f"{firstLine}\n{content}"
-        if save_to_txt(full_content, output_file, mode='w'):
-            saved_count += 1
-
-    print(f"处理完成，共保存{saved_count}个文件到目录: {txt_output_dir}")