# dsProject/dsRag/Test/Test_MatchImage.py

import os

import docx

from Util.WordImageUtil import extract_images_from_docx


def read_word_content(docx_path):
    """Return the document's paragraphs as text, with placeholders for images.

    Paragraphs are visited in document order. A paragraph in which any run
    has a child element whose tag ends with ``drawing`` is emitted as
    ``【图片N】`` (N counts such paragraphs starting at 1) and its own text is
    not emitted. Any other paragraph contributes its stripped text;
    paragraphs that are empty after stripping are skipped.

    Args:
        docx_path: Path handed to ``docx.Document``.

    Returns:
        A string in which every emitted entry is preceded by a newline
        character, so a non-empty result starts with a newline. Returns an
        empty string when nothing was emitted or when the document could not
        be processed (the error is printed, not raised).
    """
    try:
        doc = docx.Document(docx_path)

        parts = []
        image_count = 0
        for paragraph in doc.paragraphs:
            # A run holding a child element tagged '...drawing' marks an image.
            has_image = any(
                element.tag.endswith('drawing')
                for run in paragraph.runs
                for element in run._element
            )

            if has_image:
                image_count += 1
                parts.append(f"【图片{image_count}】")
                continue

            text = paragraph.text.strip()
            if text:
                parts.append(text)

        # Each entry keeps its leading newline, matching the historical format.
        return "".join("\n" + part for part in parts)
    except Exception as e:
        print(f"处理Word文档时出错: {str(e)}")
        # Keep the return type a string so callers can still split the result.
        return ""


_HEADING_PREFIXES = ('问题', '话题')


def _strip_heading_label(line):
    """Drop leading heading prefixes and digits from ``line``; may return ''."""
    while line:
        if line.startswith(_HEADING_PREFIXES):
            # Both prefixes are exactly two characters long.
            line = line[2:]
        elif line[0].isdigit():
            line = line[1:]
        else:
            break
        line = line.strip()
    return line


def split_into_blocks(text):
    """Split ``text`` into numbered blocks, one per '问题N' / '话题N' heading.

    A line opens a new block when it starts with '问题' or '话题' and has a
    digit among its first five characters. The prefix and the digits that
    follow it are removed from that line; whatever remains is kept as the
    first line of the block. Lines before the first heading are ignored, and
    empty lines are never stored.

    A heading made up of nothing but the prefix and digits (for example
    '问题1') leaves no text of its own: the block then starts with the next
    non-empty line. A heading that collects no lines at all produces no
    block.

    Args:
        text: Text to split; lines are separated with ``str.splitlines``.

    Returns:
        A list of ``(number, block_text)`` tuples. Numbers start at 1 and
        each block's lines are joined with a newline.
    """
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if line.startswith(_HEADING_PREFIXES) and any(c.isdigit() for c in line[:5]):
            # Close the previous block before starting the next one.
            if current_block:
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            line = _strip_heading_label(line)
        if in_block and line:  # store non-empty lines only
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return list(enumerate(blocks, start=1))


# The original save_to_txt function is retained as-is
def save_to_txt(content, file_path, mode='w'):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    Args:
        content: Text to write.
        file_path: Destination file path.
        mode: Mode string passed to ``open`` (defaults to ``'w'``).

    Returns:
        True when the write succeeded. False when any exception was raised;
        the error is printed instead of propagated.
    """
    try:
        with open(file_path, mode, encoding='utf-8') as out:
            out.write(content)
    except Exception as err:
        print(f"保存文件{file_path}时出错: {str(err)}")
        return False
    else:
        return True

if __name__ == "__main__":
    # Script entry: extract images, turn the document into text, split the
    # text into question/topic blocks, preview each block, then save each
    # block to "<number>.txt" in the output directory.
    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
    output_dir = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\"

    # NOTE(review): presumably writes the document's images into output_dir;
    # confirm against Util.WordImageUtil.
    extract_images_from_docx(word_document_path, output_dir)

    res = read_word_content(word_document_path)
    # Guard against a falsy result (e.g. None when reading failed) so the
    # splitter always receives a string.
    chunks = split_into_blocks(res or "")

    # Preview: first line of each block, then the rest of the block.
    for _, block in chunks:
        head, _sep, tail = block.partition("\n")
        firstLine = head.strip()
        content = tail.strip()
        print("===段落开始：===")
        print("firstLine=" + firstLine)
        print("content=" + content)
        print("===段落结束：===\n")

    # Save every block exactly once, after the preview loop has finished.
    saved_count = 0
    for chunk_num, chunk in chunks:
        chunk = chunk.strip()  # drop surrounding whitespace before writing
        output_file = os.path.join(output_dir, f"{chunk_num}.txt")
        if save_to_txt(chunk, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成，共保存{saved_count}个文件到目录: {output_dir}")
-												'commit'

											
										
										
											4 weeks ago
+								import os
-												'commit'

											
										
										
											4 weeks ago
+								import docx
-												'commit'

											
										
										
											4 weeks ago
+								from Util.WordImageUtil import extract_images_from_docx
-												'commit'

											
										
										
											4 weeks ago
-												'commit'

											
										
										
											4 weeks ago
+								def read_word_content(docx_path):
-												'commit'

											
										
										
											4 weeks ago
+								    res = ""
-												'commit'

											
										
										
											4 weeks ago
+								    idx = 0
-												'commit'

											
										
										
											4 weeks ago
+								    """遍历Word文档的每个段落，输出文字或图片标识"""
 								    try:
 								        doc = docx.Document(docx_path)
-												'commit'

											
										
										
											4 weeks ago
-												'commit'

											
										
										
											4 weeks ago
+								        for paragraph in doc.paragraphs:
 								            has_image = False
 								            # 检查段落中是否有图片
 								            for run in paragraph.runs:
 								                for element in run._element:
 								                    if element.tag.endswith('drawing'):
 								                        # 找到图片元素
 								                        has_image = True
 								                        break
 								                if has_image:
 								                    break
-												'commit'

											
										
										
											4 weeks ago
-												'commit'

											
										
										
											4 weeks ago
+								            if has_image:
-												'commit'

											
										
										
											4 weeks ago
+								                idx = idx + 1
-												'commit'

											
										
										
											4 weeks ago
+								                res = res + "\n" + "【图片" + str(idx) + "】"
-												'commit'

											
										
										
											4 weeks ago
+								            elif paragraph.text.strip():
-												'commit'

											
										
										
											4 weeks ago
+								                res = res + "\n" + paragraph.text.strip()
 								        return res
-												'commit'

											
										
										
											4 weeks ago
+								    except Exception as e:
 								        print(f"处理Word文档时出错: {str(e)}")
-												'commit'

											
										
										
											4 weeks ago
-												'commit'

											
										
										
											4 weeks ago
+								def split_into_blocks(text):
 								    """按行遍历文本，发现'问题X'或'话题X'时开始分割，只移除前缀但保留整行内容"""
 								    blocks = []
 								    current_block = []
 								    in_block = False
 								    for line in text.splitlines():
 								        if line.startswith(('问题', '话题')) and any(c.isdigit() for c in line[:5]):
 								            if in_block:
 								                blocks.append('\n'.join(current_block))
 								                current_block = []
 								            in_block = True
 								            # 循环移除问题和话题前缀后的数字
 								            while line and (line.startswith(('问题', '话题')) or (line and line and line[0].isdigit())):
 								                if line.startswith(('问题', '话题')):
 								                    line = line[2:] if len(line) > 2 else line
 								                elif line and line[0].isdigit():
 								                    line = line[1:] if len(line) > 1 else line
 								                line = line.strip()
 								        if in_block and line:  # 只添加非空行
 								            current_block.append(line)
 								    if current_block:
 								        blocks.append('\n'.join(current_block))
 								    return [(i + 1, block) for i, block in enumerate(blocks)]
-												'commit'

											
										
										
											4 weeks ago
+								# 保留原有的save_to_txt函数
 								def save_to_txt(content, file_path, mode='w'):
 								    """将内容保存到文本文件"""
 								    try:
 								        with open(file_path, mode, encoding='utf-8') as f:
 								            f.write(content)
 								        return True
 								    except Exception as e:
 								        print(f"保存文件{file_path}时出错: {str(e)}")
 								        return False
-												'commit'

											
										
										
											4 weeks ago
+								if __name__ == "__main__":
 								    word_document_path = "d:\\dsWork\\dsProject\\dsRag\\static\\Test\\带图的WORD文档_MATH_3.docx"
-												'commit'

											
										
										
											4 weeks ago
+								    output_dir="d:\\dsWork\\dsProject\\dsRag\\static\\Test\\"
-												'commit'

											
										
										
											4 weeks ago
+								    extract_images_from_docx(word_document_path, output_dir)
-												'commit'

											
										
										
											4 weeks ago
+								    res = read_word_content(word_document_path)
-												'commit'

											
										
										
											4 weeks ago
+								    chunks = split_into_blocks(res)
 								    for x in chunks:
-												'commit'

											
										
										
											4 weeks ago
+								        print("===段落开始：===")
 								        firstLine = x[1].split("\n")[0].strip()
 								        content = x[1][len(firstLine):].strip()
 								        print("firstLine=" + firstLine)
 								        print("content=" + content)
 								        print("===段落结束：===\n")
-												'commit'

											
										
										
											4 weeks ago
 								        saved_count=0
 								        for chunk_num, chunk in chunks:
 								            chunk = chunk.strip()  # 确保去除空白字符
 								            output_file = os.path.join(output_dir, f"{chunk_num}.txt")
 								            if save_to_txt(chunk, output_file, mode='w'):
 								                saved_count += 1
 								        print(f"处理完成，共保存{saved_count}个文件到目录: {output_dir}")