# dsProject/dsRag/ElasticSearch/T2_SplitTxt.py

import os
import re
import shutil
import uuid
import warnings
import xml.etree.ElementTree as ET
import zipfile

import docx
from docx import Document
from docx.oxml.ns import nsmap

# Silence the warnings emitted for unverified / insecure HTTPS connections.
_IGNORED_HTTPS_WARNING_MESSAGES = (
    'Connecting to .* using TLS with verify_certs=False is insecure',
    'Unverified HTTPS request is being made to host',
)
for _ignored_message in _IGNORED_HTTPS_WARNING_MESSAGES:
    warnings.filterwarnings('ignore', message=_ignored_message)


def extract_images_from_docx(docx_path, output_folder):
    """
    Copy the images embedded in a .docx file into ``output_folder``.

    The document is unzipped into a temporary ``temp_docx`` directory inside
    ``output_folder``; every drawing found in a paragraph run is resolved
    through ``word/_rels/document.xml.rels`` and copied under a fresh
    ``<uuid hex><original extension>`` name. The temporary directory is
    removed again even when extraction fails.

    :param docx_path: path of the Word document
    :param output_folder: directory receiving the images (created if missing)
    :return: list of the generated image file names, in the order the
             drawings are met while walking paragraphs and runs
    """
    image_data = []
    temp_dir = os.path.join(output_folder, "temp_docx")
    temp_root = os.path.abspath(temp_dir)
    word_dir = os.path.join(temp_root, 'word')

    # A crashed earlier run may have left another document's files behind;
    # start from an empty directory so they cannot be mistaken for images.
    shutil.rmtree(temp_dir, ignore_errors=True)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Map relationship id -> target. Parsing the XML (instead of searching
        # the raw text) does not depend on attribute order or file encoding.
        rels_path = os.path.join(temp_dir, 'word', '_rels', 'document.xml.rels')
        rel_targets = {
            rel.get('Id'): rel.get('Target')
            for rel in ET.parse(rels_path).getroot()
            if rel.get('TargetMode') != 'External'
        }

        doc = Document(docx_path)
        embed_attr = '{%s}embed' % nsmap['r']

        for paragraph in doc.paragraphs:
            for run in paragraph.runs:
                for element in run._element:
                    if not element.tag.endswith('drawing'):
                        continue
                    blip = element.find('.//a:blip', namespaces=nsmap)
                    if blip is None:
                        continue
                    target = rel_targets.get(blip.get(embed_attr))
                    if not target:
                        continue

                    # Targets starting with "/" are relative to the package
                    # root, all others to the directory of document.xml.
                    base_dir = temp_root if target.startswith('/') else word_dir
                    src_path = os.path.abspath(os.path.join(base_dir, target.lstrip('/')))
                    # Never follow a target that points outside the unzipped package.
                    if not src_path.startswith(temp_root + os.sep):
                        continue
                    if not os.path.isfile(src_path):
                        continue

                    ext = os.path.splitext(src_path)[1]
                    img_name = f"{uuid.uuid4().hex}{ext}"
                    shutil.copy(src_path, os.path.join(output_folder, img_name))
                    image_data.append(img_name)
    finally:
        # Always remove the unzipped copy, including on errors.
        shutil.rmtree(temp_dir, ignore_errors=True)

    return image_data


def read_word_content(docx_path):
    """
    Flatten a Word document into text, one paragraph per line.

    A paragraph containing at least one drawing is replaced by a single
    placeholder ``【图片N】`` (N counts such paragraphs from 1); any text in
    that paragraph is dropped. Other paragraphs contribute their stripped
    text, and blank paragraphs are skipped. Every emitted item is preceded
    by a newline, so a non-empty result starts with ``"\\n"``.

    :param docx_path: path of the Word document
    :return: the flattened text, or an empty string when the document
             cannot be read (the error is printed)
    """
    lines = []
    image_count = 0
    try:
        doc = docx.Document(docx_path)

        for paragraph in doc.paragraphs:
            has_image = any(
                element.tag.endswith('drawing')
                for run in paragraph.runs
                for element in run._element
            )
            if has_image:
                image_count += 1
                lines.append(f"【图片{image_count}】")
                continue
            text = paragraph.text.strip()
            if text:
                lines.append(text)
    except Exception as e:
        print(f"处理Word文档时出错: {str(e)}")
        # Return a string so callers that split the text keep working.
        return ""
    return "".join("\n" + line for line in lines)


def split_into_blocks(text):
    """
    Split text into numbered blocks at heading lines.

    A heading is a line starting with '问题' or '话题' that has a digit among
    its first five characters. The leading prefix(es) and digits are removed
    from the heading and the remainder (if any) becomes the first line of the
    new block. Lines before the first heading and empty lines are discarded.

    :param text: text to split; ``None`` or an empty string yields no blocks
    :return: list of ``(number, block_text)`` tuples, numbered from 1
    """
    if not text:
        return []

    prefixes = ('问题', '话题')
    blocks = []
    current_block = []
    in_block = False

    for line in text.splitlines():
        if line.startswith(prefixes) and any(c.isdigit() for c in line[:5]):
            if in_block:
                blocks.append('\n'.join(current_block))
                current_block = []
            in_block = True
            # Each pass removes at least one character, so the loop always
            # terminates, even for a heading such as "问题1" that reduces to
            # nothing.
            while line.startswith(prefixes) or line[:1].isdigit():
                line = (line[2:] if line.startswith(prefixes) else line[1:]).strip()
        if in_block and line:  # keep non-empty lines only
            current_block.append(line)

    if current_block:
        blocks.append('\n'.join(current_block))

    return [(i + 1, block) for i, block in enumerate(blocks)]


# 保留原有的save_to_txt函数
def save_to_txt(content, file_path, mode='w'):
    """
    Write ``content`` to ``file_path`` as UTF-8 text.

    :param content: text to write
    :param file_path: destination file
    :param mode: mode passed to ``open`` (defaults to overwrite)
    :return: True on success; False when any error occurs (it is printed)
    """
    try:
        handle = open(file_path, mode, encoding='utf-8')
        with handle:
            handle.write(content)
    except Exception as e:
        print(f"保存文件{file_path}时出错: {str(e)}")
        written = False
    else:
        written = True
    return written


def process_document(word_document_path, txt_output_dir, img_output_dir):
    """
    Split one Word document into per-block text files with image tags.

    Images are extracted to ``img_output_dir``, the document text is split
    into blocks, and each block is written to
    ``<txt_output_dir>/<subject>_<block number>.txt`` where ``<subject>`` is
    the last two ``_``-separated parts of the document's base name. In the
    block body (everything after its first line) each ``【图片N】``
    placeholder is replaced by an ``<img>`` tag for the N-th extracted image.

    :param word_document_path: path of the Word document
    :param txt_output_dir: directory receiving the text files (created if missing)
    :param img_output_dir: directory receiving the extracted images
    """
    listImage = extract_images_from_docx(word_document_path, img_output_dir)
    res = read_word_content(word_document_path)
    # A failed read yields nothing to split rather than crashing.
    chunks = split_into_blocks(res) if res else []

    os.makedirs(txt_output_dir, exist_ok=True)

    pattern = re.compile(r'【图片(\d+)】')

    def replacer(match):
        # Placeholders are numbered across the whole document, so use that
        # number directly instead of a counter restarted for every block.
        # NOTE(review): assumes the N-th placeholder corresponds to the N-th
        # extracted image (one drawing per image paragraph) - confirm.
        position = int(match.group(1)) - 1
        if 0 <= position < len(listImage):
            return f"<img src=\"./Images/{listImage[position]}\">"
        return match.group()

    # Derive the subject and number from the docx file name, e.g. CHINESE_1.
    docx_name = os.path.basename(word_document_path).split('.')[0]
    subject_part = '_'.join(docx_name.split('_')[-2:])

    saved_count = 0
    for block_number, block_text in chunks:
        first_line, _, remainder = block_text.partition("\n")
        firstLine = first_line.strip()
        content = pattern.sub(replacer, remainder.strip())

        output_file = os.path.join(txt_output_dir, f"{subject_part}_{block_number}.txt")
        full_content = f"{firstLine}\n{content}"
        if save_to_txt(full_content, output_file, mode='w'):
            saved_count += 1

    print(f"处理完成，共保存{saved_count}个文件到目录: {txt_output_dir}")


if __name__ == "__main__":
    txt_output_dir = "../Txt/"
    img_output_dir = "../static/Images/"
    source_dir = "../static/Txt/"
    # Process every .docx found in the source directory.
    for filename in sorted(os.listdir(source_dir)):
        # Skip anything that is not a Word document, and the "~$" lock files
        # Word creates next to an open document: neither is a readable zip.
        if not filename.lower().endswith(".docx") or filename.startswith("~$"):
            continue
        print("正在处理文件：" + filename)
        # process_document needs the full path of the file.
        process_document(os.path.join(source_dir, filename), txt_output_dir, img_output_dir)