'commit'

3 weeks ago · 4a78606de0
parent 0bfba12fb7
commit 4a78606de0
7 changed files with 0 additions and 125 deletions
--- a/dsRag/Test/T2_SplitTxtTest.py
+++ b/dsRag/Test/T2_SplitTxtTest.py
@ -1,37 +0,0 @@
 import zipfile
 import xml.etree.ElementTree as ET
 def parse_docx(docx_path):
    """Print every Office Math formula found in a .docx file.

    Opens the archive at ``docx_path``, parses ``word/document.xml`` and, for
    each ``m:oMath`` element, prints the element object followed by a line
    holding the concatenated text of its ``m:t`` descendants and then of its
    ``m:e`` descendants. A final line reports the number of formulas found.
    Nothing is returned.
    """
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math'
    }
    with zipfile.ZipFile(docx_path) as archive:
        with archive.open('word/document.xml') as xml_stream:
            document_root = ET.parse(xml_stream).getroot()
            # Collect all formulas (both floating and inline ones).
            formulas = document_root.findall('.//m:oMath', namespaces)
            for index, formula in enumerate(formulas, start=1):
                print(formula)
                # Text nodes of the formula come first ...
                text_runs = [
                    node.text
                    for node in formula.findall('.//m:t', namespaces)
                    if node.text
                ]
                # ... followed by any text sitting directly in m:e elements.
                element_texts = [
                    node.text
                    for node in formula.findall('.//m:e', namespaces)
                    if node.text
                ]
                formula_text = ''.join(text_runs + element_texts)
                print(f"公式{index}内容: {formula_text}")
            print(f"共找到{len(formulas)}个公式")
 if __name__ == "__main__":
    # Manual run against a sample document at a hard-coded local path.
    parse_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
--- a/dsRag/Test/TestConvertHmtlToDocx.py
+++ b/dsRag/Test/TestConvertHmtlToDocx.py
@ -1,14 +0,0 @@
 import subprocess
 def markdown_to_docx_with_pandoc(md_path, docx_path):
    """Convert the file at ``md_path`` to ``docx_path`` by invoking pandoc.

    Runs ``pandoc -s <md_path> -o <docx_path> --mathml`` as a subprocess.
    Because ``check=True`` is passed, ``subprocess.CalledProcessError`` is
    raised when pandoc exits with a non-zero status.
    """
    pandoc_command = ['pandoc', '-s', md_path, '-o', docx_path, '--mathml']
    subprocess.run(pandoc_command, check=True)
 # Usage example: this call sits at module level, so it runs whenever the
 # file is executed or imported.
 markdown_to_docx_with_pandoc('input.md', 'output.docx')
--- a/dsRag/Test/TestGongShi.py
+++ b/dsRag/Test/TestGongShi.py
@ -1,33 +0,0 @@
 """
 conda activate rag
 pip install pypandoc
 """
 import pypandoc
 def docx_to_latex(docx_path):
    r"""Convert a .docx file to LaTeX text with rewritten math and image markers.

    The document is converted with ``pypandoc.convert_file(docx_path, 'latex')``.
    In the converted text, ``\[...\]`` becomes ``$$...$$``, ``\(...\)``
    becomes ``$...$``, and every ``\includegraphics[...]{...}`` command is
    replaced by a numbered placeholder ``【图片N】``, where N starts at 1 and
    follows the order of appearance.

    Returns the resulting string.
    """
    import re

    display_math = re.compile(r'\\\[(.*?)\\\]')
    inline_math = re.compile(r'\\\((.*?)\\\)')
    graphics = re.compile(r'\\includegraphics\[.*?\]\{.*?\}')

    converted = pypandoc.convert_file(docx_path, 'latex')
    # Rewrite the math delimiters.
    converted = display_math.sub(r'$$\1$$', converted)
    converted = inline_math.sub(r'$\1$', converted)

    # Swap each image command for a 【图片X】 placeholder by splicing the
    # text between consecutive matches back together.
    pieces = []
    cursor = 0
    for number, found in enumerate(graphics.finditer(converted), start=1):
        pieces.append(converted[cursor:found.start()])
        pieces.append(f'【图片{number}】')
        cursor = found.end()
    pieces.append(converted[cursor:])
    return ''.join(pieces)
 latex_content = docx_to_latex('带公式的WORD文档.docx')
 # Print every non-blank line with its surrounding whitespace removed
 for line in latex_content.split('\n'):
    stripped = line.strip()
    if stripped:
        print(stripped)
--- a/dsRag/Test/TestReadGongShi.py
+++ b/dsRag/Test/TestReadGongShi.py
@ -1,23 +0,0 @@
 from docx import Document
 def run_has_ole_object(run):
    """
    Report whether a run contains an OLE object.

    Only the direct children of the run's underlying XML element
    (``run._r``) are inspected. The first child whose tag ends with
    ``object`` or ``OLEObject`` is printed and ``True`` is returned.

    :param run: docx.text.run.Run object
    :return: bool -- True if such a child exists, otherwise False
    """
    ole_suffixes = ('object', 'OLEObject')
    first_hit = next(
        (node for node in run._r.iterchildren() if node.tag.endswith(ole_suffixes)),
        None,
    )
    # Compare against None explicitly; the truth value of an XML element
    # is not a reliable "found" signal.
    if first_hit is None:
        return False
    print(str(first_hit))
    return True
 # Test code: walk every run of every paragraph and report OLE objects
 doc = Document(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx')
 all_runs = (run for paragraph in doc.paragraphs for run in paragraph.runs)
 for run in all_runs:
    if not run_has_ole_object(run):
        continue
    print("Found Ole")
--- a/dsRag/Test/TestReadMathType.py
+++ b/dsRag/Test/TestReadMathType.py
@ -1,18 +0,0 @@
 import os
 import subprocess
 # Path to the converter executable
 mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
 # Path to the source document
 sourceDocx = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
 # Path to the output file
 output_file = r"d:\output.txt"  # change this to whatever path you need
 # Build the command as an argument list rather than one concatenated shell
 # string, so paths containing spaces are passed through intact.
 command = [mtef, "-w", sourceDocx, "-o", output_file]
 # check=True raises CalledProcessError if the converter exits with a
 # non-zero status, instead of going on to read a missing or stale
 # output file left over from an earlier run.
 subprocess.run(command, check=True)
 # Print the content of the output file, prefixing each line with its number
 with open(output_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        print(f"{line_number}: {line.strip()}")
--- a/dsRag/Test/带公式的WORD文档.docx
+++ b/dsRag/Test/带公式的WORD文档.docx
--- a/dsRag/Test/带图的WORD文档.docx
+++ b/dsRag/Test/带图的WORD文档.docx