diff --git a/dsRag/Test/T2_SplitTxtTest.py b/dsRag/Test/T2_SplitTxtTest.py deleted file mode 100644 index 6d031ef1..00000000 --- a/dsRag/Test/T2_SplitTxtTest.py +++ /dev/null @@ -1,37 +0,0 @@ -import zipfile -import xml.etree.ElementTree as ET - -def parse_docx(docx_path): - with zipfile.ZipFile(docx_path) as z: - with z.open('word/document.xml') as f: - tree = ET.parse(f) - root = tree.getroot() - ns = { - 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', - 'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math' - } - - # 查找所有公式(包括浮动和内联公式) - formula_count = 0 - for oMath in root.findall('.//m:oMath', ns): - - print(oMath) - formula_count += 1 - formula_text = '' - # 处理公式中的文本节点 - for t in oMath.findall('.//m:t', ns): - if t.text: - formula_text += t.text - - # 处理公式中的特殊符号 - for e in oMath.findall('.//m:e', ns): - if e.text: - formula_text += e.text - - print(f"公式{formula_count}内容: {formula_text}") - - print(f"共找到{formula_count}个公式") - -if __name__ == "__main__": - docx_path = r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx' - parse_docx(docx_path) \ No newline at end of file diff --git a/dsRag/Test/TestConvertHmtlToDocx.py b/dsRag/Test/TestConvertHmtlToDocx.py deleted file mode 100644 index fc29604c..00000000 --- a/dsRag/Test/TestConvertHmtlToDocx.py +++ /dev/null @@ -1,14 +0,0 @@ -import subprocess - -def markdown_to_docx_with_pandoc(md_path, docx_path): - subprocess.run([ - 'pandoc', - '-s', - md_path, - '-o', - docx_path, - '--mathml' - ], check=True) - -# 使用示例 -markdown_to_docx_with_pandoc('input.md', 'output.docx') \ No newline at end of file diff --git a/dsRag/Test/TestGongShi.py b/dsRag/Test/TestGongShi.py deleted file mode 100644 index 3a524368..00000000 --- a/dsRag/Test/TestGongShi.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -conda activate rag -pip install pypandoc -""" - -import pypandoc - - -def docx_to_latex(docx_path): - latex_content = pypandoc.convert_file(docx_path, 'latex') - import re - # 替换公式格式 - latex_content = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', latex_content) - latex_content = re.sub(r'\\\((.*?)\\\)', r'$\1$', latex_content) - # 替换图片路径为【图片X】格式 - img_count = 1 - - def replacer(match): - nonlocal img_count - result = f'【图片{img_count}】' - img_count += 1 - return result - - latex_content = re.sub(r'\\includegraphics\[.*?\]\{.*?\}', replacer, latex_content) - return latex_content - - -latex_content = docx_to_latex('带公式的WORD文档.docx') - -# 遍历字符串的每一行 -for line in latex_content.split('\n'): - if len(line.strip()) > 0: - print(line.strip()) diff --git a/dsRag/Test/TestReadGongShi.py b/dsRag/Test/TestReadGongShi.py deleted file mode 100644 index 658f9b81..00000000 --- a/dsRag/Test/TestReadGongShi.py +++ /dev/null @@ -1,23 +0,0 @@ -from docx import Document - -def run_has_ole_object(run): - """ - 检查run对象是否包含OLE对象 - :param run: docx.text.run.Run对象 - :return: bool - """ - # 检查run的XML中是否包含OLE对象标签 - run_element = run._r - for child in run_element.iterchildren(): - if child.tag.endswith('object') or child.tag.endswith('OLEObject'): - print(str(child)) - return True - return False - - -# 测试代码 -doc = Document(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx') -for paragraph in doc.paragraphs: - for run in paragraph.runs: - if run_has_ole_object(run): - print("Found Ole") diff --git a/dsRag/Test/TestReadMathType.py b/dsRag/Test/TestReadMathType.py deleted file mode 100644 index 5fa5c3c5..00000000 --- a/dsRag/Test/TestReadMathType.py +++ /dev/null @@ -1,18 +0,0 @@ -import os - - -# 可执行文件路径 -mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe' -# 源文件路径 -sourceDocx = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx' -# 输出文件路径 -output_file = r"d:\output.txt" # 可修改为您需要的路径 - -# 构建命令 -command = mtef + r" -w " + sourceDocx + " -o " + output_file -os.system(command) -# 把output.txt里的内容打印出来看看 -# 加上行号 -with open(output_file, 'r', encoding='utf-8') as file: - for i, line in enumerate(file): - print(f"{i+1}: {line.strip()}") diff --git a/dsRag/Test/带公式的WORD文档.docx b/dsRag/Test/带公式的WORD文档.docx deleted file mode 100644 index 3950ef6e..00000000 Binary files a/dsRag/Test/带公式的WORD文档.docx and /dev/null differ diff --git a/dsRag/Test/带图的WORD文档.docx b/dsRag/Test/带图的WORD文档.docx deleted file mode 100644 index 4c081e2c..00000000 Binary files a/dsRag/Test/带图的WORD文档.docx and /dev/null differ