main
HuangHai 3 weeks ago
parent 0bfba12fb7
commit 4a78606de0

@ -1,37 +0,0 @@
import zipfile
import xml.etree.ElementTree as ET
def parse_docx(docx_path):
    """Print and return the text of every Office Math formula in a .docx file.

    Reads ``word/document.xml`` from the archive, locates every ``m:oMath``
    element (inline or nested in a display paragraph) and joins the text of
    its ``m:t`` descendants in document order.

    Args:
        docx_path: Path to the .docx file (anything ``zipfile.ZipFile``
            accepts).

    Returns:
        A list with one string per formula, in document order.

    Raises:
        zipfile.BadZipFile: If the file is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
        xml.etree.ElementTree.ParseError: If the document XML is malformed.
    """
    with zipfile.ZipFile(docx_path) as z:
        with z.open('word/document.xml') as f:
            root = ET.parse(f).getroot()

    ns = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math'
    }

    # Find all formulas (both display and inline ones)
    formulas = []
    for formula_count, oMath in enumerate(root.findall('.//m:oMath', ns), start=1):
        # Only m:t nodes carry formula text. Container elements such as m:e
        # hold nothing but inter-element whitespace of their own, and their
        # content is already reached through the descendant m:t search.
        formula_text = ''.join(
            t.text for t in oMath.iterfind('.//m:t', ns) if t.text
        )
        formulas.append(formula_text)
        print(f"公式{formula_count}内容: {formula_text}")

    print(f"共找到{len(formulas)}个公式")
    return formulas
if __name__ == "__main__":
    import sys

    # Sample document used when no path is given; pass a .docx path as the
    # first command-line argument to parse a different file.
    default_docx_path = r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx'
    docx_path = sys.argv[1] if len(sys.argv) > 1 else default_docx_path
    parse_docx(docx_path)

@ -1,14 +0,0 @@
import subprocess
def markdown_to_docx_with_pandoc(md_path, docx_path):
    """Convert a Markdown file to .docx by invoking the ``pandoc`` executable.

    Args:
        md_path: Input Markdown file handed to pandoc.
        docx_path: Output file, passed to pandoc's ``-o`` option.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status
            (``check=True``).
    """
    # NOTE(review): pandoc documents --mathml for HTML-family outputs; confirm
    # whether it has any effect when the target is docx.
    pandoc_command = ['pandoc', '-s', md_path, '-o', docx_path, '--mathml']
    subprocess.run(pandoc_command, check=True)
# Usage example: convert input.md in the working directory to output.docx
markdown_to_docx_with_pandoc(md_path='input.md', docx_path='output.docx')

@ -1,33 +0,0 @@
"""
conda activate rag
pip install pypandoc
"""
import pypandoc
def docx_to_latex(docx_path):
    """Convert a .docx file to LaTeX text with dollar-delimited math and image markers.

    The document is converted with ``pypandoc.convert_file(..., 'latex')`` and
    then post-processed:

    * ``\\[ ... \\]`` display math becomes ``$$ ... $$``
    * ``\\( ... \\)`` inline math becomes ``$ ... $``
    * every ``\\includegraphics`` command becomes a numbered 【图片N】 marker,
      counted from 1 in order of appearance

    Args:
        docx_path: Path of the .docx file to convert.

    Returns:
        The post-processed LaTeX source as a single string.
    """
    import re

    latex_content = pypandoc.convert_file(docx_path, 'latex')

    # Rewrite math delimiters. DOTALL lets a formula span several lines
    # (e.g. matrix/array environments); the lookbehind keeps a "\\[" or "\\("
    # (LaTeX line break followed by a bracket) from being read as an opener.
    latex_content = re.sub(
        r'(?<!\\)\\\[(.*?)\\\]', r'$$\1$$', latex_content, flags=re.DOTALL
    )
    latex_content = re.sub(
        r'(?<!\\)\\\((.*?)\\\)', r'$\1$', latex_content, flags=re.DOTALL
    )

    # Replace image references with 【图片X】 markers
    img_count = 0

    def replacer(match):
        nonlocal img_count
        img_count += 1
        # The marker needs its closing bracket to match the 【图片X】 format.
        return f'【图片{img_count}】'

    # The [options] part of \includegraphics is optional in LaTeX, so it is
    # optional in the pattern as well.
    latex_content = re.sub(
        r'\\includegraphics(?:\[.*?\])?\{.*?\}', replacer, latex_content
    )
    return latex_content
latex_content = docx_to_latex('带公式的WORD文档.docx')
# Print every non-blank line with its surrounding whitespace removed
for raw_line in latex_content.split('\n'):
    stripped_line = raw_line.strip()
    if not stripped_line:
        continue
    print(stripped_line)

@ -1,23 +0,0 @@
from docx import Document
def run_has_ole_object(run):
    """Check whether a run directly contains an embedded-object element.

    Only the immediate children of the run's XML element (``run._r``) are
    inspected. The first child whose tag ends with ``object`` or
    ``OLEObject`` is printed and ends the search.

    Args:
        run: A ``docx.text.run.Run`` (any object whose ``_r`` attribute
            provides ``iterchildren()`` works).

    Returns:
        True if a matching child element is found, otherwise False.
    """
    for child in run._r.iterchildren():
        tag = child.tag
        # Non-element nodes (e.g. XML comments in lxml) carry a tag that is
        # not a string; skip them instead of failing on ``.endswith``.
        if isinstance(tag, str) and tag.endswith(('object', 'OLEObject')):
            print(str(child))
            return True
    return False
# Smoke test: walk every run of the sample document and report embedded objects
doc = Document(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx')
for paragraph in doc.paragraphs:
    for run in paragraph.runs:
        if not run_has_ole_object(run):
            continue
        print("Found Ole")

@ -1,18 +0,0 @@
import os
import subprocess

# Path to the mtef-go executable
mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
# Path to the source document
sourceDocx = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
# Path of the output file
output_file = r"d:\output.txt"  # change to whatever path you need

# Build the command as an argument list so paths containing spaces survive
# intact, and fail loudly (check=True) instead of going on to read a stale or
# missing output file when the tool exits with an error.
command = [mtef, "-w", sourceDocx, "-o", output_file]
subprocess.run(command, check=True)

# Print the contents of the output file, prefixed with line numbers
with open(output_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        print(f"{line_number}: {line.strip()}")
Loading…
Cancel
Save