parent
0bfba12fb7
commit
4a78606de0
@ -1,37 +0,0 @@
|
||||
import zipfile
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def parse_docx(docx_path):
    """Extract the text of every Office Math (OMML) formula in a .docx file.

    The document body (``word/document.xml``) is read straight from the
    .docx zip archive and searched for ``m:oMath`` elements at any depth.
    The text of each formula is printed, followed by the total count.

    :param docx_path: path to the .docx file
    :return: list with one string per formula, in document order
    :raises KeyError: if the archive has no ``word/document.xml`` member
    :raises zipfile.BadZipFile: if ``docx_path`` is not a valid zip archive
    """
    ns = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    }

    with zipfile.ZipFile(docx_path) as z:
        with z.open('word/document.xml') as f:
            root = ET.parse(f).getroot()

    formulas = []
    # './/m:oMath' matches at any depth, so formulas nested anywhere in the
    # body are found as well.
    for formula_count, o_math in enumerate(root.findall('.//m:oMath', ns), start=1):
        # Text runs of the formula, in document order.
        parts = [t.text for t in o_math.findall('.//m:t', ns) if t.text]

        # m:e elements normally hold child runs rather than text of their
        # own; whitespace-only direct text (e.g. indentation in
        # pretty-printed XML) is ignored so it does not pollute the result.
        parts.extend(
            e.text
            for e in o_math.findall('.//m:e', ns)
            if e.text and e.text.strip()
        )

        formula_text = ''.join(parts)
        formulas.append(formula_text)
        print(f"公式{formula_count}内容: {formula_text}")

    print(f"共找到{len(formulas)}个公式")
    return formulas
|
||||
|
||||
if __name__ == "__main__":
    # Run the formula extractor against the sample chemistry document.
    parse_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
|
@ -1,14 +0,0 @@
|
||||
import subprocess
|
||||
|
||||
def markdown_to_docx_with_pandoc(md_path, docx_path):
    """Convert a Markdown file to .docx by running the ``pandoc`` executable.

    :param md_path: path of the Markdown input file
    :param docx_path: path of the .docx file to write
    :raises subprocess.CalledProcessError: if pandoc exits with a non-zero
        status (the call uses ``check=True``)
    """
    pandoc_command = ['pandoc', '-s', md_path, '-o', docx_path, '--mathml']
    subprocess.run(pandoc_command, check=True)
|
||||
|
||||
# Usage example
markdown_to_docx_with_pandoc(md_path='input.md', docx_path='output.docx')
|
@ -1,33 +0,0 @@
|
||||
"""
|
||||
conda activate rag
|
||||
pip install pypandoc
|
||||
"""
|
||||
|
||||
import pypandoc
|
||||
|
||||
|
||||
def docx_to_latex(docx_path):
    """Convert a .docx file to LaTeX text with simplified math and image markup.

    The document is converted with ``pypandoc.convert_file(..., 'latex')``
    and then post-processed:

    * display math ``\\[ ... \\]`` becomes ``$$ ... $$``
    * inline math ``\\( ... \\)`` becomes ``$ ... $``
    * every ``\\includegraphics`` command is replaced by a numbered
      placeholder ``【图片N】``, counting from 1 in order of appearance

    :param docx_path: path to the .docx file
    :return: the post-processed LaTeX source as a string
    """
    import re

    latex_content = pypandoc.convert_file(docx_path, 'latex')

    # Rewrite the math delimiters. re.DOTALL lets '.' match newlines, so a
    # formula that spans several lines (or was wrapped across a line break)
    # is still converted instead of being silently left as-is.
    latex_content = re.sub(
        r'\\\[(.*?)\\\]', r'$$\1$$', latex_content, flags=re.DOTALL
    )
    latex_content = re.sub(
        r'\\\((.*?)\\\)', r'$\1$', latex_content, flags=re.DOTALL
    )

    # Replace image references with numbered placeholders.
    img_count = 1

    def replacer(match):
        nonlocal img_count
        result = f'【图片{img_count}】'
        img_count += 1
        return result

    # The [options] part is optional in LaTeX, so accept both
    # \includegraphics[...]{...} and \includegraphics{...}.
    latex_content = re.sub(
        r'\\includegraphics(?:\[.*?\])?\{.*?\}', replacer, latex_content
    )
    return latex_content
|
||||
|
||||
|
||||
latex_content = docx_to_latex('带公式的WORD文档.docx')

# Print each non-blank line with surrounding whitespace removed.
for raw_line in latex_content.split('\n'):
    stripped = raw_line.strip()
    if stripped:
        print(stripped)
|
@ -1,23 +0,0 @@
|
||||
from docx import Document
|
||||
|
||||
def run_has_ole_object(run):
    """Report whether a run contains an OLE object.

    Only the direct children of the run's underlying XML element are
    inspected. The first child whose tag ends with ``object`` or
    ``OLEObject`` is printed and ends the search.

    :param run: a docx.text.run.Run object
    :return: True if such a child element exists, otherwise False
    """
    ole_tag_suffixes = ('object', 'OLEObject')
    for node in run._r.iterchildren():
        if not node.tag.endswith(ole_tag_suffixes):
            continue
        print(str(node))
        return True
    return False
|
||||
|
||||
|
||||
# Test code: report every run of the sample document that holds an OLE object.
doc = Document(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx')
all_runs = (run for paragraph in doc.paragraphs for run in paragraph.runs)
for run in all_runs:
    if run_has_ole_object(run):
        print("Found Ole")
|
@ -1,18 +0,0 @@
|
||||
import os
|
||||
|
||||
|
||||
import subprocess

# Path to the mtef-go executable
mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
# Path to the source .docx file
sourceDocx = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
# Path of the output file
output_file = r"d:\output.txt"  # change to whatever path you need

# Build the command as an argument list rather than one concatenated string,
# so paths containing spaces or shell metacharacters are passed through intact.
command = [mtef, "-w", sourceDocx, "-o", output_file]

# check=True stops the script if the tool fails, instead of going on to print
# a missing or stale output file.
subprocess.run(command, check=True)

# Print the contents of the output file, prefixed with 1-based line numbers.
with open(output_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        print(f"{line_number}: {line.strip()}")
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue