parent
0bfba12fb7
commit
4a78606de0
@ -1,37 +0,0 @@
|
|||||||
import zipfile
|
|
||||||
import xml.etree.ElementTree as ET
|
|
||||||
|
|
||||||
def parse_docx(docx_path):
    """Print and return the text of every Office Math formula in a .docx file.

    The archive at ``docx_path`` is opened as a zip file, ``word/document.xml``
    is parsed, and every ``m:oMath`` element found anywhere in the tree is
    visited. For each formula the text of all nested ``m:t`` elements is
    concatenated, followed by the direct text of all nested ``m:e`` elements.
    One line per formula and a final total are printed.

    :param docx_path: path of the .docx file to inspect.
    :return: list with one string per formula, in the order they were found.
    :raises KeyError: if the archive has no ``word/document.xml`` member.
    """
    ns = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    }

    with zipfile.ZipFile(docx_path) as z:
        with z.open('word/document.xml') as f:
            root = ET.parse(f).getroot()

    # Find every formula, whether inline or in its own paragraph.
    formulas = []
    for formula_count, oMath in enumerate(root.findall('.//m:oMath', ns), start=1):
        # Text runs of the formula.
        parts = [t.text for t in oMath.findall('.//m:t', ns) if t.text]
        # Direct text of m:e elements, appended after the m:t text exactly as
        # before. NOTE(review): m:e looks like a container whose own text is
        # normally empty -- confirm whether this pass is still needed.
        parts.extend(e.text for e in oMath.findall('.//m:e', ns) if e.text)

        formula_text = ''.join(parts)
        formulas.append(formula_text)
        print(f"公式{formula_count}内容: {formula_text}")

    print(f"共找到{len(formulas)}个公式")
    return formulas
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Run the formula dump against the sample chemistry document.
    parse_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
|
|
@ -1,14 +0,0 @@
|
|||||||
import subprocess
|
|
||||||
|
|
||||||
def markdown_to_docx_with_pandoc(md_path, docx_path):
    """Convert a Markdown file to .docx by running the ``pandoc`` executable.

    Executes ``pandoc -s <md_path> -o <docx_path> --mathml``. Because the
    call uses ``check=True``, a non-zero exit status raises
    ``subprocess.CalledProcessError``.

    :param md_path: path of the Markdown input file.
    :param docx_path: path of the .docx file to write.
    """
    pandoc_args = ['pandoc', '-s', md_path, '-o', docx_path, '--mathml']
    subprocess.run(pandoc_args, check=True)
|
|
||||||
|
|
||||||
# Usage example: convert input.md in the working directory to output.docx.
markdown_to_docx_with_pandoc(md_path='input.md', docx_path='output.docx')
|
|
@ -1,33 +0,0 @@
|
|||||||
"""
|
|
||||||
conda activate rag
|
|
||||||
pip install pypandoc
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pypandoc
|
|
||||||
|
|
||||||
|
|
||||||
def docx_to_latex(docx_path):
    """Convert a .docx file to LaTeX text with Markdown-style math delimiters.

    The document is converted with ``pypandoc.convert_file(docx_path,
    'latex')`` and the result is post-processed:

    * ``\\[ ... \\]`` becomes ``$$ ... $$`` and ``\\( ... \\)`` becomes
      ``$ ... $``;
    * every ``\\includegraphics`` command is replaced by a numbered
      placeholder ``【图片N】``, counting from 1 in order of appearance.

    :param docx_path: path of the .docx file to convert.
    :return: the post-processed LaTeX text.
    """
    import itertools
    import re

    latex_content = pypandoc.convert_file(docx_path, 'latex')

    # Rewrite the math delimiters. DOTALL lets "." cross line breaks, so a
    # formula that spans several lines is converted as well.
    latex_content = re.sub(
        r'\\\[(.*?)\\\]', r'$$\1$$', latex_content, flags=re.DOTALL)
    latex_content = re.sub(
        r'\\\((.*?)\\\)', r'$\1$', latex_content, flags=re.DOTALL)

    # Replace each image with a numbered placeholder. The "[options]" part is
    # optional so that a bare \includegraphics{path} is replaced too.
    image_numbers = itertools.count(1)
    latex_content = re.sub(
        r'\\includegraphics(?:\[.*?\])?\{.*?\}',
        lambda _match: f'【图片{next(image_numbers)}】',
        latex_content,
    )
    return latex_content
|
|
||||||
|
|
||||||
|
|
||||||
latex_content = docx_to_latex('带公式的WORD文档.docx')

# Print every non-blank line of the result with surrounding whitespace removed.
for line in latex_content.split('\n'):
    stripped = line.strip()
    if stripped:
        print(stripped)
|
|
@ -1,23 +0,0 @@
|
|||||||
from docx import Document
|
|
||||||
|
|
||||||
def run_has_ole_object(run):
    """Return True when a run directly contains an OLE object element.

    Only the direct children of the run's underlying XML element
    (``run._r``) are inspected. A child counts as an OLE object when the
    local part of its tag (the text after the ``{namespace}`` prefix) is
    exactly ``object`` or ``OLEObject``.

    :param run: a ``docx.text.run.Run`` object.
    :return: bool
    """
    ole_local_names = ('object', 'OLEObject')
    for child in run._r.iterchildren():
        tag = child.tag
        # Skip nodes whose tag is not a string instead of failing on them.
        if not isinstance(tag, str):
            continue
        # Compare the whole local name; a suffix test would also accept any
        # unrelated tag that merely ends in "object".
        if tag.rsplit('}', 1)[-1] in ole_local_names:
            return True
    return False
|
|
||||||
|
|
||||||
|
|
||||||
# Test code: report every run of the sample document that holds an OLE object.
doc = Document(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx')
all_runs = (run for paragraph in doc.paragraphs for run in paragraph.runs)
for run in all_runs:
    if run_has_ole_object(run):
        print("Found Ole")
|
|
@ -1,18 +0,0 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
import subprocess

# Path of the mtef-go executable.
mtef = r'D:\dsWork\dsProject\dsRag\mtef-go-3\mtef-go.exe'
# Path of the source document.
sourceDocx = r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx'
# Path of the output file.
output_file = r"d:\output.txt"  # change to whatever path you need

# Build the command as an argument list so that paths containing spaces are
# passed intact, and stop on a non-zero exit status instead of going on to
# read an output file the tool did not write.
command = [mtef, "-w", sourceDocx, "-o", output_file]
subprocess.run(command, check=True)

# Print the contents of the output file, prefixed with 1-based line numbers.
with open(output_file, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file, start=1):
        print(f"{i}: {line.strip()}")
|
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue