parent
43b3d4f602
commit
e975510fa2
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,37 @@
|
||||
import zipfile
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def parse_docx(docx_path):
    """Extract the plain text of every Office Math formula in a .docx file.

    Reads ``word/document.xml`` from the archive, finds every ``m:oMath``
    element at any depth, and joins the ``m:t`` text runs beneath it in
    document order. Each formula and the final count are printed.

    Args:
        docx_path: Path to the .docx file to inspect.

    Returns:
        A list with one string per formula, in document order. A formula
        without any text runs yields an empty string.

    Raises:
        zipfile.BadZipFile: If the file is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
        xml.etree.ElementTree.ParseError: If the document XML is malformed.
    """
    ns = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    }

    with zipfile.ZipFile(docx_path) as z:
        with z.open('word/document.xml') as f:
            root = ET.parse(f).getroot()

    # Find all formulas (both display and inline ones).
    formulas = []
    for formula_count, oMath in enumerate(root.findall('.//m:oMath', ns), 1):
        # Only m:t elements carry formula text. Structural containers such
        # as m:e hold child runs, so their own .text is at most layout
        # whitespace; the m:t runs nested inside them are already matched
        # by the descendant search below, in the right order.
        formula_text = ''.join(
            t.text for t in oMath.findall('.//m:t', ns) if t.text
        )
        formulas.append(formula_text)
        print(f"公式{formula_count}内容: {formula_text}")

    print(f"共找到{len(formulas)}个公式")
    return formulas
|
||||
|
||||
if __name__ == "__main__":
    # Run against the sample chemistry-equations document.
    parse_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
|
@ -0,0 +1,19 @@
|
||||
from docx import Document
|
||||
|
||||
|
||||
def extract_text_from_docx(file_path):
    """Print the text of every non-empty run found in the document.

    Opens ``file_path`` with ``Document``, walks ``doc.paragraphs`` and each
    paragraph's ``runs``, keeps the runs whose text is non-empty, and prints
    them as a 1-based numbered list. Nothing is returned.

    Args:
        file_path: Path of the Word document to read.
    """
    doc = Document(file_path)

    # Keep only runs that actually contain text.
    formulas = [
        run.text
        for para in doc.paragraphs
        for run in para.runs
        if run.text
    ]

    # Print the collected entries.
    for index, formula in enumerate(formulas):
        print(f'公式 {index + 1}: {formula}')
|
||||
|
||||
|
||||
# Replace this with the path to your own Word document.
# A raw string keeps the Windows backslashes literal (sequences such as "\d"
# are invalid escapes), and the guard stops the extraction from running when
# this module is merely imported.
if __name__ == '__main__':
    extract_text_from_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
|
@ -0,0 +1,37 @@
|
||||
import win32com.client
|
||||
import re
|
||||
|
||||
def convert_to_latex(formula_xml):
    """Reduce a formula's XML to its text content wrapped in ``$...$``.

    This is a simple conversion rule, not a real OMML-to-LaTeX translation:
    every XML tag is removed, character entities in the remaining text are
    decoded, and the result is wrapped in inline-math dollar signs. Real
    use may need more elaborate handling.

    Args:
        formula_xml: XML markup of the formula, as a string.

    Returns:
        The tag-free, entity-decoded text surrounded by ``$`` delimiters.
        An empty input yields ``"$$"``.
    """
    import html  # local import: the module header only provides ``re``

    # Strip the XML tags first, so that a decoded "<" or ">" coming from the
    # text itself can never be mistaken for markup.
    text = re.sub(r'<[^>]+>', '', formula_xml)
    # Decode entities such as &lt;, &gt; and &amp; left in the text nodes.
    text = html.unescape(text)
    return f"${text}$"
|
||||
|
||||
def convert_math_type_to_word_formula(doc_path):
    """Print a rough LaTeX rendering of every OMath object in a Word file.

    Starts Word through COM, opens ``doc_path``, and for each entry of
    ``doc.OMaths`` passes ``Range.WordOpenXML`` to ``convert_to_latex``.
    Each result is printed truncated to 100 characters as it is produced,
    and the full results are printed again at the end. A formula that fails
    to convert is reported and skipped. The document is closed without
    saving and Word is quit even if an error occurs part-way through.

    Args:
        doc_path: Path of the Word document to open.

    Returns:
        The list of converted formula strings, in document order.
    """
    word = win32com.client.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(doc_path)
        try:
            latex_formulas = []

            for formula_count, omath in enumerate(doc.OMaths, 1):
                try:
                    formula_xml = omath.Range.WordOpenXML
                    if formula_xml:
                        latex = convert_to_latex(formula_xml)
                        latex_formulas.append(latex)
                        print(f"公式{formula_count} LaTeX: {latex[:100]}...")
                except Exception as e:
                    # Best effort: report the failing formula and carry on
                    # with the remaining ones.
                    print(f"公式{formula_count}转换错误: {str(e)}")

            print("\n最终LaTeX输出:")
            for i, latex in enumerate(latex_formulas, 1):
                print(f"公式{i}: {latex}")

            return latex_formulas
        finally:
            # False: close without saving changes.
            doc.Close(False)
    finally:
        # Always shut the Word instance down so that no background process
        # is left behind when opening or converting fails.
        word.Quit()
|
||||
|
||||
if __name__ == '__main__':
    # Sample chemistry-equations document used for a manual run.
    sample_doc = r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx'
    convert_math_type_to_word_formula(sample_doc)
|
Binary file not shown.
@ -0,0 +1 @@
|
||||
:氢气与氧气燃烧的方程式
|
@ -0,0 +1 @@
|
||||
:硝酸光照分解的方程式
|
@ -0,0 +1 @@
|
||||
Subproject commit 5ede23b5d20dbd12d04f5142828e7f6386763454
|
Binary file not shown.
Loading…
Reference in new issue