parent
43b3d4f602
commit
e975510fa2
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,37 @@
|
|||||||
|
import zipfile
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
def parse_docx(docx_path):
    """Extract the plain text of every Office Math formula in a .docx file.

    Reads ``word/document.xml`` from the archive, locates every ``m:oMath``
    element (inline or display) and concatenates the ``m:t`` text nodes
    found beneath it, in document order.  Each formula and the final count
    are printed.

    Args:
        docx_path: Path to the .docx file (a zip archive).

    Returns:
        A list with one string per formula, in document order.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        zipfile.BadZipFile: If ``docx_path`` is not a valid zip archive.
        xml.etree.ElementTree.ParseError: If the document XML is malformed.
    """
    ns = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    }

    # The tree is fully parsed in memory, so the archive can be closed
    # before the formulas are walked.
    with zipfile.ZipFile(docx_path) as archive:
        with archive.open('word/document.xml') as document_xml:
            root = ET.parse(document_xml).getroot()

    formulas = []
    for o_math in root.findall('.//m:oMath', ns):
        # Only m:t carries text.  m:e elements are structural containers
        # whose own .text is at most inter-tag whitespace, and any m:t
        # nested inside them is already matched by the './/m:t' search.
        formula_text = ''.join(
            node.text for node in o_math.findall('.//m:t', ns) if node.text
        )
        formulas.append(formula_text)
        print(f"公式{len(formulas)}内容: {formula_text}")

    print(f"共找到{len(formulas)}个公式")
    return formulas
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual run against a local sample document.
    sample_docx = r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx'
    parse_docx(sample_docx)
|
@ -0,0 +1,19 @@
|
|||||||
|
from docx import Document
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_from_docx(file_path):
    """Print the text of every non-empty run in a Word document.

    Opens ``file_path`` with ``Document``, walks each paragraph's runs,
    keeps the runs whose text is non-empty, and prints them numbered from 1
    using the "公式 N:" label.

    Args:
        file_path: Path of the document handed to ``Document``.
    """
    document = Document(file_path)

    run_texts = []
    for paragraph in document.paragraphs:
        # Skip runs with no text.
        run_texts.extend(run.text for run in paragraph.runs if run.text)

    # Print what was collected, one numbered line per run.
    for number, text in enumerate(run_texts, start=1):
        print(f'公式 {number}: {text}')
|
||||||
|
|
||||||
|
|
||||||
|
# The path below can be replaced with the path to your own Word document.
# It is a raw string so the Windows backslashes are never read as escape
# sequences (the non-raw form relied on invalid escapes such as "\d").
extract_text_from_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
|
@ -0,0 +1,37 @@
|
|||||||
|
import win32com.client
|
||||||
|
import re
|
||||||
|
|
||||||
|
def convert_to_latex(formula_xml):
    """Reduce a formula's XML to its text content wrapped in ``$...$``.

    This is a deliberately simple conversion: every XML tag is removed and
    the remaining character data has its predefined XML entities decoded.
    It does not translate math structure (fractions, scripts, ...) into
    LaTeX commands; real-world use would need richer handling.

    Args:
        formula_xml: XML string describing one formula.

    Returns:
        The tag-free, entity-decoded text surrounded by dollar signs.
    """
    # Strip tags first so that decoded '<' / '>' characters cannot be
    # mistaken for markup.
    text = re.sub(r'<[^>]+>', '', formula_xml)

    # Decode the predefined XML entities.  '&amp;' goes last so an escaped
    # ampersand such as '&amp;lt;' is not decoded twice.
    for entity, char in (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&apos;', "'"),
        ('&amp;', '&'),
    ):
        text = text.replace(entity, char)

    return f"${text}$"
|
||||||
|
|
||||||
|
def convert_math_type_to_word_formula(doc_path):
    """Convert every equation in a Word document via ``convert_to_latex``.

    Starts Word through COM, opens ``doc_path``, and for each item in the
    document's ``OMaths`` collection reads ``Range.WordOpenXML`` and passes
    it to ``convert_to_latex``.  Progress and the final list are printed.
    A failure on one formula is reported and does not stop the others.

    Args:
        doc_path: Path of the document to open in Word.

    Returns:
        The list of converted formula strings, in document order.
    """
    word = win32com.client.Dispatch("Word.Application")
    doc = None
    latex_formulas = []

    try:
        doc = word.Documents.Open(doc_path)

        for formula_count, omath in enumerate(doc.OMaths, 1):
            try:
                formula_xml = omath.Range.WordOpenXML
                if formula_xml:
                    latex = convert_to_latex(formula_xml)
                    latex_formulas.append(latex)
                    # Only a 100-character preview is shown here; the full
                    # strings are printed after the loop.
                    print(f"公式{formula_count} LaTeX: {latex[:100]}...")
            except Exception as e:
                # Best effort: report the failing formula and carry on.
                print(f"公式{formula_count}转换错误: {str(e)}")

        print("\n最终LaTeX输出:")
        for i, latex in enumerate(latex_formulas, 1):
            print(f"公式{i}: {latex}")
    finally:
        # Always release the document and the Word process, even when
        # opening or iterating fails; otherwise Word is left running.
        if doc is not None:
            doc.Close(False)
        word.Quit()

    return latex_formulas
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual run against a local sample document.
    sample_docx = r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx'
    convert_math_type_to_word_formula(sample_docx)
|
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
:氢气与氧气燃烧的方程式
|
@ -0,0 +1 @@
|
|||||||
|
:硝酸光照分解的方程式
|
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 5ede23b5d20dbd12d04f5142828e7f6386763454
|
Binary file not shown.
Loading…
Reference in new issue