parent
88cc9c18e4
commit
0c94853adc
@ -1,19 +1,25 @@
|
|||||||
from docx import Document
|
from docx import Document
|
||||||
|
from docx.oxml.shared import qn
|
||||||
|
from docx.oxml import parse_xml
|
||||||
|
|
||||||
|
def run_has_ole_object(run):
    """
    Check whether a run contains an embedded OLE object.

    Scans the direct children of the run's underlying XML element and
    reports a match on the first child whose tag name ends with
    ``object`` or ``OLEObject``. The matching child is printed to stdout
    before returning.

    :param run: docx.text.run.Run object (its ``_r`` element is inspected)
    :return: bool -- True if a matching child is found, otherwise False
    """
    ole_suffixes = ('object', 'OLEObject')
    for child in run._r.iterchildren():
        tag = child.tag
        # lxml gives comment / processing-instruction nodes a callable
        # (non-string) tag; skip them instead of failing on ``.endswith``.
        if not isinstance(tag, str):
            continue
        if tag.endswith(ole_suffixes):
            print(str(child))
            return True
    return False
|
||||||
|
|
||||||
def extract_text_from_docx(file_path):
    """
    Print the text of every non-empty run in a Word document.

    Opens the document, collects the text of each run (in paragraph
    order) whose text is non-empty, then prints the collected entries
    numbered from 1.

    :param file_path: path of the .docx file to open
    :return: None
    """
    doc = Document(file_path)
    formulas = []

    for para in doc.paragraphs:
        for run in para.runs:
            if run.text:  # only keep runs that actually carry text
                formulas.append(run.text)

    # Print the extracted formulas
    for index, formula in enumerate(formulas):
        print(f'公式 {index + 1}: {formula}')


# Test code: report every run that carries an OLE object
doc = Document(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx')
for paragraph in doc.paragraphs:
    for run in paragraph.runs:
        if run_has_ole_object(run):
            print("Found Ole")

# The path can be replaced with the path of your own Word document.
# Raw string: the backslashes in the Windows path are not escape sequences.
extract_text_from_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
|
|
||||||
|
Loading…
Reference in new issue