From 0c94853adce2d99391504b091ee49a8204812db4 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Mon, 30 Jun 2025 16:42:22 +0800 Subject: [PATCH] 'commit' --- dsRag/Test/TestReadGongShi.py | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/dsRag/Test/TestReadGongShi.py b/dsRag/Test/TestReadGongShi.py index 499c48ef..4f1f0752 100644 --- a/dsRag/Test/TestReadGongShi.py +++ b/dsRag/Test/TestReadGongShi.py @@ -1,19 +1,25 @@ from docx import Document +from docx.oxml.shared import qn +from docx.oxml import parse_xml +def run_has_ole_object(run): + """ + 检查run对象是否包含OLE对象 + :param run: docx.text.run.Run对象 + :return: bool + """ + # 检查run的XML中是否包含OLE对象标签 + run_element = run._r + for child in run_element.iterchildren(): + if child.tag.endswith('object') or child.tag.endswith('OLEObject'): + print(str(child)) + return True + return False -def extract_text_from_docx(file_path): - doc = Document(file_path) - formulas = [] - for para in doc.paragraphs: - for run in para.runs: - if run.text: # 检查文本是否存在 - formulas.append(run.text) - - # 打印提取的公式 - for index, formula in enumerate(formulas): - print(f'公式 {index + 1}: {formula}') - - -# 路径可替换为您的 Word 文档路径 -extract_text_from_docx('D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx') \ No newline at end of file +# 测试代码 +doc = Document(r'D:\dsWork\dsProject\dsRag\static\Txt\化学方程式_CHEMISTRY_1.docx') +for paragraph in doc.paragraphs: + for run in paragraph.runs: + if run_has_ole_object(run): + print("Found Ole")