diff --git a/dsRag/MathType/MathType-win-zh-7.8.2.441.exe b/dsRag/MathType/MathType-win-zh-7.8.2.441.exe
new file mode 100644
index 00000000..8bd59d2c
Binary files /dev/null and b/dsRag/MathType/MathType-win-zh-7.8.2.441.exe differ
diff --git a/dsRag/MathType/MathType_v7.x_Patch.exe b/dsRag/MathType/MathType_v7.x_Patch.exe
new file mode 100644
index 00000000..e3f0de7d
Binary files /dev/null and b/dsRag/MathType/MathType_v7.x_Patch.exe differ
diff --git a/dsRag/MathType/录入化学方程式的办法.md b/dsRag/MathType/录入化学方程式的办法.md
new file mode 100644
index 00000000..7285b0dc
--- /dev/null
+++ b/dsRag/MathType/录入化学方程式的办法.md
@@ -0,0 +1,32 @@
+### 一、安装$MathType$ $7.8 $
+
+由于Word中的公式编辑器在编辑数学、化学、生物等公式时,并不是特别方便,所以可以采用使用插件$MathType$的方式来快速录入$Word$中的公式。
+
+安装、破解、使用,不再赘述。
+
+
+
+### 二、手动调整公式格式
+
+但由于格式的差别,其它软件并不能正确读取$MathType$制作的公式,为了让其它软件也能正确读取$Word$中$MathType$制作的公式,还需要一些手动的调整办法。
+
+![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/%7Byear%7D/%7Bmonth%7D/%7Bmd5%7D.%7BextName%7D/20250630104228950.png)
+
+![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/%7Byear%7D/%7Bmonth%7D/%7Bmd5%7D.%7BextName%7D/20250630104322174.png)
+
+- 鼠标左键点击,然后Ctrl+C 复制到内存
+
+![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/%7Byear%7D/%7Bmonth%7D/%7Bmd5%7D.%7BextName%7D/20250630104444709.png)
+
+- Ctrl+V 即可
+
+![](https://dsideal.obs.cn-north-1.myhuaweicloud.com/HuangHai/BlogImages/%7Byear%7D/%7Bmonth%7D/%7Bmd5%7D.%7BextName%7D/20250630104536077.png)
+
+### 三、测试一下
+
+````cmd
+pandoc -f docx -t markdown --extract-media ./images -o c:/1.md D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx
+````
+
+
+
diff --git a/dsRag/MathType/激活MathType.docx b/dsRag/MathType/激活MathType.docx
new file mode 100644
index 00000000..43ed5fa8
Binary files /dev/null and b/dsRag/MathType/激活MathType.docx differ
diff --git a/dsRag/Test/T2_SplitTxtTest.py b/dsRag/Test/T2_SplitTxtTest.py
new file mode 100644
index 00000000..6d031ef1
--- /dev/null
+++ b/dsRag/Test/T2_SplitTxtTest.py
@@ -0,0 +1,37 @@
+import zipfile
+import xml.etree.ElementTree as ET
+
+# Namespace of OMML (Office Math Markup Language), the XML Word stores equations in.
+OMML_NS = {'m': 'http://schemas.openxmlformats.org/officeDocument/2006/math'}
+
+
+def parse_docx(docx_path):
+    """Print and return the plain text of every OMML equation in a .docx file.
+
+    Args:
+        docx_path: Path to the .docx file; word/document.xml is read from it.
+
+    Returns:
+        A list with one string per m:oMath element, in document order.
+    """
+    with zipfile.ZipFile(docx_path) as z:
+        with z.open('word/document.xml') as f:
+            root = ET.parse(f).getroot()
+
+    formulas = []
+    # './/m:oMath' matches equations at any depth, inline or display.
+    for formula_count, o_math in enumerate(root.findall('.//m:oMath', OMML_NS), 1):
+        # The characters of an equation are stored in m:t elements. m:e is a
+        # structural container whose content sits in nested m:t elements, so
+        # reading m:e.text separately would add nothing but stray whitespace.
+        formula_text = ''.join(t.text for t in o_math.findall('.//m:t', OMML_NS) if t.text)
+        formulas.append(formula_text)
+        print(f"公式{formula_count}内容: {formula_text}")
+
+    print(f"共找到{len(formulas)}个公式")
+    return formulas
+
+
+if __name__ == "__main__":
+    docx_path = r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx'
+    parse_docx(docx_path)
\ No newline at end of file
diff --git a/dsRag/Test/TestReadGongShi.py b/dsRag/Test/TestReadGongShi.py
new file mode 100644
index 00000000..499c48ef
--- /dev/null
+++ b/dsRag/Test/TestReadGongShi.py
@@ -0,0 +1,29 @@
+from docx import Document
+
+
+def extract_text_from_docx(file_path):
+    """Print and return the text of every non-empty run in a Word document.
+
+    Args:
+        file_path: Path to the .docx file.
+
+    Returns:
+        A list of run texts, in document order.
+    """
+    doc = Document(file_path)
+    formulas = []
+
+    for para in doc.paragraphs:
+        for run in para.runs:
+            if run.text:  # skip empty runs
+                formulas.append(run.text)
+
+    for index, formula in enumerate(formulas, 1):
+        print(f'公式 {index}: {formula}')
+
+    return formulas
+
+
+if __name__ == '__main__':
+    # Raw string: in a plain literal '\d' and '\T' are invalid escape sequences.
+    extract_text_from_docx(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
\ No newline at end of file
diff --git a/dsRag/Test/convert_math_type_to_word_formula.py b/dsRag/Test/convert_math_type_to_word_formula.py
new file mode 100644
index 00000000..b51b4537
--- /dev/null
+++ b/dsRag/Test/convert_math_type_to_word_formula.py
@@ -0,0 +1,66 @@
+import html
+import re
+
+import win32com.client
+
+
+def convert_to_latex(formula_xml):
+    """Strip XML markup from an equation and wrap the remaining text in $...$.
+
+    This is a naive text extraction, not a real OMML-to-LaTeX translation.
+
+    Args:
+        formula_xml: XML string describing one equation.
+
+    Returns:
+        The text content of the XML, entity-decoded and delimited by '$'.
+    """
+    text = re.sub(r'<[^>]+>', '', formula_xml)  # drop every XML tag
+    # Decode entities only after the tags are gone, so an escaped '<' in the
+    # formula text is never mistaken for the start of a tag.
+    return f"${html.unescape(text)}$"
+
+
+def convert_math_type_to_word_formula(doc_path):
+    """Print and return the extracted text of every equation in a Word document.
+
+    Drives Word through COM (win32com), so Word must be installed.
+
+    Args:
+        doc_path: Path of the document to open.
+
+    Returns:
+        A list of '$...$' strings, one per equation that was converted.
+    """
+    word = win32com.client.Dispatch("Word.Application")
+    doc = None
+    latex_formulas = []
+    try:
+        doc = word.Documents.Open(doc_path)
+
+        for formula_count, omath in enumerate(doc.OMaths, 1):
+            try:
+                formula_xml = omath.Range.WordOpenXML
+                if formula_xml:
+                    latex = convert_to_latex(formula_xml)
+                    latex_formulas.append(latex)
+                    print(f"公式{formula_count} LaTeX: {latex[:100]}...")
+            except Exception as e:
+                # Best effort: report the failing equation and keep going.
+                print(f"公式{formula_count}转换错误: {e}")
+
+        print("\n最终LaTeX输出:")
+        for i, latex in enumerate(latex_formulas, 1):
+            print(f"公式{i}: {latex}")
+    finally:
+        # Always release the document and quit Word, even after a failure;
+        # otherwise the Word instance started above is left running.
+        if doc is not None:
+            doc.Close(False)
+        word.Quit()
+
+    return latex_formulas
+
+
+if __name__ == '__main__':
+    convert_math_type_to_word_formula(r'D:\dsWork\dsProject\dsRag\Test\化学方程式_CHEMISTRY_1.docx')
\ No newline at end of file
diff --git a/dsRag/Test/化学方程式_CHEMISTRY_1.docx b/dsRag/Test/化学方程式_CHEMISTRY_1.docx
new file mode 100644
index 00000000..55df07f7
Binary files /dev/null and b/dsRag/Test/化学方程式_CHEMISTRY_1.docx differ
diff --git a/dsRag/Txt/CHEMISTRY_1_1.txt b/dsRag/Txt/CHEMISTRY_1_1.txt
new file mode 100644
index 00000000..407e94a7
--- /dev/null
+++ b/dsRag/Txt/CHEMISTRY_1_1.txt
@@ -0,0 +1 @@
+:氢气与氧气燃烧的方程式
diff --git a/dsRag/Txt/CHEMISTRY_1_2.txt b/dsRag/Txt/CHEMISTRY_1_2.txt
new file mode 100644
index 00000000..76ede26d
--- /dev/null
+++ b/dsRag/Txt/CHEMISTRY_1_2.txt
@@ -0,0 +1 @@
+:硝酸光照分解的方程式
diff --git a/dsRag/mtef-go-3 b/dsRag/mtef-go-3
new file mode 160000
index 00000000..5ede23b5
--- /dev/null
+++ b/dsRag/mtef-go-3
@@ -0,0 +1 @@
+Subproject commit 5ede23b5d20dbd12d04f5142828e7f6386763454
diff --git a/dsRag/static/Txt/化学方程式_CHEMISTRY_1.docx b/dsRag/static/Txt/化学方程式_CHEMISTRY_1.docx
new file mode 100644
index 00000000..60e5146d
Binary files /dev/null and b/dsRag/static/Txt/化学方程式_CHEMISTRY_1.docx differ