'commit'

2 weeks ago · dc7189af7a
parent 044466992b
commit dc7189af7a
4 changed files with 6 additions and 4 deletions
--- a/dsLightRag/ShiTi/T3_DocxToMd.py
+++ b/dsLightRag/ShiTi/T3_DocxToMd.py
@@ -7,7 +7,7 @@ async def main():
    # 要处理的文件路径
    file_path = "Docx/《动能定理》巩固练习.docx"
    # 转换docx为md
-    get_docx_content_by_pandoc(file_path, '../Word转试题.md')
+    get_docx_content_by_pandoc(file_path, '../Word转试题.md',extract_media='../static/Images/')

 if __name__ == "__main__":
    asyncio.run(main())
--- a/dsLightRag/Util/DocxUtil.py
+++ b/dsLightRag/Util/DocxUtil.py
@@ -3,7 +3,7 @@ import subprocess
 import uuid


-def get_docx_content_by_pandoc(docx_file, output_file=None):
+def get_docx_content_by_pandoc(docx_file, output_file=None, extract_media=None):
    # 最后拼接的内容
    content = ""
    # output_file 设置为临时目录下的uuid.md
@@ -12,9 +12,11 @@ def get_docx_content_by_pandoc(docx_file, output_file=None):
    prefix = docx_file.split(".")[0].split("/")[-1]
    temp_markdown = os.path.join('./static/markdown/', prefix + '.md')
    # 调用pandoc将docx文件转换成markdown
-    os.mkdir("./static/Images/" + file_name)
+    if extract_media is None:
+        os.mkdir("./static/Images/" + file_name)
+        extract_media = "./static/Images/"
    subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown,
-                    '--extract-media=./static/Images/' + file_name])
+                    '--extract-media=' + extract_media + file_name])
    # 读取然后修改内容，输出到新的文件
    img_idx = 0  # 图片索引
    with open(temp_markdown, 'r', encoding='utf-8') as f:
--- a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc
+++ b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc
--- a/dsLightRag/static/Images/2ddc4eb15b06461ab1d56444d012d66c/media/image1.png
+++ b/dsLightRag/static/Images/2ddc4eb15b06461ab1d56444d012d66c/media/image1.png