diff --git a/dsLightRag/ShiTi/T3_DocxToMd.py b/dsLightRag/ShiTi/T3_DocxToMd.py index b47cdae9..307e15ab 100644 --- a/dsLightRag/ShiTi/T3_DocxToMd.py +++ b/dsLightRag/ShiTi/T3_DocxToMd.py @@ -7,7 +7,7 @@ async def main(): # 要处理的文件路径 file_path = "Docx/《动能定理》巩固练习.docx" # 转换docx为md - get_docx_content_by_pandoc(file_path, '../Word转试题.md') + get_docx_content_by_pandoc(file_path, '../Word转试题.md',extract_media='../static/Images/') if __name__ == "__main__": asyncio.run(main()) diff --git a/dsLightRag/Util/DocxUtil.py b/dsLightRag/Util/DocxUtil.py index b3b711e8..d5f1784f 100644 --- a/dsLightRag/Util/DocxUtil.py +++ b/dsLightRag/Util/DocxUtil.py @@ -3,7 +3,7 @@ import subprocess import uuid -def get_docx_content_by_pandoc(docx_file, output_file=None): +def get_docx_content_by_pandoc(docx_file, output_file=None, extract_media=None): # 最后拼接的内容 content = "" # output_file 设置为临时目录下的uuid.md @@ -12,9 +12,11 @@ def get_docx_content_by_pandoc(docx_file, output_file=None): prefix = docx_file.split(".")[0].split("/")[-1] temp_markdown = os.path.join('./static/markdown/', prefix + '.md') # 调用pandoc将docx文件转换成markdown - os.mkdir("./static/Images/" + file_name) + if extract_media is None: + os.mkdir("./static/Images/" + file_name) + extract_media = "./static/Images/" subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown, - '--extract-media=./static/Images/' + file_name]) + '--extract-media=' + extract_media + file_name]) # 读取然后修改内容,输出到新的文件 img_idx = 0 # 图片索引 with open(temp_markdown, 'r', encoding='utf-8') as f: diff --git a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc index 70711721..a7cd04be 100644 Binary files a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc and b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc differ diff --git a/dsLightRag/static/Images/2ddc4eb15b06461ab1d56444d012d66c/media/image1.png b/dsLightRag/static/Images/2ddc4eb15b06461ab1d56444d012d66c/media/image1.png new file mode 100644 index 00000000..9e0e4c51 Binary files /dev/null and b/dsLightRag/static/Images/2ddc4eb15b06461ab1d56444d012d66c/media/image1.png differ