'commit'

6 days ago · b6d2c1be05
parent 1e366f1970
commit b6d2c1be05
3 changed files with 16 additions and 10 deletions
--- a/dsLightRag/Topic/ShiJi/kv_store_doc_status.json
+++ b/dsLightRag/Topic/ShiJi/kv_store_doc_status.json
--- a/dsLightRag/Util/DocxUtil.py
+++ b/dsLightRag/Util/DocxUtil.py
@@ -20,6 +20,7 @@ handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -
 logger.addHandler(handler)
 logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

+
 def resize_images_in_directory(directory_path, max_width=640, max_height=480):
    """
    遍历目录下所有图片并缩放到指定尺寸
@@ -55,6 +56,7 @@ def resize_images_in_directory(directory_path, max_width=640, max_height=480):

 import hashlib

+
 def calculate_docx_md5(docx_file_path):
    """
    计算docx文件的MD5哈希值
@@ -73,6 +75,8 @@ def calculate_docx_md5(docx_file_path):

    # 返回16进制格式的哈希值
    return md5_hash.hexdigest()
+
+
 def get_docx_content_by_pandoc(docx_file):
    # 最后拼接的内容
    content = ""
@@ -83,12 +87,14 @@ def get_docx_content_by_pandoc(docx_file):
    prefix = docx_file.split(".")[0].split("/")[-1]
    temp_markdown = os.path.join('./static/markdown/', prefix + '.md')
    # 调用pandoc将docx文件转换成markdown
-    os.mkdir("./static/Images/" + md5_value)
+    path = "./static/Images/" + md5_value
+    if not os.path.exists(path):
+        os.mkdir("./static/Images/" + md5_value)
    subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown,
                    '--extract-media=./static/Images/' + md5_value])
    # 遍历目录 './static/Images/'+file_name 下所有的图片，缩小于640*480的尺寸上

-    resize_images_in_directory('./static/Images/' + md5_value+'/media')
+    resize_images_in_directory('./static/Images/' + md5_value + '/media')
    # 读取然后修改内容，输出到新的文件
    img_idx = 0  # 图片索引
    with open(temp_markdown, 'r', encoding='utf-8') as f:
@@ -112,14 +118,14 @@ def get_docx_content_by_pandoc(docx_file):
                # ![](../static/Images/01b20e04085e406ea5375791da58a60f/media/image3.png){width="3.1251607611548557in"
                pos = line.find(")")
                q = line[:pos + 1]
-                q=q.replace("./static",".")
-                #q = q[4:-1]
-                #q='<img src="'+q+'" alt="我是图片">'
+                q = q.replace("./static", ".")
+                # q = q[4:-1]
+                # q='<img src="'+q+'" alt="我是图片">'
                img_idx += 1
                content += q + "\n"
            else:
                content += line.strip().replace("**", "") + "\n"
-                content=content.replace("\phantom","")
+                content = content.replace("\phantom", "")
    # 将content回写到markdown文件
    with open(temp_markdown, 'w', encoding='utf-8') as f:
        f.write(content)
--- a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc
+++ b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc