'commit'

6 days ago · 1db4287e06
parent 71edcf4aae
commit 1db4287e06
9 changed files with 62 additions and 34 deletions
--- a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc
+++ b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc
--- a/dsLightRag/File_Start.py
+++ b/dsLightRag/File_Start.py
@@ -16,13 +16,7 @@ from starlette.staticfiles import StaticFiles
 from Util.LightRagUtil import *
 from Util.PostgreSQLUtil import init_postgres_pool

-# 在程序开始时添加以下配置
-logging.basicConfig(
-    level=logging.INFO,  # 设置日志级别为INFO
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-
-# 或者如果你想更详细地控制日志输出
+# 更详细地控制日志输出
 logger = logging.getLogger('lightrag')
 logger.setLevel(logging.INFO)
 handler = logging.StreamHandler()
--- a/dsLightRag/T1_Train.py
+++ b/dsLightRag/T1_Train.py
@@ -1,4 +1,5 @@
 import asyncio
+import logging

 from Util.DocxUtil import get_docx_content_by_pandoc
 from Util.LightRagUtil import configure_logging, initialize_rag
@@ -10,30 +11,40 @@ KEMU = 'ShiJi'  # JiHe,Math,SuShi,Chemistry,ShiJi,ChangChun
 WORKING_DIR = "./Topic/" + KEMU
 docx_file = 'static/Txt/'

+# 是不是清空重新生成
+IS_CLEAR= False
+
+# 更详细地控制日志输出
+logger = logging.getLogger('lightrag')
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+logger.addHandler(handler)

 async def main():
-    # 注释掉或删除以下清理代码
-    files_to_delete = [
-        "graph_chunk_entity_relation.graphml",
-        "kv_store_doc_status.json",
-        "kv_store_full_docs.json",
-        "kv_store_text_chunks.json",
-        "vdb_chunks.json",
-        "vdb_entities.json",
-        "vdb_relationships.json",
-    ]
-
-    # 删除文件
-    for file in files_to_delete:
-        file_path = os.path.join(WORKING_DIR, file)
-        if os.path.exists(file_path):
-            os.remove(file_path)
-            print(f"Deleting old file:: {file_path}")
+    # 清空文件
+    if IS_CLEAR:
+        # 注释掉或删除以下清理代码
+        files_to_delete = [
+            "graph_chunk_entity_relation.graphml",
+            "kv_store_doc_status.json",
+            "kv_store_full_docs.json",
+            "kv_store_text_chunks.json",
+            "vdb_chunks.json",
+            "vdb_entities.json",
+            "vdb_relationships.json",
+        ]
+
+        # 删除文件
+        for file in files_to_delete:
+            file_path = os.path.join(WORKING_DIR, file)
+            if os.path.exists(file_path):
+                os.remove(file_path)
+                logger.info(f"删除的文件:: {file_path}")

    try:
        # 注意：默认设置使用NetworkX
        rag = await initialize_rag(WORKING_DIR)
-
        # 在docx_file 目录下遍历所有以KEMU开头的文件
        for filename in os.listdir(docx_file):
            if filename.startswith(KEMU):
@@ -41,9 +52,9 @@ async def main():
                # 获取docx文件的内容
                content = get_docx_content_by_pandoc(file_path)
                await rag.ainsert(content, file_paths=[filename])
-                print(f"Inserted content from {filename}")
+                logger.info(f"Inserted content from {filename}")
    except Exception as e:
-        print(f"An error occurred: {e}")
+        logger.error(f"An error occurred: {e}")
    finally:
        await rag.finalize_storages()

--- a/dsLightRag/Tools/SpliteDocx.py
+++ b/dsLightRag/Tools/SpliteDocx.py
--- a/dsLightRag/Tools/__init__.py
+++ b/dsLightRag/Tools/__init__.py
--- a/dsLightRag/Topic/ShiJi/kv_store_doc_status.json
+++ b/dsLightRag/Topic/ShiJi/kv_store_doc_status.json
@@ -16,7 +16,7 @@
    "content_summary": "少年读史记霸王的崛起少年读史记霸王的崛起\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号（266061） 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号（266061） 本社网址\n邮购电话 13335059110（0532）85814750（兼传真）68068026邮购电话\n13335059110（0532）85814750（兼传真）68068026\n选题策划 谢 蔚选题策划 谢 蔚\n责任编辑 王...",
    "content_length": 132645,
    "created_at": "2025-07-17T14:12:28.160596+00:00",
-    "updated_at": "2025-07-17T14:12:30.078774+00:00",
+    "updated_at": "2025-07-18T00:03:57.164738+00:00",
    "file_path": "ShiJi_2.docx"
  },
  "doc-8d71547c62cb9e0893b184ac60433766": {
@@ -26,7 +26,7 @@
    "content_summary": "少年读史记 辩士纵横天下少年读史记 辩士纵横天下\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n图书在版编目（CIP）数据图书在版编目（CIP）数据\n书 名 少年读史记3------辩士纵横天下编 著 张嘉骅书 名\n少年读史记3------辩士纵横天下编 著 张嘉骅\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号（266061） 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号（266061） 本社网址\n邮购电话 13335059110（053...",
    "content_length": 134992,
    "created_at": "2025-07-17T14:12:29.958957+00:00",
-    "updated_at": "2025-07-17T14:12:30.166903+00:00",
+    "updated_at": "2025-07-18T00:03:57.208492+00:00",
    "file_path": "ShiJi_3.docx"
  }
 }
--- a/dsLightRag/Util/DocxUtil.py
+++ b/dsLightRag/Util/DocxUtil.py
@@ -51,21 +51,44 @@ def resize_images_in_directory(directory_path, max_width=640, max_height=480):
                        logger.info(f"已缩放: {file_path} -> {new_size}")
                except Exception as e:
                    logger.error(f"处理 {file_path} 时出错: {str(e)}")
+
+
+import hashlib
+
+def calculate_docx_md5(docx_file_path):
+    """
+    计算docx文件的MD5哈希值
+    :param docx_file_path: docx文件路径
+    :return: MD5哈希字符串
+    """
+    # 以二进制模式读取文件
+    with open(docx_file_path, 'rb') as f:
+        file_content = f.read()
+
+    # 创建MD5哈希对象
+    md5_hash = hashlib.md5()
+
+    # 更新哈希值
+    md5_hash.update(file_content)
+
+    # 返回16进制格式的哈希值
+    return md5_hash.hexdigest()
 def get_docx_content_by_pandoc(docx_file):
    # 最后拼接的内容
    content = ""
-    # output_file 设置为临时目录下的uuid.md
-    file_name = uuid.uuid4().hex
+    # 计算 docx_file 的字符串md5值
+    md5_value = calculate_docx_md5(docx_file)
+
    # 将docx_file去掉扩展名
    prefix = docx_file.split(".")[0].split("/")[-1]
    temp_markdown = os.path.join('./static/markdown/', prefix + '.md')
    # 调用pandoc将docx文件转换成markdown
-    os.mkdir("./static/Images/" + file_name)
+    os.mkdir("./static/Images/" + md5_value)
    subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown,
-                    '--extract-media=./static/Images/' + file_name])
+                    '--extract-media=./static/Images/' + md5_value])
    # 遍历目录 './static/Images/'+file_name 下所有的图片，缩小于640*480的尺寸上

-    resize_images_in_directory('./static/Images/' + file_name+'/media')
+    resize_images_in_directory('./static/Images/' + md5_value+'/media')
    # 读取然后修改内容，输出到新的文件
    img_idx = 0  # 图片索引
    with open(temp_markdown, 'r', encoding='utf-8') as f:
--- a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc
+++ b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc
--- a/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc
+++ b/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc