diff --git a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc index f34c8803..36856db6 100644 Binary files a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc and b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc differ diff --git a/dsLightRag/File_Start.py b/dsLightRag/File_Start.py index 25b92fa7..61654a42 100644 --- a/dsLightRag/File_Start.py +++ b/dsLightRag/File_Start.py @@ -16,13 +16,7 @@ from starlette.staticfiles import StaticFiles from Util.LightRagUtil import * from Util.PostgreSQLUtil import init_postgres_pool -# 在程序开始时添加以下配置 -logging.basicConfig( - level=logging.INFO, # 设置日志级别为INFO - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) - -# 或者如果你想更详细地控制日志输出 +# 更详细地控制日志输出 logger = logging.getLogger('lightrag') logger.setLevel(logging.INFO) handler = logging.StreamHandler() diff --git a/dsLightRag/T1_Train.py b/dsLightRag/T1_Train.py index 4db65ec1..7510660e 100644 --- a/dsLightRag/T1_Train.py +++ b/dsLightRag/T1_Train.py @@ -1,4 +1,5 @@ import asyncio +import logging from Util.DocxUtil import get_docx_content_by_pandoc from Util.LightRagUtil import configure_logging, initialize_rag @@ -10,30 +11,40 @@ KEMU = 'ShiJi' # JiHe,Math,SuShi,Chemistry,ShiJi,ChangChun WORKING_DIR = "./Topic/" + KEMU docx_file = 'static/Txt/' +# 是不是清空重新生成 +IS_CLEAR= False + +# 更详细地控制日志输出 +logger = logging.getLogger('lightrag') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +logger.addHandler(handler) async def main(): - # 注释掉或删除以下清理代码 - files_to_delete = [ - "graph_chunk_entity_relation.graphml", - "kv_store_doc_status.json", - "kv_store_full_docs.json", - "kv_store_text_chunks.json", - "vdb_chunks.json", - "vdb_entities.json", - "vdb_relationships.json", - ] - - # 删除文件 - for file in files_to_delete: - file_path = os.path.join(WORKING_DIR, file) - if os.path.exists(file_path): - os.remove(file_path) - print(f"Deleting old file:: {file_path}") + # 清空文件 + if IS_CLEAR: + # 注释掉或删除以下清理代码 + files_to_delete = [ + "graph_chunk_entity_relation.graphml", + "kv_store_doc_status.json", + "kv_store_full_docs.json", + "kv_store_text_chunks.json", + "vdb_chunks.json", + "vdb_entities.json", + "vdb_relationships.json", + ] + + # 删除文件 + for file in files_to_delete: + file_path = os.path.join(WORKING_DIR, file) + if os.path.exists(file_path): + os.remove(file_path) + logger.info(f"删除的文件:: {file_path}") try: # 注意:默认设置使用NetworkX rag = await initialize_rag(WORKING_DIR) - # 在docx_file 目录下遍历所有以KEMU开头的文件 for filename in os.listdir(docx_file): if filename.startswith(KEMU): @@ -41,9 +52,9 @@ async def main(): # 获取docx文件的内容 content = get_docx_content_by_pandoc(file_path) await rag.ainsert(content, file_paths=[filename]) - print(f"Inserted content from {filename}") + logger.info(f"Inserted content from {filename}") except Exception as e: - print(f"An error occurred: {e}") + logger.error(f"An error occurred: {e}") finally: await rag.finalize_storages() diff --git a/dsLightRag/SpliteDocx.py b/dsLightRag/Tools/SpliteDocx.py similarity index 100% rename from dsLightRag/SpliteDocx.py rename to dsLightRag/Tools/SpliteDocx.py diff --git a/dsLightRag/Tools/__init__.py b/dsLightRag/Tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsLightRag/Topic/ShiJi/kv_store_doc_status.json b/dsLightRag/Topic/ShiJi/kv_store_doc_status.json index 85733e87..7aa8b824 100644 --- a/dsLightRag/Topic/ShiJi/kv_store_doc_status.json +++ b/dsLightRag/Topic/ShiJi/kv_store_doc_status.json @@ -16,7 +16,7 @@ "content_summary": "少年读史记霸王的崛起少年读史记霸王的崛起\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号(266061) 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号(266061) 本社网址\n邮购电话 13335059110(0532)85814750(兼传真)68068026邮购电话\n13335059110(0532)85814750(兼传真)68068026\n选题策划 谢 蔚选题策划 谢 蔚\n责任编辑 王...", "content_length": 132645, "created_at": "2025-07-17T14:12:28.160596+00:00", - "updated_at": "2025-07-17T14:12:30.078774+00:00", + "updated_at": "2025-07-18T00:03:57.164738+00:00", "file_path": "ShiJi_2.docx" }, "doc-8d71547c62cb9e0893b184ac60433766": { @@ -26,7 +26,7 @@ "content_summary": "少年读史记 辩士纵横天下少年读史记 辩士纵横天下\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n图书在版编目(CIP)数据图书在版编目(CIP)数据\n书 名 少年读史记3------辩士纵横天下编 著 张嘉骅书 名\n少年读史记3------辩士纵横天下编 著 张嘉骅\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号(266061) 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号(266061) 本社网址\n邮购电话 13335059110(053...", "content_length": 134992, "created_at": "2025-07-17T14:12:29.958957+00:00", - "updated_at": "2025-07-17T14:12:30.166903+00:00", + "updated_at": "2025-07-18T00:03:57.208492+00:00", "file_path": "ShiJi_3.docx" } } \ No newline at end of file diff --git a/dsLightRag/Util/DocxUtil.py b/dsLightRag/Util/DocxUtil.py index 6d5e01cd..791ca2a5 100644 --- a/dsLightRag/Util/DocxUtil.py +++ b/dsLightRag/Util/DocxUtil.py @@ -51,21 +51,44 @@ def resize_images_in_directory(directory_path, max_width=640, max_height=480): logger.info(f"已缩放: {file_path} -> {new_size}") except Exception as e: logger.error(f"处理 {file_path} 时出错: {str(e)}") + + +import hashlib + +def calculate_docx_md5(docx_file_path): + """ + 计算docx文件的MD5哈希值 + :param docx_file_path: docx文件路径 + :return: MD5哈希字符串 + """ + # 以二进制模式读取文件 + with open(docx_file_path, 'rb') as f: + file_content = f.read() + + # 创建MD5哈希对象 + md5_hash = hashlib.md5() + + # 更新哈希值 + md5_hash.update(file_content) + + # 返回16进制格式的哈希值 + return md5_hash.hexdigest() def get_docx_content_by_pandoc(docx_file): # 最后拼接的内容 content = "" - # output_file 设置为临时目录下的uuid.md - file_name = uuid.uuid4().hex + # 计算 docx_file 的字符串md5值 + md5_value = calculate_docx_md5(docx_file) + # 将docx_file去掉扩展名 prefix = docx_file.split(".")[0].split("/")[-1] temp_markdown = os.path.join('./static/markdown/', prefix + '.md') # 调用pandoc将docx文件转换成markdown - os.mkdir("./static/Images/" + file_name) + os.mkdir("./static/Images/" + md5_value) subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown, - '--extract-media=./static/Images/' + file_name]) + '--extract-media=./static/Images/' + md5_value]) # 遍历目录 './static/Images/'+file_name 下所有的图片,缩小于640*480的尺寸上 - resize_images_in_directory('./static/Images/' + file_name+'/media') + resize_images_in_directory('./static/Images/' + md5_value+'/media') # 读取然后修改内容,输出到新的文件 img_idx = 0 # 图片索引 with open(temp_markdown, 'r', encoding='utf-8') as f: diff --git a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc index 4de651c2..b3cedfa3 100644 Binary files a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc and b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc differ diff --git a/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc index e4c4435c..14cbead2 100644 Binary files a/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc and b/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc differ