From 1db4287e06bb6d4cd86ece04352a0095e40bcc81 Mon Sep 17 00:00:00 2001 From: HuangHai <10402852@qq.com> Date: Fri, 18 Jul 2025 08:04:02 +0800 Subject: [PATCH] 'commit' --- .../Config/__pycache__/Config.cpython-310.pyc | Bin 909 -> 909 bytes dsLightRag/File_Start.py | 8 +-- dsLightRag/T1_Train.py | 51 +++++++++++------- dsLightRag/{ => Tools}/SpliteDocx.py | 0 dsLightRag/Tools/__init__.py | 0 .../Topic/ShiJi/kv_store_doc_status.json | 4 +- dsLightRag/Util/DocxUtil.py | 33 ++++++++++-- .../Util/__pycache__/DocxUtil.cpython-310.pyc | Bin 2719 -> 3124 bytes .../__pycache__/LightRagUtil.cpython-310.pyc | Bin 4497 -> 4497 bytes 9 files changed, 62 insertions(+), 34 deletions(-) rename dsLightRag/{ => Tools}/SpliteDocx.py (100%) create mode 100644 dsLightRag/Tools/__init__.py diff --git a/dsLightRag/Config/__pycache__/Config.cpython-310.pyc b/dsLightRag/Config/__pycache__/Config.cpython-310.pyc index f34c880362db7e071df990fa4b8174845c7cfcf6..36856db6ddcfa50fdb7792aabb61a3c0afdcd118 100644 GIT binary patch delta 19 YcmeBW?`7x8=jG*M0D_y{8@Za90VwDMYXATM delta 19 YcmeBW?`7x8=jG*M0D?s<8@Za90VMkatpET3 diff --git a/dsLightRag/File_Start.py b/dsLightRag/File_Start.py index 25b92fa7..61654a42 100644 --- a/dsLightRag/File_Start.py +++ b/dsLightRag/File_Start.py @@ -16,13 +16,7 @@ from starlette.staticfiles import StaticFiles from Util.LightRagUtil import * from Util.PostgreSQLUtil import init_postgres_pool -# 在程序开始时添加以下配置 -logging.basicConfig( - level=logging.INFO, # 设置日志级别为INFO - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) - -# 或者如果你想更详细地控制日志输出 +# 更详细地控制日志输出 logger = logging.getLogger('lightrag') logger.setLevel(logging.INFO) handler = logging.StreamHandler() diff --git a/dsLightRag/T1_Train.py b/dsLightRag/T1_Train.py index 4db65ec1..7510660e 100644 --- a/dsLightRag/T1_Train.py +++ b/dsLightRag/T1_Train.py @@ -1,4 +1,5 @@ import asyncio +import logging from Util.DocxUtil import get_docx_content_by_pandoc from Util.LightRagUtil import configure_logging, initialize_rag @@ -10,30 +11,40 @@ KEMU = 'ShiJi' # JiHe,Math,SuShi,Chemistry,ShiJi,ChangChun WORKING_DIR = "./Topic/" + KEMU docx_file = 'static/Txt/' +# 是不是清空重新生成 +IS_CLEAR= False + +# 更详细地控制日志输出 +logger = logging.getLogger('lightrag') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +logger.addHandler(handler) async def main(): - # 注释掉或删除以下清理代码 - files_to_delete = [ - "graph_chunk_entity_relation.graphml", - "kv_store_doc_status.json", - "kv_store_full_docs.json", - "kv_store_text_chunks.json", - "vdb_chunks.json", - "vdb_entities.json", - "vdb_relationships.json", - ] - - # 删除文件 - for file in files_to_delete: - file_path = os.path.join(WORKING_DIR, file) - if os.path.exists(file_path): - os.remove(file_path) - print(f"Deleting old file:: {file_path}") + # 清空文件 + if IS_CLEAR: + # 注释掉或删除以下清理代码 + files_to_delete = [ + "graph_chunk_entity_relation.graphml", + "kv_store_doc_status.json", + "kv_store_full_docs.json", + "kv_store_text_chunks.json", + "vdb_chunks.json", + "vdb_entities.json", + "vdb_relationships.json", + ] + + # 删除文件 + for file in files_to_delete: + file_path = os.path.join(WORKING_DIR, file) + if os.path.exists(file_path): + os.remove(file_path) + logger.info(f"删除的文件:: {file_path}") try: # 注意:默认设置使用NetworkX rag = await initialize_rag(WORKING_DIR) - # 在docx_file 目录下遍历所有以KEMU开头的文件 for filename in os.listdir(docx_file): if filename.startswith(KEMU): @@ -41,9 +52,9 @@ async def main(): # 获取docx文件的内容 content = get_docx_content_by_pandoc(file_path) await rag.ainsert(content, file_paths=[filename]) - print(f"Inserted content from {filename}") + logger.info(f"Inserted content from {filename}") except Exception as e: - print(f"An error occurred: {e}") + logger.error(f"An error occurred: {e}") finally: await rag.finalize_storages() diff --git a/dsLightRag/SpliteDocx.py b/dsLightRag/Tools/SpliteDocx.py similarity index 100% rename from dsLightRag/SpliteDocx.py rename to dsLightRag/Tools/SpliteDocx.py diff --git a/dsLightRag/Tools/__init__.py b/dsLightRag/Tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsLightRag/Topic/ShiJi/kv_store_doc_status.json b/dsLightRag/Topic/ShiJi/kv_store_doc_status.json index 85733e87..7aa8b824 100644 --- a/dsLightRag/Topic/ShiJi/kv_store_doc_status.json +++ b/dsLightRag/Topic/ShiJi/kv_store_doc_status.json @@ -16,7 +16,7 @@ "content_summary": "少年读史记霸王的崛起少年读史记霸王的崛起\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号(266061) 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号(266061) 本社网址\n邮购电话 13335059110(0532)85814750(兼传真)68068026邮购电话\n13335059110(0532)85814750(兼传真)68068026\n选题策划 谢 蔚选题策划 谢 蔚\n责任编辑 王...", "content_length": 132645, "created_at": "2025-07-17T14:12:28.160596+00:00", - "updated_at": "2025-07-17T14:12:30.078774+00:00", + "updated_at": "2025-07-18T00:03:57.164738+00:00", "file_path": "ShiJi_2.docx" }, "doc-8d71547c62cb9e0893b184ac60433766": { @@ -26,7 +26,7 @@ "content_summary": "少年读史记 辩士纵横天下少年读史记 辩士纵横天下\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n图书在版编目(CIP)数据图书在版编目(CIP)数据\n书 名 少年读史记3------辩士纵横天下编 著 张嘉骅书 名\n少年读史记3------辩士纵横天下编 著 张嘉骅\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号(266061) 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号(266061) 本社网址\n邮购电话 13335059110(053...", "content_length": 134992, "created_at": "2025-07-17T14:12:29.958957+00:00", - "updated_at": "2025-07-17T14:12:30.166903+00:00", + "updated_at": "2025-07-18T00:03:57.208492+00:00", "file_path": "ShiJi_3.docx" } } \ No newline at end of file diff --git a/dsLightRag/Util/DocxUtil.py b/dsLightRag/Util/DocxUtil.py index 6d5e01cd..791ca2a5 100644 --- a/dsLightRag/Util/DocxUtil.py +++ b/dsLightRag/Util/DocxUtil.py @@ -51,21 +51,44 @@ def resize_images_in_directory(directory_path, max_width=640, max_height=480): logger.info(f"已缩放: {file_path} -> {new_size}") except Exception as e: logger.error(f"处理 {file_path} 时出错: {str(e)}") + + +import hashlib + +def calculate_docx_md5(docx_file_path): + """ + 计算docx文件的MD5哈希值 + :param docx_file_path: docx文件路径 + :return: MD5哈希字符串 + """ + # 以二进制模式读取文件 + with open(docx_file_path, 'rb') as f: + file_content = f.read() + + # 创建MD5哈希对象 + md5_hash = hashlib.md5() + + # 更新哈希值 + md5_hash.update(file_content) + + # 返回16进制格式的哈希值 + return md5_hash.hexdigest() def get_docx_content_by_pandoc(docx_file): # 最后拼接的内容 content = "" - # output_file 设置为临时目录下的uuid.md - file_name = uuid.uuid4().hex + # 计算 docx_file 的字符串md5值 + md5_value = calculate_docx_md5(docx_file) + # 将docx_file去掉扩展名 prefix = docx_file.split(".")[0].split("/")[-1] temp_markdown = os.path.join('./static/markdown/', prefix + '.md') # 调用pandoc将docx文件转换成markdown - os.mkdir("./static/Images/" + file_name) + os.mkdir("./static/Images/" + md5_value) subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown, - '--extract-media=./static/Images/' + file_name]) + '--extract-media=./static/Images/' + md5_value]) # 遍历目录 './static/Images/'+file_name 下所有的图片,缩小于640*480的尺寸上 - resize_images_in_directory('./static/Images/' + file_name+'/media') + resize_images_in_directory('./static/Images/' + md5_value+'/media') # 读取然后修改内容,输出到新的文件 img_idx = 0 # 图片索引 with open(temp_markdown, 'r', encoding='utf-8') as f: diff --git a/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/DocxUtil.cpython-310.pyc index 4de651c293d79e0e650c1a694d128df0bf448222..b3cedfa3586735102138352201e3f6fae06f2619 100644 GIT binary patch delta 1139 zcmZWp-D?zA6uw9C3@+}%3+kvlUQ z`yy>8?!rwf<B5`qmVL4YN#NgOX8B%2Rr}c`zcHWL{Y@_({j%|R zrJu_kKj7{8$#+$LzPow<-m~?GSv+O@!spW(FJcF&%AG2tdaqIS8gBDKs
)q*Zm zeJ3>=&Jr~1j^|ISWqbSKDZNtG_9NEJyButoGLXZMVL3JjI7JSCunIo}j2 z3_}q{zExIvs0r-^`LZ7BLXRzk7FX}g;sRO&Fq%+aK`jqE66$@GX*;Daix;3I8@-h4 zT*F8K#xhIo_N{Sg9li9{+|>_;hA{XKS^5VF4Nai~5hhHO$aRMIw1gGg(VF&QZ$VJZ z!iHiS_5e*7>@JZxlJV4rN$Cmc;sUZ2xw?g|>)Y5WuR8VSss(w~cu-LJUhnPuYwJEi zu?^S@yAu}Mzn;?-stsj8MlItbdMNtQIAxVh$&JfZH;7&vQ~UW*gfvy%X4@%iO|Q~y z)l;S0@lv|Dl{?YL=AJ?M>3JtnsLWtvN>n#Ln>mB#G~*gBT$lQ2s=J+fz1gmF$rkZ| BCv*S+ delta 723 zcmZWnzi-n}5Wai1WBcWIN^t^dOAriAix3hbOT`4LR2YyL5Rs~abZMHBCNw`(B%YO_ zWdlLw6@<-yfM8)lEQo)A#RCHq3qp)++;aed@RL4&-`#h<_s;Ki`T1VOkc^6ECNrx7zA-YltuRF?WsvPbBz$vZhBi4$utJ296T0Lo(Cu|= z$|;lbI@@-WyqtQe;K4ro$?&bwB+aHGm1yRX@p#~;RO%9}w|!gLVI~6o9$wpBWtF8P z5e{xK%h7Cv!;OR;)v+X>Cr@wrj@7@yc9xz94j9kqdW#0l}Bz|WUa^!@LXx9xOFnR48 z2tDT<4BzbW`SrTF$}rM1sQsEjR?i?TD%2zUukhoaLas}ioTT|kgoRgM90O-cu{j=h zD7G}7kBzO?z2LihmVb3kUX`)C+3B`6dbf9iJy|ni+zg&cFJ0#=G)Q16cqmpx07@i8;%xRHX_z6nLte^k@ diff --git a/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc b/dsLightRag/Util/__pycache__/LightRagUtil.cpython-310.pyc index e4c4435cab5601e62729bd39a636e00d445aab3f..14cbead21a7ac24fcf2b8dd1ec5875a7f14a5d31 100644 GIT binary patch delta 925 zcmY+C%TE(g6o)&soeoo`!y`r%B98{OMiC(~hysF^iu8pADHm}vw6|qo9?hKrtbc%R z+>o<1Q8pSkCNA9Qnz&)r+ra?bCbIj?&UmGFpc;U;@9t;^k48r^HI8WaQUyKcNfkaHFTBVE!#>8=980JL6k6(*@ z#UYHBI2vKoIiM^dlBgf7HEXD+j$yl=ZkK2?l~y(wQTU05y=XPv+xA`FkhbM_q(%;f zNjQoeiC3}2HFzN15^tk<2VP2PAqn55j5v+L3|wlT5f-4*J|!%{QTtAus9EYAIBd6A z@EoRB>4zXusI^7QWy7xW@)m(nd1NSwjVo2A*S0m+k7P?+OGvB@Gbh25ukf62a&Quh&jYOqVq4h&`QGB=sIuvy3z+@9sR;8WICK^3Xc{M z58+$KhLDE1vM8*9rTn~^<}!)qQ%jFery$+=)P1K^@N{p_ZE3N5Dun~Ie0UKXSZp?n z(~+2ITAJmQsp(%x&p#ixt-E!*M0)=hJ!hBNnrl3#1b)Rvrf?-h-raBQh;)~o5_hnT zmd0|;YWuXr_t+8%WYm8CQg+ml0qn-#lZaLxv5dg%>=AradqoY+0-UIu(>ga3ZP{!> z$R^?}!96|-e}(-1MoW%eF{%t7Mk8$RS+#nj&91TL1IBwJQ0biIW#4s9kVy;*x*lWA x%MO`_>op>FS!YbI`?Xhi2wO$0A&U4%gxdQCb8NPb5yAhZ;*$WL&y-JNc?-JfQ5Db^bo zy?7z-YNA{;UQA3p>6v)(=xO53yH~HyYpKRLe0lGiZ{GXn%^atX)3UE9a)htKr;qaI zsKDSfdX3}n=$#8pSzFI(`NeH*Y3I?sO*;~K6Jf9(dnsIkAF=zwD5T?aqhtIq9p}KL zitP|jcL|eR{pnIGBmsjkK9xL=k_^HGyo=BB@+>}~PT`q^<69P?T&Rk(!W1luyL|hV zIJ`ZHdI>dg$3G-AE3#xqthIZj&Jx=TLftb+yOd_ylqq~iLoZrG538M&8`8=hN2$cI zunZ@O6Y)B>n1^NQhPaC18oZF!gc5w0@?su|0!;Sa61JeyyCCerNpCI9QUwwo`n0~i z8TwSW$onW$h+8J6X?qrL?=qN{GnY$fT(PL$sA@q+N%uIHm1qxEEE{RIzka)1K}Y;|6drus03e}Q0*FEz7(BVlsA*_{y^G<}07Xrs%~ zGK$nDG58$2B!hfvg#VOUDl>*|I&%^+G7wAztWF7hNDYZi3@h+0wSV)7huNM&AuANJ zX*$a@rs7}X?>BAuUd6U3ehe1(4%|kI^>_{1{=oQd3@ZJ%dE2-BbGyr!M}v^k_BdjW t9fU05nL4F