main
HuangHai 6 days ago
parent 71edcf4aae
commit 1db4287e06

@ -16,13 +16,7 @@ from starlette.staticfiles import StaticFiles
from Util.LightRagUtil import *
from Util.PostgreSQLUtil import init_postgres_pool
# 在程序开始时添加以下配置
logging.basicConfig(
level=logging.INFO, # 设置日志级别为INFO
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# 或者如果你想更详细地控制日志输出
# 更详细地控制日志输出
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()

@ -1,4 +1,5 @@
import asyncio
import logging
from Util.DocxUtil import get_docx_content_by_pandoc
from Util.LightRagUtil import configure_logging, initialize_rag
@ -10,30 +11,40 @@ KEMU = 'ShiJi' # JiHe,Math,SuShi,Chemistry,ShiJi,ChangChun
WORKING_DIR = "./Topic/" + KEMU
docx_file = 'static/Txt/'
# 是不是清空重新生成
IS_CLEAR= False
# 更详细地控制日志输出
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)
async def main():
# 注释掉或删除以下清理代码
files_to_delete = [
"graph_chunk_entity_relation.graphml",
"kv_store_doc_status.json",
"kv_store_full_docs.json",
"kv_store_text_chunks.json",
"vdb_chunks.json",
"vdb_entities.json",
"vdb_relationships.json",
]
# 删除文件
for file in files_to_delete:
file_path = os.path.join(WORKING_DIR, file)
if os.path.exists(file_path):
os.remove(file_path)
print(f"Deleting old file:: {file_path}")
# 清空文件
if IS_CLEAR:
# 注释掉或删除以下清理代码
files_to_delete = [
"graph_chunk_entity_relation.graphml",
"kv_store_doc_status.json",
"kv_store_full_docs.json",
"kv_store_text_chunks.json",
"vdb_chunks.json",
"vdb_entities.json",
"vdb_relationships.json",
]
# 删除文件
for file in files_to_delete:
file_path = os.path.join(WORKING_DIR, file)
if os.path.exists(file_path):
os.remove(file_path)
logger.info(f"删除的文件:: {file_path}")
try:
# 注意默认设置使用NetworkX
rag = await initialize_rag(WORKING_DIR)
# 在docx_file 目录下遍历所有以KEMU开头的文件
for filename in os.listdir(docx_file):
if filename.startswith(KEMU):
@ -41,9 +52,9 @@ async def main():
# 获取docx文件的内容
content = get_docx_content_by_pandoc(file_path)
await rag.ainsert(content, file_paths=[filename])
print(f"Inserted content from {filename}")
logger.info(f"Inserted content from {filename}")
except Exception as e:
print(f"An error occurred: {e}")
logger.error(f"An error occurred: {e}")
finally:
await rag.finalize_storages()

@ -16,7 +16,7 @@
"content_summary": "少年读史记霸王的崛起少年读史记霸王的崛起\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号266061 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号266061 本社网址\n邮购电话 13335059110053285814750兼传真68068026邮购电话\n13335059110053285814750兼传真68068026\n选题策划 谢 蔚选题策划 谢 蔚\n责任编辑 王...",
"content_length": 132645,
"created_at": "2025-07-17T14:12:28.160596+00:00",
"updated_at": "2025-07-17T14:12:30.078774+00:00",
"updated_at": "2025-07-18T00:03:57.164738+00:00",
"file_path": "ShiJi_2.docx"
},
"doc-8d71547c62cb9e0893b184ac60433766": {
@ -26,7 +26,7 @@
"content_summary": "少年读史记 辩士纵横天下少年读史记 辩士纵横天下\n张嘉骅 著张嘉骅 著\n青岛出版社青岛出版社\n图书在版编目CIP数据图书在版编目CIP数据\n书 名 少年读史记3------辩士纵横天下编 著 张嘉骅书 名\n少年读史记3------辩士纵横天下编 著 张嘉骅\n出版发行 青岛出版社出版发行 青岛出版社\n社 址 青岛市海尔路182号266061 本社网址 http://www.qdpub.com社 址\n青岛市海尔路182号266061 本社网址\n邮购电话 13335059110053...",
"content_length": 134992,
"created_at": "2025-07-17T14:12:29.958957+00:00",
"updated_at": "2025-07-17T14:12:30.166903+00:00",
"updated_at": "2025-07-18T00:03:57.208492+00:00",
"file_path": "ShiJi_3.docx"
}
}

@ -51,21 +51,44 @@ def resize_images_in_directory(directory_path, max_width=640, max_height=480):
logger.info(f"已缩放: {file_path} -> {new_size}")
except Exception as e:
logger.error(f"处理 {file_path} 时出错: {str(e)}")
import hashlib
def calculate_docx_md5(docx_file_path):
    """
    Compute the MD5 hash of a docx file's raw bytes.

    The file is hashed in fixed-size chunks so that large documents are
    never loaded into memory in one piece; the resulting digest is identical
    to hashing the whole content at once.

    :param docx_file_path: path to the docx file
    :return: MD5 digest as a lowercase hexadecimal string
    :raises OSError: if the file cannot be opened or read
    """
    # 64 KiB per read keeps memory bounded regardless of file size.
    chunk_size = 64 * 1024
    md5_hash = hashlib.md5()
    # Binary mode: the hash must cover the exact bytes on disk.
    with open(docx_file_path, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
def get_docx_content_by_pandoc(docx_file):
# 最后拼接的内容
content = ""
# output_file 设置为临时目录下的uuid.md
file_name = uuid.uuid4().hex
# 计算 docx_file 的字符串md5值
md5_value = calculate_docx_md5(docx_file)
# 将docx_file去掉扩展名
prefix = docx_file.split(".")[0].split("/")[-1]
temp_markdown = os.path.join('./static/markdown/', prefix + '.md')
# 调用pandoc将docx文件转换成markdown
os.mkdir("./static/Images/" + file_name)
os.mkdir("./static/Images/" + md5_value)
subprocess.run(['pandoc', docx_file, '-f', 'docx', '-t', 'markdown', '-o', temp_markdown,
'--extract-media=./static/Images/' + file_name])
'--extract-media=./static/Images/' + md5_value])
# 遍历目录 './static/Images/'+file_name 下所有的图片缩小于640*480的尺寸上
resize_images_in_directory('./static/Images/' + file_name+'/media')
resize_images_in_directory('./static/Images/' + md5_value+'/media')
# 读取然后修改内容,输出到新的文件
img_idx = 0 # 图片索引
with open(temp_markdown, 'r', encoding='utf-8') as f:

Loading…
Cancel
Save