2025-09-12 21:57:36 +08:00
parent 86e262461c
commit 6162ceddb7
7 changed files with 3694 additions and 0 deletions


@@ -3,3 +3,22 @@ EXCEL_PATH = r'D:\dsWork\YunNanProject\Doc\数据库-2015-2024-v2.xlsx'
# Static resource path for ECharts
ONLINE_HOST = "https://gcore.jsdelivr.net/npm/echarts@6.0.0/dist/"
# DeepSeek LLM (official DeepSeek endpoint); use this one when training/ingesting
LLM_API_KEY = "sk-44ae895eeb614aa1a9c6460579e322f1"
LLM_BASE_URL = "https://api.deepseek.com"
LLM_MODEL_NAME = "deepseek-chat"
# LLM_MODEL_NAME = "deepseek-reasoner"
# Embedding model
EMBED_MODEL_NAME = "BAAI/bge-m3"
EMBED_API_KEY = "sk-pbqibyjwhrgmnlsmdygplahextfaclgnedetybccknxojlyl"
EMBED_BASE_URL = "https://api.siliconflow.cn/v1"
EMBED_DIM = 1024
EMBED_MAX_TOKEN_SIZE = 8192
# Rerank model
RERANK_MODEL = 'BAAI/bge-reranker-v2-m3'
RERANK_BASE_URL = 'https://api.siliconflow.cn/v1/rerank'
RERANK_BINDING_API_KEY = 'sk-pbqibyjwhrgmnlsmdygplahextfaclgnedetybccknxojlyl'
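
For orientation: these are plain module-level constants, imported directly by Util/LightRagUtil.py further down in this commit. A minimal usage sketch (names exactly as defined above):

from Config.Config import LLM_MODEL_NAME, LLM_BASE_URL, EMBED_MODEL_NAME, EMBED_DIM

print(f"LLM: {LLM_MODEL_NAME} @ {LLM_BASE_URL}")
print(f"Embedding: {EMBED_MODEL_NAME}, {EMBED_DIM}-dim vectors")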

Rag/T1_Train.py Normal file

@@ -0,0 +1,55 @@
import asyncio
import logging
import os

from Util.LightRagUtil import initialize_rag

# Working directory for the knowledge-base files
WORKING_DIR = "./KB"
# Whether to clear the existing storage and rebuild from scratch
IS_CLEAR = False

# Finer-grained control over log output
logger = logging.getLogger('lightrag')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)


async def main():
    # Remove previously generated storage files
    if IS_CLEAR:
        files_to_delete = [
            "graph_chunk_entity_relation.graphml",
            "kv_store_doc_status.json",
            "kv_store_full_docs.json",
            "kv_store_text_chunks.json",
            "vdb_chunks.json",
            "vdb_entities.json",
            "vdb_relationships.json",
        ]
        for file in files_to_delete:
            file_path = os.path.join(WORKING_DIR, file)
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.info(f"Deleted file: {file_path}")

    rag = None
    try:
        # Note: the default setup uses NetworkX for graph storage
        rag = await initialize_rag(WORKING_DIR)
        file_path = 'Txt/YunNan.txt'
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        await rag.ainsert(content, file_paths=[file_path])
        logger.info(f"Inserted content from {file_path}")
    except Exception as e:
        logger.error(f"An error occurred: {e}")
    finally:
        # Guard against initialize_rag failing before rag is assigned
        if rag:
            await rag.finalize_storages()


asyncio.run(main())
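
Since the default graph storage is NetworkX (see initialize_rag in Util/LightRagUtil.py below), the graph built by this ingest can be inspected offline. A minimal sketch, assuming the run above has written KB/graph_chunk_entity_relation.graphml; the entity_type attribute name is an assumption about LightRAG's GraphML output, hence the .get() fallback:

import networkx as nx

# Load the knowledge graph produced by the ingest above (NetworkXStorage default)
g = nx.read_graphml("./KB/graph_chunk_entity_relation.graphml")
print(f"{g.number_of_nodes()} entities, {g.number_of_edges()} relations")
# Peek at a few entities; 'entity_type' is an assumed attribute name
for node, attrs in list(g.nodes(data=True))[:5]:
    print(node, attrs.get("entity_type", "?"))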

Rag/T2_Query.py Normal file

@@ -0,0 +1,46 @@
import asyncio
import inspect

from Util.LightRagUtil import configure_logging, initialize_rag, print_stream
from lightrag import QueryParam

# Test questions, one per subject
data = [
    # {"NAME": "Chemistry", "Q": "硝酸光照分解的化学反应方程式是什么", "ChineseName": "化学"},  # photodecomposition of nitric acid under light
    {"NAME": "Chemistry", "Q": "氢气与氧气燃烧的现象", "ChineseName": "化学"},  # what is observed when hydrogen burns in oxygen
    {"NAME": "Math", "Q": "氧化铁与硝酸的化学反应方程式是什么", "ChineseName": "数学"},  # equation for iron(III) oxide reacting with nitric acid
    {"NAME": "Chinese", "Q": "氧化铁与硝酸的化学反应方程式是什么", "ChineseName": "语文"},  # same question against the Chinese-language KB
    {"NAME": "JiHe", "Q": "三角形两边之和大于第三边的证明", "ChineseName": "几何"}  # proof that two sides of a triangle sum to more than the third
]
# Subject to query, e.g. "Chemistry" or "JiHe"
KEMU = "JiHe"
# Index of the selected subject
idx = [i for i, d in enumerate(data) if d["NAME"] == KEMU][0]


async def main():
    rag = None
    try:
        # Extra rules appended to the query prompt:
        # 1. Output chemical equations exactly as the LaTeX given in the source; never alter the LaTeX.
        # 2. If the source provides images, output them verbatim; never omit them.
        # 3. Judge whether retrieved knowledge is relevant to this question; never output irrelevant material.
        user_prompt = "\n 1、资料中提供化学反应方程式的一定要严格按提供的Latex公式输出绝对不允许对Latex公式进行修改 "
        user_prompt = user_prompt + "\n 2、如果资料中提供了图片的一定要严格按照原文提供图片输出不允许省略或不输出"
        user_prompt = user_prompt + "\n 3、资料中提到的知识内容需要判断是否与本次问题相关不相关的绝对不要输出"
        rag = await initialize_rag('Topic/' + data[idx]["NAME"])
        resp = await rag.aquery(
            data[idx]["Q"],
            # mode: "hybrid" or "naive"
            param=QueryParam(mode="hybrid", stream=True, user_prompt=user_prompt),
        )
        if inspect.isasyncgen(resp):
            await print_stream(resp)
        else:
            print(resp)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # rag is None if initialize_rag raised, so guard before finalizing
        if rag:
            await rag.finalize_storages()


if __name__ == "__main__":
    configure_logging()
    asyncio.run(main())
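
For a non-streaming run, the same call works with stream=False; resp then arrives as a plain string and falls through to the print(resp) branch above. A minimal sketch of the variant inside main():

resp = await rag.aquery(
    data[idx]["Q"],
    # "naive" skips graph retrieval; "hybrid" is what the script uses above
    param=QueryParam(mode="naive", stream=False, user_prompt=user_prompt),
)
print(resp)  # a str here, so inspect.isasyncgen(resp) is False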

Rag/__init__.py Normal file (empty)

Txt/YunNan.txt Normal file (3387 lines)

File diff suppressed because it is too large

Util/LightRagUtil.py Normal file

@@ -0,0 +1,187 @@
import logging
import logging.config
import os

# Note: pin to 1.4.6 for now; the rerank function needs changes after upgrading.
# pip uninstall "lightrag-hku[api]"
# pip install "lightrag-hku[api]==1.4.6"
import numpy as np
from lightrag import LightRAG
from lightrag.kg.shared_storage import initialize_pipeline_status
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.rerank import custom_rerank
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug

from Config.Config import EMBED_DIM, EMBED_MAX_TOKEN_SIZE, RERANK_MODEL, RERANK_BASE_URL, RERANK_BINDING_API_KEY, \
    LLM_MODEL_NAME, LLM_API_KEY, LLM_BASE_URL, EMBED_MODEL_NAME, EMBED_API_KEY, EMBED_BASE_URL

async def print_stream(stream):
    async for chunk in stream:
        if chunk:
            print(chunk, end="", flush=True)

def configure_logging():
    # Reset any handlers/filters that uvicorn or lightrag may have installed
    for logger_name in ["uvicorn", "uvicorn.access", "uvicorn.error", "lightrag"]:
        logger_instance = logging.getLogger(logger_name)
        logger_instance.handlers = []
        logger_instance.filters = []

    log_dir = os.getenv("LOG_DIR", os.getcwd())
    log_file_path = os.path.abspath(os.path.join(log_dir, "./Logs/lightrag.log"))
    print(f"\nLightRAG log file: {log_file_path}\n")
    # Create the directory the log file lives in (was dirname(log_dir),
    # which never created the Logs folder)
    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # 10 MB
    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))

    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {
                    "format": "%(levelname)s: %(message)s",
                },
                "detailed": {
                    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                },
            },
            "handlers": {
                "console": {
                    "formatter": "default",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
                "file": {
                    "formatter": "detailed",
                    "class": "logging.handlers.RotatingFileHandler",
                    "filename": log_file_path,
                    "maxBytes": log_max_bytes,
                    "backupCount": log_backup_count,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                "lightrag": {
                    "handlers": ["console", "file"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
        }
    )
    logger.setLevel(logging.INFO)
    set_verbose_debug(os.getenv("VERBOSE_DEBUG", "false").lower() == "true")

async def llm_model_func(
        prompt, system_prompt=None, history_messages=None, **kwargs
) -> str:
    return await openai_complete_if_cache(
        os.getenv("LLM_MODEL", LLM_MODEL_NAME),
        prompt,
        system_prompt=system_prompt,
        # Never forward None: openai_complete_if_cache extends this list
        history_messages=history_messages or [],
        api_key=LLM_API_KEY,
        base_url=LLM_BASE_URL,
        **kwargs,
    )

async def embedding_func(texts: list[str]) -> np.ndarray:
    return await openai_embed(
        texts,
        model=EMBED_MODEL_NAME,
        api_key=EMBED_API_KEY,
        base_url=EMBED_BASE_URL,
    )

async def rerank_func(query: str, documents: list, top_k: int = None, **kwargs):
    return await custom_rerank(
        query=query,
        documents=documents,
        model=RERANK_MODEL,
        base_url=RERANK_BASE_URL,
        api_key=RERANK_BINDING_API_KEY,
        top_k=top_k or 10,
        **kwargs,
    )

async def initialize_rag(working_dir, graph_storage=None):
    if graph_storage is None:
        graph_storage = 'NetworkXStorage'
    rag = LightRAG(
        working_dir=working_dir,
        llm_model_func=llm_model_func,
        graph_storage=graph_storage,
        embedding_func=EmbeddingFunc(
            embedding_dim=EMBED_DIM,
            max_token_size=EMBED_MAX_TOKEN_SIZE,
            func=embedding_func
        ),
        rerank_model_func=rerank_func,
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag
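
A minimal end-to-end sketch of initialize_rag, condensing what T1_Train.py and T2_Query.py above do in separate scripts:

import asyncio
from lightrag import QueryParam

async def demo():
    # Defaults to NetworkXStorage when graph_storage is not given
    rag = await initialize_rag("./KB")
    try:
        await rag.ainsert("some document text", file_paths=["demo.txt"])
        print(await rag.aquery("a question", param=QueryParam(mode="hybrid")))
    finally:
        await rag.finalize_storages()

asyncio.run(demo())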

def create_llm_model_func():
    # history_messages defaults to None rather than a mutable [] default
    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        return openai_complete_if_cache(
            LLM_MODEL_NAME,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            api_key=LLM_API_KEY,
            base_url=LLM_BASE_URL,
            **kwargs,
        )
    return llm_model_func

def create_embedding_func():
    return EmbeddingFunc(
        embedding_dim=EMBED_DIM,
        max_token_size=EMBED_MAX_TOKEN_SIZE,
        func=lambda texts: openai_embed(
            texts,
            model=EMBED_MODEL_NAME,
            api_key=EMBED_API_KEY,
            base_url=EMBED_BASE_URL,
        ),
    )
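
The two factories above return plain callables, so they wire straight into a LightRAG constructor, same shape as initialize_rag; a brief sketch:

rag = LightRAG(
    working_dir="./KB",
    llm_model_func=create_llm_model_func(),
    embedding_func=create_embedding_func(),
)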

async def initialize_pg_rag(WORKING_DIR, workspace='default'):
    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=llm_model_func,
        llm_model_name=LLM_MODEL_NAME,
        llm_model_max_async=4,
        llm_model_max_token_size=32768,
        enable_llm_cache_for_entity_extract=True,
        embedding_func=EmbeddingFunc(
            embedding_dim=EMBED_DIM,
            max_token_size=EMBED_MAX_TOKEN_SIZE,
            func=embedding_func
        ),
        rerank_model_func=rerank_func,
        kv_storage="PGKVStorage",
        doc_status_storage="PGDocStatusStorage",
        graph_storage="PGGraphStorage",
        vector_storage="PGVectorStorage",
        auto_manage_storages_states=False,
        vector_db_storage_cls_kwargs={"workspace": workspace}
    )
    await rag.initialize_storages()
    await initialize_pipeline_status()
    return rag
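
A usage sketch for the PG-backed variant. One assumption worth flagging: the PG storage backends are expected to pick up connection settings from the environment (names like POSTGRES_HOST / POSTGRES_PORT / POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DATABASE); this commit does not show them, so verify the exact names against the pinned lightrag-hku 1.4.6:

import asyncio

async def demo_pg():
    # Assumes POSTGRES_* connection settings are already set in the environment
    rag = await initialize_pg_rag("./KB", workspace="yunnan")
    try:
        await rag.ainsert("some document text")
    finally:
        await rag.finalize_storages()

asyncio.run(demo_pg())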