2025-08-26 13:33:43 +08:00
|
|
|
|
import asyncio
|
2025-08-26 13:54:12 +08:00
|
|
|
|
import logging
|
2025-08-26 13:45:10 +08:00
|
|
|
|
|
2025-08-26 13:33:43 +08:00
|
|
|
|
from raganything import RAGAnything, RAGAnythingConfig
|
|
|
|
|
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
|
|
|
|
from lightrag.utils import EmbeddingFunc
|
2025-08-26 16:05:06 +08:00
|
|
|
|
from logging.handlers import RotatingFileHandler # 导入RotatingFileHandler用于日志轮转
|
|
|
|
|
|
2025-08-26 13:39:51 +08:00
|
|
|
|
import Config.Config
|
|
|
|
|
|
2025-08-26 16:50:27 +08:00
|
|
|
|
# Logging configuration.
# One format string shared by every handler configured below.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# 'lightrag' logger: INFO level, echoed to the console.
root_logger = logging.getLogger('lightrag')
root_logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(_LOG_FORMAT))
root_logger.addHandler(handler)

# Keep the original 'ragAnything' logger configuration as well.
logger = logging.getLogger('ragAnything')
logger.setLevel(logging.INFO)

# Console output handler.
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
logger.addHandler(console_handler)

# Rotating file handler (each file capped at roughly 200 KB).
file_handler = RotatingFileHandler(
    filename='lightrag.log',
    maxBytes=200 * 1024,  # 200 KB
    backupCount=5,  # keep at most 5 backup files
    encoding='utf-8',
    delay=True,  # do not create the file until the first record is emitted
)
file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
logger.addHandler(file_handler)
|
2025-08-26 13:59:48 +08:00
|
|
|
|
|
2025-08-26 13:39:51 +08:00
|
|
|
|
|
2025-08-26 16:05:06 +08:00
|
|
|
|
async def train(file_path, output_dir, working_dir):
    """Process one document with RAGAnything.

    Builds the text-LLM, vision-LLM and embedding callables from the values in
    ``Config.Config``, creates a ``RAGAnything`` instance and awaits
    ``process_document_complete`` for the given file.

    Args:
        file_path: Path of the document to process.
        output_dir: Forwarded to ``process_document_complete`` as ``output_dir``.
        working_dir: Forwarded to ``RAGAnythingConfig`` as ``working_dir``.
    """
    # API settings for the text LLM.
    api_key = Config.Config.ALY_LLM_API_KEY
    base_url = Config.Config.ALY_LLM_BASE_URL

    # RAGAnything configuration.
    config = RAGAnythingConfig(
        working_dir=working_dir,
        parser="mineru",  # parser choice: mineru or docling
        parse_method="auto",  # parse method: auto, ocr or txt
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        """Text completion through the configured text LLM endpoint."""
        # None default instead of a shared mutable list; a list is still
        # what gets forwarded, exactly as before.
        if history_messages is None:
            history_messages = []
        return openai_complete_if_cache(
            Config.Config.ALY_LLM_MODEL_NAME,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    def vision_model_func(
        prompt,
        system_prompt=None,
        history_messages=None,
        image_data=None,
        messages=None,
        **kwargs,
    ):
        """Vision completion used for image processing.

        Dispatches on the arguments: pre-built ``messages`` are sent as-is,
        a single ``image_data`` payload is wrapped into a user message with an
        embedded data URL, and anything else falls back to ``llm_model_func``.
        """
        # Pre-built messages (multimodal VLM-enhanced queries): use directly.
        if messages:
            return openai_complete_if_cache(
                Config.Config.GLM_MODEL_NAME,
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=Config.Config.GLM_API_KEY,
                base_url=Config.Config.GLM_BASE_URL,
                **kwargs,
            )

        # Traditional single-image format.
        if image_data:
            # Only add the system message when there is one, so the list
            # never contains a None placeholder entry.
            image_messages = []
            if system_prompt:
                image_messages.append({"role": "system", "content": system_prompt})
            image_messages.append(
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image_data}"
                            },
                        },
                    ],
                }
            )
            return openai_complete_if_cache(
                Config.Config.GLM_MODEL_NAME,
                "",
                system_prompt=None,
                history_messages=[],
                messages=image_messages,
                api_key=Config.Config.GLM_API_KEY,
                base_url=Config.Config.GLM_BASE_URL,
                **kwargs,
            )

        # Plain-text format.
        return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    # Embedding function.
    embedding_func = EmbeddingFunc(
        embedding_dim=Config.Config.EMBED_DIM,
        max_token_size=Config.Config.EMBED_MAX_TOKEN_SIZE,
        func=lambda texts: openai_embed(
            texts,
            model=Config.Config.EMBED_MODEL_NAME,
            api_key=Config.Config.EMBED_API_KEY,
            base_url=Config.Config.EMBED_BASE_URL,
        ),
    )

    # Initialize RAGAnything.
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Process the document.
    await rag.process_document_complete(
        file_path=file_path,
        output_dir=output_dir,
        parse_method="auto",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Path of the document to ingest.
    file_path = "./Doc/黄琬乔2023蓝桥杯省赛准考证.pdf"
    # Directory for the temporary files generated by MinerU.
    output_dir = "./Output"
    # Directory that holds the LightRAG database.
    working_dir = "./Topic/HuangWanQiao"

    # Start training.
    asyncio.run(
        train(
            file_path=file_path,
            output_dir=output_dir,
            working_dir=working_dir,
        )
    )
|