# dsProject/dsRagAnything/T1_Train.py

import asyncio
import logging

from raganything import RAGAnything, RAGAnythingConfig
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
from logging.handlers import RotatingFileHandler  # 导入RotatingFileHandler用于日志轮转

import Config.Config

# One layout shared by every handler configured in this module.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# --- 'lightrag' logger -------------------------------------------------------
# Despite the variable name, this is the named 'lightrag' logger, not the
# logging root. It emits INFO and above through a stream handler.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(_LOG_FORMAT))
root_logger = logging.getLogger('lightrag')
root_logger.setLevel(logging.INFO)
root_logger.addHandler(handler)

# --- 'ragAnything' logger ----------------------------------------------------
# Keeps its own configuration: a console handler plus a size-capped rotating
# file, both at INFO and above.
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter(_LOG_FORMAT))

file_handler = RotatingFileHandler(
    filename='lightrag.log',
    maxBytes=200 * 1024,  # roll over at roughly 200 KB
    backupCount=5,  # keep at most five rotated backups
    encoding='utf-8',
    delay=True,  # do not create the file until the first record is written
)
file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))

logger = logging.getLogger('ragAnything')
logger.setLevel(logging.INFO)
logger.addHandler(console_handler)  # console first, then file (same order as before)
logger.addHandler(file_handler)


async def train(file_path, output_dir, working_dir):
    """Process one document with RAGAnything and await completion.

    Builds a ``RAGAnythingConfig`` (MinerU parser, automatic parse method,
    image/table/equation processing enabled), wires up the text LLM, the
    vision model and the embedding function from ``Config.Config``, and then
    awaits ``RAGAnything.process_document_complete`` for ``file_path``.

    Args:
        file_path: Path of the document to process.
        output_dir: Forwarded to ``process_document_complete`` as its
            ``output_dir`` (the caller in this file uses it for MinerU's
            temporary files).
        working_dir: Forwarded to ``RAGAnythingConfig`` as ``working_dir``
            (the caller in this file uses it for the LightRAG database).

    Returns:
        None. Exceptions raised by the underlying libraries are not caught.
    """
    # Credentials and endpoint for the text LLM.
    api_key = Config.Config.ALY_LLM_API_KEY
    base_url = Config.Config.ALY_LLM_BASE_URL

    # RAGAnything configuration.
    config = RAGAnythingConfig(
        working_dir=working_dir,
        parser="mineru",  # parser choice: mineru or docling
        parse_method="auto",  # parse method: auto, ocr or txt
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        """Text completion against the configured ALY LLM endpoint."""
        # A fresh list per call replaces the former shared mutable default.
        return openai_complete_if_cache(
            Config.Config.ALY_LLM_MODEL_NAME,
            prompt,
            system_prompt=system_prompt,
            history_messages=[] if history_messages is None else history_messages,
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    def vision_model_func(
            prompt,
            system_prompt=None,
            history_messages=None,
            image_data=None,
            messages=None,
            **kwargs,
    ):
        """Route a request to the vision model, or fall back to the text LLM.

        Priority: a pre-built ``messages`` list, then a single base64 image in
        ``image_data``, otherwise a plain-text call through ``llm_model_func``.
        """
        # Pre-built multimodal messages (VLM-enhanced queries): pass through as-is.
        if messages:
            return openai_complete_if_cache(
                Config.Config.GLM_MODEL_NAME,
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=Config.Config.GLM_API_KEY,
                base_url=Config.Config.GLM_BASE_URL,
                **kwargs,
            )

        # Legacy single-image format.
        if image_data:
            # Only add the system entry when there is a system prompt; the
            # previous list literal put a bare ``None`` into the payload when
            # it was missing, which is not a valid chat message.
            image_messages = []
            if system_prompt:
                image_messages.append({"role": "system", "content": system_prompt})
            image_messages.append(
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            # NOTE(review): MIME type is hard-coded to JPEG
                            # regardless of the real image format — confirm.
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image_data}"
                            },
                        },
                    ],
                }
            )
            return openai_complete_if_cache(
                Config.Config.GLM_MODEL_NAME,
                "",
                system_prompt=None,
                history_messages=[],
                messages=image_messages,
                api_key=Config.Config.GLM_API_KEY,
                base_url=Config.Config.GLM_BASE_URL,
                **kwargs,
            )

        # Plain text: delegate to the text LLM.
        return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

    # Embedding function backed by the configured embedding endpoint.
    embedding_func = EmbeddingFunc(
        embedding_dim=Config.Config.EMBED_DIM,
        max_token_size=Config.Config.EMBED_MAX_TOKEN_SIZE,
        func=lambda texts: openai_embed(
            texts,
            model=Config.Config.EMBED_MODEL_NAME,
            api_key=Config.Config.EMBED_API_KEY,
            base_url=Config.Config.EMBED_BASE_URL,
        ),
    )

    # Assemble the RAGAnything instance.
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
    )

    # Process the document end to end.
    await rag.process_document_complete(
        file_path=file_path,
        output_dir=output_dir,
        parse_method="auto",
    )


if __name__ == "__main__":
    # Document to ingest.
    file_path = "./Doc/GeoGebra.pdf"

    # Directory holding the LightRAG database for this topic.
    # (Alternative topic used previously: "./Topic/HuangWanQiao".)
    working_dir = "./Topic/Geogebra"

    # Directory for the temporary files MinerU generates.
    output_dir = "./Output"

    # Run the training coroutine to completion.
    asyncio.run(
        train(
            file_path=file_path,
            output_dir=output_dir,
            working_dir=working_dir,
        )
    )
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								import asyncio
-												'commit'

											
										
										
											2025-08-26 13:54:12 +08:00
+								import logging
-												'commit'

											
										
										
											2025-08-26 13:45:10 +08:00
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								from raganything import RAGAnything, RAGAnythingConfig
 								from lightrag.llm.openai import openai_complete_if_cache, openai_embed
 								from lightrag.utils import EmbeddingFunc
-												'commit'

											
										
										
											2025-08-26 16:05:06 +08:00
+								from logging.handlers import RotatingFileHandler  # 导入RotatingFileHandler用于日志轮转
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								import Config.Config
-												'commit'

											
										
										
											2025-08-26 16:50:27 +08:00
+								# 控制日志输出
 								root_logger = logging.getLogger('lightrag')
-												'commit'

											
										
										
											2025-08-26 13:59:48 +08:00
+								root_logger.setLevel(logging.INFO)
-												'commit'

											
										
										
											2025-08-26 16:50:27 +08:00
+								handler = logging.StreamHandler()
 								handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
 								root_logger.addHandler(handler)
-												'commit'

											
										
										
											2025-08-26 13:59:48 +08:00
 								# 同时保持原有的ragAnything日志记录器配置
-												'commit'

											
										
										
											2025-08-26 13:54:12 +08:00
+								logger = logging.getLogger('ragAnything')
 								logger.setLevel(logging.INFO)
-												'commit'

											
										
										
											2025-08-26 16:05:06 +08:00
+								# 控制台输出处理器
 								console_handler = logging.StreamHandler()
 								console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
 								logger.addHandler(console_handler)
 								# 循环滚动文件处理器（控制在200K左右）
 								file_handler = RotatingFileHandler(
 								    'lightrag.log',
 								    maxBytes=200 * 1024,  # 200KB
 								    backupCount=5,  # 最多保留5个备份文件
 								    encoding='utf-8',
 								    delay=True  # 延迟创建文件，直到有日志输出
 								)
 								file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
 								logger.addHandler(file_handler)
-												'commit'

											
										
										
											2025-08-26 13:59:48 +08:00
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
-												'commit'

											
										
										
											2025-08-26 16:05:06 +08:00
+								async def train(file_path, output_dir, working_dir):
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								    # 设置 API 配置
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								    api_key = Config.Config.ALY_LLM_API_KEY
 								    base_url = Config.Config.ALY_LLM_BASE_URL
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
 								    # 创建 RAGAnything 配置
 								    config = RAGAnythingConfig(
-												'commit'

											
										
										
											2025-08-26 16:05:06 +08:00
+								        working_dir=working_dir,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								        parser="mineru",  # 选择解析器：mineru 或 docling
 								        parse_method="auto",  # 解析方法：auto, ocr 或 txt
 								        enable_image_processing=True,
 								        enable_table_processing=True,
 								        enable_equation_processing=True,
 								    )
 								    # 定义 LLM 模型函数
 								    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
 								        return openai_complete_if_cache(
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								            Config.Config.ALY_LLM_MODEL_NAME,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								            prompt,
 								            system_prompt=system_prompt,
 								            history_messages=history_messages,
 								            api_key=api_key,
 								            base_url=base_url,
 								            **kwargs,
 								        )
 								    # 定义视觉模型函数用于图像处理
 								    def vision_model_func(
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								            prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								    ):
 								        # 如果提供了messages格式（用于多模态VLM增强查询），直接使用
 								        if messages:
 								            return openai_complete_if_cache(
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								                Config.Config.GLM_MODEL_NAME,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								                "",
 								                system_prompt=None,
 								                history_messages=[],
 								                messages=messages,
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								                api_key=Config.Config.GLM_API_KEY,
 								                base_url=Config.Config.GLM_BASE_URL,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								                **kwargs,
 								            )
 								        # 传统单图片格式
 								        elif image_data:
 								            return openai_complete_if_cache(
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								                Config.Config.GLM_MODEL_NAME,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								                "",
 								                system_prompt=None,
 								                history_messages=[],
 								                messages=[
 								                    {"role": "system", "content": system_prompt}
 								                    if system_prompt
 								                    else None,
 								                    {
 								                        "role": "user",
 								                        "content": [
 								                            {"type": "text", "text": prompt},
 								                            {
 								                                "type": "image_url",
 								                                "image_url": {
 								                                    "url": f"data:image/jpeg;base64,{image_data}"
 								                                },
 								                            },
 								                        ],
 								                    }
 								                    if image_data
 								                    else {"role": "user", "content": prompt},
 								                ],
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								                api_key=Config.Config.GLM_API_KEY,
 								                base_url=Config.Config.GLM_BASE_URL,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								                **kwargs,
 								            )
 								        # 纯文本格式
 								        else:
 								            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
 								    # 定义嵌入函数
 								    embedding_func = EmbeddingFunc(
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								        embedding_dim=Config.Config.EMBED_DIM,
 								        max_token_size=Config.Config.EMBED_MAX_TOKEN_SIZE,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								        func=lambda texts: openai_embed(
 								            texts,
-												'commit'

											
										
										
											2025-08-26 13:39:51 +08:00
+								            model=Config.Config.EMBED_MODEL_NAME,
 								            api_key=Config.Config.EMBED_API_KEY,
 								            base_url=Config.Config.EMBED_BASE_URL,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								        ),
 								    )
 								    # 初始化 RAGAnything
 								    rag = RAGAnything(
 								        config=config,
 								        llm_model_func=llm_model_func,
 								        vision_model_func=vision_model_func,
-												'commit'

											
										
										
											2025-08-26 13:51:45 +08:00
+								        embedding_func=embedding_func
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								    )
 								    # 处理文档
 								    await rag.process_document_complete(
-												'commit'

											
										
										
											2025-08-26 16:05:06 +08:00
+								        file_path=file_path,
 								        output_dir=output_dir,
-												'commit'

											
										
										
											2025-08-26 13:33:43 +08:00
+								        parse_method="auto"
 								    )
 								if __name__ == "__main__":
-												'commit'

											
										
										
											2025-08-26 16:20:16 +08:00
+								    # MinerU生成的临时文件目录
 								    output_dir = "./Output"
-												'commit'

											
										
										
											2025-08-26 16:13:57 +08:00
+								    # LightRag的数据库所在目录
-												'commit'

											
										
										
											2025-08-26 16:54:24 +08:00
+								    #working_dir = "./Topic/HuangWanQiao"
 								    working_dir = "./Topic/Geogebra"
-												'commit'

											
										
										
											2025-08-26 16:13:57 +08:00
-												'commit'

											
										
										
											2025-08-26 16:12:27 +08:00
+								    # 文档路径
-												'commit'

											
										
										
											2025-08-26 16:54:24 +08:00
+								    file_path = "./Doc/GeoGebra.pdf"
-												'commit'

											
										
										
											2025-08-26 16:20:16 +08:00
-												'commit'

											
										
										
											2025-08-26 16:12:27 +08:00
+								    # 开始训练
-												'commit'

											
										
										
											2025-08-26 16:05:06 +08:00
+								    asyncio.run(train(file_path, output_dir, working_dir))