'commit'

2025-08-26 14:29:13 +08:00
parent 3244f43ddf
commit b6fad4b349
4 changed files with 232 additions and 1 deletions
--- a/dsRagAnything/T1_Train.py
+++ b/dsRagAnything/T1_Train.py
@@ -0,0 +1,154 @@
+import asyncio
+import logging
+
+from raganything import RAGAnything, RAGAnythingConfig
+from lightrag.llm.openai import openai_complete_if_cache, openai_embed
+from lightrag.utils import EmbeddingFunc
+from logging.handlers import RotatingFileHandler# 导入RotatingFileHandler用于日志轮转
+import Config.Config
+
+# Root logger at INFO so that records from every child logger are captured.
+root_logger = logging.getLogger()
+root_logger.setLevel(logging.INFO)
+_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+# Give the root logger a console handler unless one is already installed.
+if not root_logger.handlers:
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter(_LOG_FORMAT))
+    root_logger.addHandler(handler)
+
+# Dedicated 'ragAnything' logger writing to the console and a rotating file.
+logger = logging.getLogger('ragAnything')
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    # This logger gets its own console handler below, so stop its records
+    # from also reaching the root handler; otherwise each one prints twice.
+    logger.propagate = False
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
+    logger.addHandler(console_handler)
+    # Rotate at 200 KB keeping up to 5 backups; delay=True postpones creating
+    # the log file until the first record is actually written.
+    file_handler = RotatingFileHandler(
+        'lightrag.log', maxBytes=200 * 1024, backupCount=5,
+        encoding='utf-8', delay=True,
+    )
+    file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
+    logger.addHandler(file_handler)
+
+
+async def main():
+    """Index one PDF with RAGAnything, then run a sample hybrid query.
+
+    Builds the text-LLM, vision-LLM and embedding callables from the values in
+    ``Config.Config``, processes the hard-coded PDF (working_dir
+    ``./rag_storage``, output_dir ``./output``) and prints one query result.
+    """
+    # Credentials and endpoint of the text LLM.
+    api_key = Config.Config.ALY_LLM_API_KEY
+    base_url = Config.Config.ALY_LLM_BASE_URL
+
+    config = RAGAnythingConfig(
+        working_dir="./rag_storage",
+        parser="mineru",  # parser backend: mineru or docling
+        parse_method="auto",  # parsing method: auto, ocr or txt
+        enable_image_processing=True,
+        enable_table_processing=True,
+        enable_equation_processing=True,
+    )
+
+    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
+        """Forward a text-only completion request to the configured LLM."""
+        return openai_complete_if_cache(
+            Config.Config.ALY_LLM_MODEL_NAME,
+            prompt,
+            system_prompt=system_prompt,
+            # The default is None (not a shared mutable list); normalise here.
+            history_messages=history_messages or [],
+            api_key=api_key,
+            base_url=base_url,
+            **kwargs,
+        )
+
+    def vision_model_func(
+            prompt, system_prompt=None, history_messages=None, image_data=None, messages=None, **kwargs
+    ):
+        """Route a request to the vision model, or to the text LLM if no image.
+
+        ``messages`` (a ready-made multimodal chat payload) takes precedence
+        over ``image_data`` (base64 text embedded in a JPEG data URL).
+        """
+        if not messages and not image_data:
+            # Plain-text request: delegate to the text LLM.
+            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
+
+        if not messages:
+            # Legacy single-image format: build the chat payload ourselves.
+            # Only add the system turn when a system prompt exists; a None
+            # entry in the list is not a valid chat message.
+            messages = []
+            if system_prompt:
+                messages.append({"role": "system", "content": system_prompt})
+            messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{image_data}"
+                            },
+                        },
+                    ],
+                }
+            )
+
+        return openai_complete_if_cache(
+            Config.Config.GLM_MODEL_NAME,
+            "",
+            system_prompt=None,
+            history_messages=[],
+            messages=messages,
+            api_key=Config.Config.GLM_API_KEY,
+            base_url=Config.Config.GLM_BASE_URL,
+            **kwargs,
+        )
+
+    # Embedding callable built on openai_embed with the configured model.
+    embedding_func = EmbeddingFunc(
+        embedding_dim=Config.Config.EMBED_DIM,
+        max_token_size=Config.Config.EMBED_MAX_TOKEN_SIZE,
+        func=lambda texts: openai_embed(
+            texts,
+            model=Config.Config.EMBED_MODEL_NAME,
+            api_key=Config.Config.EMBED_API_KEY,
+            base_url=Config.Config.EMBED_BASE_URL,
+        ),
+    )
+
+    rag = RAGAnything(
+        config=config,
+        llm_model_func=llm_model_func,
+        vision_model_func=vision_model_func,
+        embedding_func=embedding_func
+    )
+
+    # Parse and index the document.
+    await rag.process_document_complete(
+        file_path="./Doc/GeoGebra5经典版指令汇编201903061.pdf",
+        output_dir="./output",
+        parse_method="auto"
+    )
+
+    # Text-only query against the knowledge base that was just built.
+    text_result = await rag.aquery(
+        "文档的主要内容是什么？",
+        mode="hybrid"
+    )
+    print("文本查询结果:", text_result)
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    asyncio.run(main())