# dsProject/dsRagAnything/Test.py
# Demo script: end-to-end RAGAnything test — parse a local PDF with MinerU,
# build a knowledge base, then run one hybrid-mode text query against it.
# Standard library
import asyncio

# Third-party: LightRAG / RAGAnything
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
from lightrag.utils import EmbeddingFunc
from raganything import RAGAnything, RAGAnythingConfig

# Project-local configuration (API keys, model names, endpoints)
import Config.Config
async def main():
    """Run the RAGAnything demo pipeline end to end.

    Reads LLM / vision / embedding / rerank credentials from ``Config.Config``,
    wires them into a :class:`RAGAnything` instance, parses a local PDF with
    the MinerU parser, and issues one hybrid-mode text query against the
    resulting knowledge base.

    Raises:
        Whatever the underlying parser / OpenAI-compatible endpoints raise on
        failure (network errors, auth errors, parse errors) — nothing is
        swallowed here.
    """
    # API configuration for the default (text) LLM.
    api_key = Config.Config.ALY_LLM_API_KEY
    base_url = Config.Config.ALY_LLM_BASE_URL

    # RAGAnything configuration.
    config = RAGAnythingConfig(
        working_dir="./rag_storage",
        parser="mineru",       # parser backend: mineru or docling
        parse_method="auto",   # parse method: auto, ocr or txt
        enable_image_processing=True,
        enable_table_processing=True,
        enable_equation_processing=True,
    )

    # Text LLM callable handed to RAGAnything.
    # FIX: history_messages previously defaulted to a mutable [] shared
    # across every call; default to None and materialize per call instead.
    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        return openai_complete_if_cache(
            Config.Config.ALY_LLM_MODEL_NAME,
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages if history_messages is not None else [],
            api_key=api_key,
            base_url=base_url,
            **kwargs,
        )

    # Vision model callable used for image processing.
    def vision_model_func(
        prompt,
        system_prompt=None,
        history_messages=None,  # FIX: was a mutable [] default
        image_data=None,
        messages=None,
        **kwargs,
    ):
        # Pre-built multimodal messages (VLM-enhanced query): pass through
        # unchanged to the vision endpoint.
        if messages:
            return openai_complete_if_cache(
                Config.Config.GLM_MODEL_NAME,
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=Config.Config.GLM_API_KEY,
                base_url=Config.Config.GLM_BASE_URL,
                **kwargs,
            )
        # Traditional single-image payload: build the message list ourselves.
        elif image_data:
            # FIX: the original inserted a literal None element into the
            # messages list when no system_prompt was given; only append the
            # system message when one actually exists. The original's inner
            # "if image_data else ..." was also dead code inside this branch.
            built_messages = []
            if system_prompt:
                built_messages.append({"role": "system", "content": system_prompt})
            built_messages.append(
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image_data}"
                            },
                        },
                    ],
                }
            )
            return openai_complete_if_cache(
                Config.Config.GLM_MODEL_NAME,
                "",
                system_prompt=None,
                history_messages=[],
                messages=built_messages,
                api_key=Config.Config.GLM_API_KEY,
                base_url=Config.Config.GLM_BASE_URL,
                **kwargs,
            )
        # Plain-text request: fall back to the text LLM.
        else:
            return llm_model_func(
                prompt,
                system_prompt,
                history_messages if history_messages is not None else [],
                **kwargs,
            )

    # Embedding function used for both ingestion and query.
    embedding_func = EmbeddingFunc(
        embedding_dim=Config.Config.EMBED_DIM,
        max_token_size=Config.Config.EMBED_MAX_TOKEN_SIZE,
        func=lambda texts: openai_embed(
            texts,
            model=Config.Config.EMBED_MODEL_NAME,
            api_key=Config.Config.EMBED_API_KEY,
            base_url=Config.Config.EMBED_BASE_URL,
        ),
    )

    # Optional reranker (currently not wired into RAGAnything — see below).
    from functools import partial
    from lightrag.rerank import cohere_rerank

    rerank_model_func = partial(
        cohere_rerank,
        model=Config.Config.RERANK_MODEL,
        api_key=Config.Config.RERANK_BINDING_API_KEY,
        base_url=Config.Config.RERANK_BASE_URL,
    )

    # Initialize RAGAnything.
    rag = RAGAnything(
        config=config,
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=embedding_func,
        # rerank_model_func=rerank_model_func,  # deliberately disabled for now
    )

    # Parse the document and populate the knowledge base.
    await rag.process_document_complete(
        file_path="./Doc/GeoGebra5经典版指令汇编201903061.pdf",
        output_dir="./output",
        parse_method="auto",
    )

    # Plain-text query — basic hybrid-mode knowledge-base search.
    text_result = await rag.aquery(
        "文档的主要内容是什么?",
        mode="hybrid",
    )
    print("文本查询结果:", text_result)
if __name__ == "__main__":
    # Script entry point: drive the async pipeline to completion.
    asyncio.run(main())