@ -1,223 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Example script demonstrating the integration of MinerU parser with RAGAnything
|
||||
|
||||
This example shows how to:
|
||||
1. Process parsed documents with RAGAnything
|
||||
2. Perform multimodal queries on the processed documents
|
||||
3. Handle different types of content (text, images, tables)
|
||||
"""
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import logging.config
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root directory to Python path
|
||||
import sys
|
||||
|
||||
sys.path.append(str(Path(__file__).parent.parent))
|
||||
|
||||
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
||||
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
|
||||
from raganything import RAGAnything, RAGAnythingConfig
|
||||
|
||||
|
||||
def configure_logging():
    """Configure console and rotating-file logging for the application.

    Reads optional environment variables:
        LOG_DIR          directory that receives the log file (default: CWD)
        LOG_MAX_BYTES    max bytes per log file before rotation (default 10 MB)
        LOG_BACKUP_COUNT number of rotated backups to keep (default 5)
        VERBOSE          "true" enables verbose debug output
    """
    # Get log directory path from environment variable or use current directory
    log_dir = os.getenv("LOG_DIR", os.getcwd())
    log_file_path = os.path.abspath(os.path.join(log_dir, "raganything_example.log"))

    print(f"\nRAGAnything example log file: {log_file_path}\n")
    # BUG FIX: the original called os.makedirs(os.path.dirname(log_dir), ...),
    # which creates the *parent* of log_dir rather than the directory the log
    # file actually lives in. The file handler below would then fail to open
    # the file when log_dir itself did not exist.
    os.makedirs(log_dir, exist_ok=True)

    # Rotation parameters from environment variables
    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups

    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {
                    "format": "%(levelname)s: %(message)s",
                },
                "detailed": {
                    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                },
            },
            "handlers": {
                "console": {
                    "formatter": "default",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
                "file": {
                    "formatter": "detailed",
                    "class": "logging.handlers.RotatingFileHandler",
                    "filename": log_file_path,
                    "maxBytes": log_max_bytes,
                    "backupCount": log_backup_count,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                "lightrag": {
                    "handlers": ["console", "file"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
        }
    )

    # Set the shared lightrag logger level to INFO
    logger.setLevel(logging.INFO)
    # Enable verbose debug if requested via the environment
    set_verbose_debug(os.getenv("VERBOSE", "false").lower() == "true")
|
||||
|
||||
|
||||
async def process_with_rag(
    file_path: str,
    output_dir: str,
    working_dir: str = None,
):
    """
    Process a document with RAGAnything and run example multimodal queries.

    Args:
        file_path: Path to the document to ingest
        output_dir: Output directory for parsed results
        working_dir: Working directory for RAG storage (default "./rag_storage")
    """
    try:
        # Create RAGAnything configuration
        config = RAGAnythingConfig(
            working_dir=working_dir or "./rag_storage",
            mineru_parse_method="auto",
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
        )

        # SECURITY FIX: the original embedded live API keys directly in source.
        # Credentials are now read from the environment; only non-secret
        # endpoint URLs keep their original values as defaults.
        llm_api_key = os.getenv("DEEPSEEK_API_KEY", "")
        llm_base_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
        vision_api_key = os.getenv("SILICONFLOW_API_KEY", "")
        vision_base_url = os.getenv(
            "SILICONFLOW_BASE_URL", "https://api.siliconflow.cn/v1/chat/completions"
        )
        embed_base_url = os.getenv(
            "SILICONFLOW_EMBED_URL", "https://api.siliconflow.cn/v1/embeddings"
        )

        # Define LLM model function.
        # BUG FIX: history_messages used a mutable default ([]), which is
        # shared across calls; use None and substitute a fresh list instead.
        def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
            return openai_complete_if_cache(
                "deepseek-chat",
                prompt,
                system_prompt=system_prompt,
                history_messages=history_messages or [],
                api_key=llm_api_key,
                base_url=llm_base_url,
                **kwargs,
            )

        # Define vision model function for image processing
        def vision_model_func(
            prompt, system_prompt=None, history_messages=None, image_data=None, **kwargs
        ):
            if not image_data:
                # No image attached: delegate to the plain text LLM
                return llm_model_func(
                    prompt, system_prompt, history_messages or [], **kwargs
                )
            # BUG FIX: the original inserted a literal None entry into the
            # messages list when no system prompt was given; build the list
            # conditionally so every entry is a real message dict.
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append(
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image_data}"
                            },
                        },
                    ],
                }
            )
            return openai_complete_if_cache(
                "GLM-4.1V-9B-Thinking",
                "",
                system_prompt=None,
                history_messages=[],
                messages=messages,
                api_key=vision_api_key,
                base_url=vision_base_url,
                **kwargs,
            )

        # Define embedding function
        embedding_func = EmbeddingFunc(
            # NOTE(review): BAAI/bge-m3 typically emits 1024-dim vectors, not
            # 3072 — confirm this dimension against the embedding endpoint.
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: openai_embed(
                texts,
                model="BAAI/bge-m3",
                api_key=vision_api_key,
                base_url=embed_base_url,
            ),
        )

        # Initialize RAGAnything with the dataclass-based configuration
        rag = RAGAnything(
            config=config,
            llm_model_func=llm_model_func,
            vision_model_func=vision_model_func,
            embedding_func=embedding_func,
        )

        # Parse and ingest the document end-to-end
        await rag.process_document_complete(
            file_path=file_path, output_dir=output_dir, parse_method="auto"
        )

        # Example queries exercising text, image, and table content
        queries = [
            "What is the main content of the document?",
            "Describe the images and figures in the document",
            "Tell me about the experimental results and data tables",
        ]

        logger.info("\nQuerying processed document:")
        for query in queries:
            logger.info(f"\nQuery: {query}")
            result = await rag.query_with_multimodal(query, mode="hybrid")
            logger.info(f"Answer: {result}")

    except Exception as e:
        # Top-level boundary for the example: log the failure with traceback
        logger.error(f"Error processing with RAG: {str(e)}")
        import traceback

        logger.error(traceback.format_exc())
|
||||
|
||||
|
||||
def main():
    """Parse command-line arguments and run the RAG processing pipeline.

    The defaults reproduce the original hard-coded paths, so running the
    script with no arguments behaves exactly as before; `argparse` was
    already imported at file top but previously unused.
    """
    parser = argparse.ArgumentParser(
        description="Process a document with the RAGAnything multimodal pipeline"
    )
    parser.add_argument(
        "--file-path",
        default="../Txt/黄海的个人简历.txt",
        help="Path to the document to process",
    )
    parser.add_argument(
        "--output",
        default="../Txt/output",
        help="Output directory for parsed results",
    )
    parser.add_argument(
        "--working-dir",
        default="../Txt/working_dir",
        help="Working directory for RAG storage",
    )
    args = parser.parse_args()

    # Process with RAG
    asyncio.run(process_with_rag(args.file_path, args.output, args.working_dir))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Logging must be configured before any pipeline output is produced
    configure_logging()

    banner = "=" * 30
    print("RAGAnything Example")
    print(banner)
    print("Processing document with multimodal RAG pipeline")
    print(banner)

    main()
|
@ -1,82 +0,0 @@
|
||||
import asyncio
|
||||
from raganything import RAGAnything
|
||||
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
||||
from lightrag.utils import EmbeddingFunc
|
||||
|
||||
async def main():
    """Initialize RAGAnything, ingest a sample PDF, and run a text query."""
    # Local import keeps this edit self-contained; os is stdlib.
    import os

    # SECURITY FIX: the original embedded live API keys directly in source.
    # Credentials now come from the environment; non-secret endpoint URLs
    # keep their original values as defaults.
    llm_api_key = os.getenv("DEEPSEEK_API_KEY", "")
    llm_base_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
    vision_api_key = os.getenv("SILICONFLOW_API_KEY", "")
    vision_base_url = os.getenv(
        "SILICONFLOW_BASE_URL", "https://api.siliconflow.cn/v1/chat/completions"
    )
    embed_base_url = os.getenv(
        "SILICONFLOW_EMBED_URL", "https://api.siliconflow.cn/v1/embeddings"
    )

    # Plain-text completion via DeepSeek.
    # BUG FIX: the original lambdas used a mutable default ([]) for
    # history_messages, shared across calls; named functions with a None
    # sentinel replace the unreadable conditional lambdas.
    def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
        return openai_complete_if_cache(
            "deepseek-chat",
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages or [],
            api_key=llm_api_key,
            base_url=llm_base_url,
            **kwargs,
        )

    # Vision completion; falls back to a text-only call when no image is given
    def vision_model_func(
        prompt, system_prompt=None, history_messages=None, image_data=None, **kwargs
    ):
        if not image_data:
            # Matches the original's else-branch: same model, text-only call
            return openai_complete_if_cache(
                "GLM-4.1V-9B-Thinking",
                prompt,
                system_prompt=system_prompt,
                history_messages=history_messages or [],
                api_key=vision_api_key,
                base_url=vision_base_url,
                **kwargs,
            )
        # BUG FIX: the original inserted a literal None entry into messages
        # when no system prompt was given; build the list conditionally.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                    },
                ],
            }
        )
        return openai_complete_if_cache(
            "GLM-4.1V-9B-Thinking",
            "",
            system_prompt=None,
            history_messages=[],
            messages=messages,
            api_key=vision_api_key,
            base_url=vision_base_url,
            **kwargs,
        )

    # Initialize RAGAnything
    rag = RAGAnything(
        llm_model_func=llm_model_func,
        vision_model_func=vision_model_func,
        embedding_func=EmbeddingFunc(
            # NOTE(review): BAAI/bge-m3 typically emits 1024-dim vectors, not
            # 3072 — confirm this dimension against the embedding endpoint.
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: openai_embed(
                texts,
                model="BAAI/bge-m3",
                api_key=vision_api_key,
                base_url=embed_base_url,
            ),
        ),
    )

    # Process a document
    await rag.process_document_complete(
        file_path="../Txt/黄琬乔2023蓝桥杯省赛准考证.pdf",
        output_dir="./output",
        parse_method="auto"
    )

    # Query the processed content
    # Pure text query - for basic knowledge base search
    text_result = await rag.aquery(
        "这篇文档中说了什么内容?",
        mode="hybrid"
    )
    print("Text query result:", text_result)

    # # Multimodal query with specific multimodal content
    # multimodal_result = await rag.aquery_with_multimodal(
    #     "Explain this formula and its relevance to the document content",
    #     multimodal_content=[{
    #         "type": "equation",
    #         "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
    #         "equation_caption": "Document relevance probability"
    #     }],
    #     mode="hybrid"
    # )
    # print("Multimodal query result:", multimodal_result)
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: drive the async example to completion
    asyncio.run(main())
|
Before Width: | Height: | Size: 87 KiB |
Before Width: | Height: | Size: 3.8 KiB |
Before Width: | Height: | Size: 18 KiB |
Before Width: | Height: | Size: 3.4 KiB |
Before Width: | Height: | Size: 3.4 KiB |
Before Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 9.4 KiB |
Before Width: | Height: | Size: 17 KiB |
Before Width: | Height: | Size: 3.6 KiB |
Before Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 2.7 KiB |
Before Width: | Height: | Size: 4.9 KiB |
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 3.2 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 1.5 KiB |
Before Width: | Height: | Size: 4.7 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 5.3 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 4.3 KiB |
Before Width: | Height: | Size: 8.6 KiB |
Before Width: | Height: | Size: 3.0 KiB |
Before Width: | Height: | Size: 3.0 KiB |
Before Width: | Height: | Size: 10 KiB |
Before Width: | Height: | Size: 3.1 KiB |
Before Width: | Height: | Size: 23 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 28 KiB |
Before Width: | Height: | Size: 22 KiB |
Before Width: | Height: | Size: 4.2 KiB |
Before Width: | Height: | Size: 3.5 KiB |
Before Width: | Height: | Size: 25 KiB |
Before Width: | Height: | Size: 5.0 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 20 KiB |
Before Width: | Height: | Size: 60 KiB |
Before Width: | Height: | Size: 1.8 KiB |
Before Width: | Height: | Size: 7.0 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 5.4 KiB |
Before Width: | Height: | Size: 2.6 KiB |
Before Width: | Height: | Size: 3.0 KiB |
Before Width: | Height: | Size: 7.8 KiB |
Before Width: | Height: | Size: 8.9 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 30 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 8.2 KiB |
Before Width: | Height: | Size: 2.4 KiB |
Before Width: | Height: | Size: 5.1 KiB |
Before Width: | Height: | Size: 7.9 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 3.3 KiB |
Before Width: | Height: | Size: 11 KiB |
Before Width: | Height: | Size: 3.3 KiB |
Before Width: | Height: | Size: 25 KiB |
Before Width: | Height: | Size: 1.6 KiB |
Before Width: | Height: | Size: 2.0 KiB |
Before Width: | Height: | Size: 5.0 KiB |
Before Width: | Height: | Size: 2.8 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 2.1 KiB |
Before Width: | Height: | Size: 23 KiB |
Before Width: | Height: | Size: 4.1 KiB |
Before Width: | Height: | Size: 24 KiB |
Before Width: | Height: | Size: 17 KiB |
Before Width: | Height: | Size: 2.7 KiB |
Before Width: | Height: | Size: 1.3 KiB |
Before Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 4.2 KiB |
Before Width: | Height: | Size: 2.3 KiB |
Before Width: | Height: | Size: 82 KiB |
Before Width: | Height: | Size: 2.2 KiB |
Before Width: | Height: | Size: 2.8 KiB |
Before Width: | Height: | Size: 7.9 KiB |
Before Width: | Height: | Size: 9.1 KiB |
Before Width: | Height: | Size: 2.3 KiB |
Before Width: | Height: | Size: 2.6 KiB |
Before Width: | Height: | Size: 13 KiB |
Before Width: | Height: | Size: 1.9 KiB |
Before Width: | Height: | Size: 19 KiB |
Before Width: | Height: | Size: 3.7 KiB |
Before Width: | Height: | Size: 1.4 KiB |
Before Width: | Height: | Size: 21 KiB |
Before Width: | Height: | Size: 17 KiB |
Before Width: | Height: | Size: 19 KiB |