You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
253 lines
9.3 KiB
253 lines
9.3 KiB
"""
|
|
Complete MinerU parsing + multimodal content insertion Pipeline
|
|
|
|
This script integrates:
|
|
1. MinerU document parsing
|
|
2. Pure text content LightRAG insertion
|
|
3. Specialized processing for multimodal content (using different processors)
|
|
"""
|
|
|
|
import os
|
|
from typing import Dict, Any, Optional, Callable
|
|
import sys
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
# Add project root directory to Python path
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from lightrag import LightRAG
|
|
from lightrag.utils import logger
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables from .env file
|
|
# The OS environment variables take precedence over the .env file
|
|
load_dotenv(dotenv_path=".env", override=False)
|
|
|
|
# Import configuration and modules
|
|
from raganything.config import RAGAnythingConfig
|
|
from raganything.query import QueryMixin
|
|
from raganything.processor import ProcessorMixin
|
|
from raganything.batch import BatchMixin
|
|
from raganything.utils import get_processor_supports
|
|
from raganything.mineru_parser import MineruParser
|
|
|
|
# Import specialized processors
|
|
from raganything.modalprocessors import (
|
|
ImageModalProcessor,
|
|
TableModalProcessor,
|
|
EquationModalProcessor,
|
|
GenericModalProcessor,
|
|
)
|
|
|
|
|
|
@dataclass
|
|
class RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):
|
|
"""Multimodal Document Processing Pipeline - Complete document parsing and insertion pipeline"""
|
|
|
|
# Core Components
|
|
# ---
|
|
lightrag: Optional[LightRAG] = field(default=None)
|
|
"""Optional pre-initialized LightRAG instance."""
|
|
|
|
llm_model_func: Optional[Callable] = field(default=None)
|
|
"""LLM model function for text analysis."""
|
|
|
|
vision_model_func: Optional[Callable] = field(default=None)
|
|
"""Vision model function for image analysis."""
|
|
|
|
embedding_func: Optional[Callable] = field(default=None)
|
|
"""Embedding function for text vectorization."""
|
|
|
|
config: Optional[RAGAnythingConfig] = field(default=None)
|
|
"""Configuration object, if None will create with environment variables."""
|
|
|
|
# Internal State
|
|
# ---
|
|
modal_processors: Dict[str, Any] = field(default_factory=dict, init=False)
|
|
"""Dictionary of multimodal processors."""
|
|
|
|
def __post_init__(self):
|
|
"""Post-initialization setup following LightRAG pattern"""
|
|
# Initialize configuration if not provided
|
|
if self.config is None:
|
|
self.config = RAGAnythingConfig()
|
|
|
|
# Set working directory
|
|
self.working_dir = self.config.working_dir
|
|
|
|
# Set up logger (use existing logger, don't configure it)
|
|
self.logger = logger
|
|
|
|
# Create working directory if needed
|
|
if not os.path.exists(self.working_dir):
|
|
os.makedirs(self.working_dir)
|
|
self.logger.info(f"Created working directory: {self.working_dir}")
|
|
|
|
# If LightRAG is provided, initialize processors immediately
|
|
if self.lightrag is not None:
|
|
self._initialize_processors()
|
|
|
|
# Log configuration info
|
|
self.logger.info("RAGAnything initialized with config:")
|
|
self.logger.info(f" Working directory: {self.config.working_dir}")
|
|
self.logger.info(f" MinerU parse method: {self.config.mineru_parse_method}")
|
|
self.logger.info(
|
|
f" Multimodal processing - Image: {self.config.enable_image_processing}, "
|
|
f"Table: {self.config.enable_table_processing}, "
|
|
f"Equation: {self.config.enable_equation_processing}"
|
|
)
|
|
self.logger.info(f" Max concurrent files: {self.config.max_concurrent_files}")
|
|
|
|
def _initialize_processors(self):
|
|
"""Initialize multimodal processors with appropriate model functions"""
|
|
if self.lightrag is None:
|
|
raise ValueError(
|
|
"LightRAG instance must be initialized before creating processors"
|
|
)
|
|
|
|
# Create different multimodal processors based on configuration
|
|
self.modal_processors = {}
|
|
|
|
if self.config.enable_image_processing:
|
|
self.modal_processors["image"] = ImageModalProcessor(
|
|
lightrag=self.lightrag,
|
|
modal_caption_func=self.vision_model_func or self.llm_model_func,
|
|
)
|
|
|
|
if self.config.enable_table_processing:
|
|
self.modal_processors["table"] = TableModalProcessor(
|
|
lightrag=self.lightrag, modal_caption_func=self.llm_model_func
|
|
)
|
|
|
|
if self.config.enable_equation_processing:
|
|
self.modal_processors["equation"] = EquationModalProcessor(
|
|
lightrag=self.lightrag, modal_caption_func=self.llm_model_func
|
|
)
|
|
|
|
# Always include generic processor as fallback
|
|
self.modal_processors["generic"] = GenericModalProcessor(
|
|
lightrag=self.lightrag, modal_caption_func=self.llm_model_func
|
|
)
|
|
|
|
self.logger.info("Multimodal processors initialized")
|
|
self.logger.info(f"Available processors: {list(self.modal_processors.keys())}")
|
|
|
|
def update_config(self, **kwargs):
|
|
"""Update configuration with new values"""
|
|
for key, value in kwargs.items():
|
|
if hasattr(self.config, key):
|
|
setattr(self.config, key, value)
|
|
self.logger.debug(f"Updated config: {key} = {value}")
|
|
else:
|
|
self.logger.warning(f"Unknown config parameter: {key}")
|
|
|
|
async def _ensure_lightrag_initialized(self):
|
|
"""Ensure LightRAG instance is initialized, create if necessary"""
|
|
if self.lightrag is not None:
|
|
return
|
|
|
|
# Check MinerU 2.0 installation
|
|
if not MineruParser.check_installation():
|
|
raise RuntimeError(
|
|
"MinerU 2.0 is not properly installed. "
|
|
"Please install it using: pip install -U 'mineru[core]' "
|
|
"or uv pip install -U 'mineru[core]'"
|
|
)
|
|
|
|
# Validate required functions
|
|
if self.llm_model_func is None:
|
|
raise ValueError(
|
|
"llm_model_func must be provided when LightRAG is not pre-initialized"
|
|
)
|
|
if self.embedding_func is None:
|
|
raise ValueError(
|
|
"embedding_func must be provided when LightRAG is not pre-initialized"
|
|
)
|
|
|
|
from lightrag.kg.shared_storage import initialize_pipeline_status
|
|
|
|
# Create LightRAG instance with provided functions
|
|
self.lightrag = LightRAG(
|
|
working_dir=self.working_dir,
|
|
llm_model_func=self.llm_model_func,
|
|
embedding_func=self.embedding_func,
|
|
)
|
|
|
|
await self.lightrag.initialize_storages()
|
|
await initialize_pipeline_status()
|
|
|
|
# Initialize processors after LightRAG is ready
|
|
self._initialize_processors()
|
|
|
|
self.logger.info("LightRAG and multimodal processors initialized")
|
|
|
|
def check_mineru_installation(self) -> bool:
|
|
"""
|
|
Check if MinerU 2.0 is properly installed
|
|
|
|
Returns:
|
|
bool: True if MinerU 2.0 is properly installed
|
|
"""
|
|
return MineruParser.check_installation()
|
|
|
|
def get_config_info(self) -> Dict[str, Any]:
|
|
"""Get current configuration information"""
|
|
return {
|
|
"directory": {
|
|
"working_dir": self.config.working_dir,
|
|
"mineru_output_dir": self.config.mineru_output_dir,
|
|
},
|
|
"parsing": {
|
|
"mineru_parse_method": self.config.mineru_parse_method,
|
|
"display_content_stats": self.config.display_content_stats,
|
|
},
|
|
"multimodal_processing": {
|
|
"enable_image_processing": self.config.enable_image_processing,
|
|
"enable_table_processing": self.config.enable_table_processing,
|
|
"enable_equation_processing": self.config.enable_equation_processing,
|
|
},
|
|
"batch_processing": {
|
|
"max_concurrent_files": self.config.max_concurrent_files,
|
|
"supported_file_extensions": self.config.supported_file_extensions,
|
|
"recursive_folder_processing": self.config.recursive_folder_processing,
|
|
},
|
|
"logging": {
|
|
"note": "Logging fields have been removed - configure logging externally",
|
|
},
|
|
}
|
|
|
|
def get_processor_info(self) -> Dict[str, Any]:
|
|
"""Get processor information"""
|
|
base_info = {
|
|
"mineru_installed": MineruParser.check_installation(),
|
|
"config": self.get_config_info(),
|
|
"models": {
|
|
"llm_model": "External function"
|
|
if self.llm_model_func
|
|
else "Not provided",
|
|
"vision_model": "External function"
|
|
if self.vision_model_func
|
|
else "Not provided",
|
|
"embedding_model": "External function"
|
|
if self.embedding_func
|
|
else "Not provided",
|
|
},
|
|
}
|
|
|
|
if not self.modal_processors:
|
|
base_info["status"] = "Not initialized"
|
|
base_info["processors"] = {}
|
|
else:
|
|
base_info["status"] = "Initialized"
|
|
base_info["processors"] = {}
|
|
|
|
for proc_type, processor in self.modal_processors.items():
|
|
base_info["processors"][proc_type] = {
|
|
"class": processor.__class__.__name__,
|
|
"supports": get_processor_supports(proc_type),
|
|
"enabled": True,
|
|
}
|
|
|
|
return base_info
|