You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
143 lines
4.4 KiB
143 lines
4.4 KiB
"""
Utility functions for RAGAnything

Contains helper functions for content separation, text insertion, and other utilities.
"""
|
|
|
|
from typing import Dict, List, Any, Tuple
|
|
from lightrag.utils import logger
|
|
|
|
|
|
def separate_content(
    content_list: List[Dict[str, Any]],
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Separate text content and multimodal content

    Items whose ``type`` is ``"text"`` (or that carry no ``type`` key at all)
    contribute their ``text`` value to the merged text. Every other item is
    passed through untouched as a multimodal item. Text entries that are
    missing, ``None``, not a string, or whitespace-only are skipped.

    Args:
        content_list: Content list from MinerU parsing

    Returns:
        (text_content, multimodal_items): Pure text content (text parts joined
        by a blank line) and multimodal items list
    """
    text_parts = []
    multimodal_items = []

    for item in content_list:
        content_type = item.get("type", "text")

        if content_type != "text":
            # Multimodal content (image, table, equation, etc.)
            multimodal_items.append(item)
            continue

        # Text content. The key may be present but hold None (or another
        # non-string value); calling .strip() on it would raise, so such
        # entries are treated the same as empty text.
        text = item.get("text")
        if isinstance(text, str) and text.strip():
            text_parts.append(text)

    # Merge all text content
    text_content = "\n\n".join(text_parts)

    logger.info("Content separation complete:")
    logger.info(f"  - Text content length: {len(text_content)} characters")
    logger.info(f"  - Multimodal items count: {len(multimodal_items)}")

    # Count multimodal types
    modal_types = {}
    for item in multimodal_items:
        modal_type = item.get("type", "unknown")
        modal_types[modal_type] = modal_types.get(modal_type, 0) + 1

    if modal_types:
        logger.info(f"  - Multimodal type distribution: {modal_types}")

    return text_content, multimodal_items
|
|
|
|
|
|
async def insert_text_content(
    lightrag,
    input: str | list[str],
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
    ids: str | list[str] | None = None,
    file_paths: str | list[str] | None = None,
):
    """
    Insert pure text content into LightRAG

    Every argument other than ``lightrag`` is forwarded unchanged, by keyword,
    to ``lightrag.ainsert``; an info message is logged before and after the
    awaited call.

    Args:
        lightrag: LightRAG instance
        input: Single document string or list of document strings
        split_by_character: if not None, the string is split by this character;
            a chunk longer than chunk_token_size is split again by token size.
        split_by_character_only: if True, split the string by character only;
            ignored when split_by_character is None.
        ids: single document ID string or list of unique document IDs; if not
            provided, MD5 hash IDs will be generated
        file_paths: single file path string or list of file paths, used for citation
    """
    logger.info("Starting text content insertion into LightRAG...")

    # Collect the pass-through options once and hand them to LightRAG's
    # insert method as keyword arguments.
    insert_kwargs = {
        "input": input,
        "file_paths": file_paths,
        "split_by_character": split_by_character,
        "split_by_character_only": split_by_character_only,
        "ids": ids,
    }
    await lightrag.ainsert(**insert_kwargs)

    logger.info("Text content insertion complete")
|
|
|
|
|
|
def get_processor_for_type(modal_processors: Dict[str, Any], content_type: str):
    """
    Get appropriate processor based on content type

    Args:
        modal_processors: Dictionary of available processors
        content_type: Content type

    Returns:
        The processor registered under ``content_type`` when it is one of
        "image", "table" or "equation" (``None`` if that key is absent);
        otherwise whatever is registered under "generic" (``None`` if absent).
    """
    # Types with a dedicated processor map straight onto their own key
    for dedicated_type in ("image", "table", "equation"):
        if content_type == dedicated_type:
            return modal_processors.get(dedicated_type)

    # For other types, use generic processor
    return modal_processors.get("generic")
|
|
|
|
|
|
def get_processor_supports(proc_type: str) -> List[str]:
    """Get processor supported features

    Returns the feature descriptions listed for ``proc_type``, or
    ``["Basic processing"]`` when the processor type is not in the table.
    The table is rebuilt on each call, so the returned list is a fresh object.
    """
    features_by_processor = dict(
        image=[
            "Image content analysis",
            "Visual understanding",
            "Image description generation",
            "Image entity extraction",
        ],
        table=[
            "Table structure analysis",
            "Data statistics",
            "Trend identification",
            "Table entity extraction",
        ],
        equation=[
            "Mathematical formula parsing",
            "Variable identification",
            "Formula meaning explanation",
            "Formula entity extraction",
        ],
        generic=[
            "General content analysis",
            "Structured processing",
            "Entity extraction",
        ],
    )

    try:
        return features_by_processor[proc_type]
    except KeyError:
        # Unknown processor type
        return ["Basic processing"]
|