You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

143 lines
4.4 KiB

"""
Utility functions for RAGAnything
Contains helper functions for content separation, text insertion, and other utilities
"""
from typing import Dict, List, Any, Tuple
from lightrag.utils import logger
def separate_content(
    content_list: List[Dict[str, Any]],
) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Separate text content and multimodal content

    Items whose "type" is "text" (or that have no "type" key) contribute their
    "text" value to the merged text, provided it is not blank. Every other item
    is passed through unchanged as a multimodal item. A summary of the result
    is written to the logger.

    Args:
        content_list: Content list from MinerU parsing

    Returns:
        (text_content, multimodal_items): Pure text content (non-blank text
        parts joined by a blank line) and multimodal items list
    """
    text_parts = []
    multimodal_items = []

    for item in content_list:
        if item.get("type", "text") != "text":
            # Multimodal content (image, table, equation, etc.)
            multimodal_items.append(item)
            continue

        # The "text" key may be present but hold None; treat that as empty
        # text instead of failing on None.strip().
        text = item.get("text") or ""
        if text.strip():
            text_parts.append(text)

    # Merge all text content
    text_content = "\n\n".join(text_parts)

    logger.info("Content separation complete:")
    logger.info(f"  - Text content length: {len(text_content)} characters")
    logger.info(f"  - Multimodal items count: {len(multimodal_items)}")

    # Count multimodal types (insertion order follows first appearance)
    modal_types = {}
    for item in multimodal_items:
        modal_type = item.get("type", "unknown")
        modal_types[modal_type] = modal_types.get(modal_type, 0) + 1

    if modal_types:
        logger.info(f"  - Multimodal type distribution: {modal_types}")

    return text_content, multimodal_items
async def insert_text_content(
lightrag,
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: str | list[str] | None = None,
file_paths: str | list[str] | None = None,
):
"""
Insert pure text content into LightRAG
Args:
lightrag: LightRAG instance
input: Single document string or list of document strings
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
chunk_token_size, it will be split again by token size.
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
file_paths: single string of the file path or list of file paths, used for citation
"""
logger.info("Starting text content insertion into LightRAG...")
# Use LightRAG's insert method with all parameters
await lightrag.ainsert(
input=input,
file_paths=file_paths,
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
ids=ids,
)
logger.info("Text content insertion complete")
def get_processor_for_type(modal_processors: Dict[str, Any], content_type: str):
    """
    Look up the processor registered for a content type

    "image", "table" and "equation" each map to the processor stored under the
    same key; any other content type maps to the "generic" processor.

    Args:
        modal_processors: Dictionary of available processors
        content_type: Content type

    Returns:
        The processor stored under the selected key, or None when that key is
        absent from modal_processors
    """
    # Types that have a dedicated processor slot of the same name
    for dedicated_type in ("image", "table", "equation"):
        if content_type == dedicated_type:
            return modal_processors.get(dedicated_type)

    # Everything else is handled by the generic processor
    return modal_processors.get("generic")
def get_processor_supports(proc_type: str) -> List[str]:
    """
    List the feature descriptions for a processor type

    Args:
        proc_type: Processor type ("image", "table", "equation" or "generic")

    Returns:
        The feature descriptions for proc_type, or ["Basic processing"] when
        the type is not one of the known ones
    """
    # Built on every call, so callers always receive a fresh list
    features_by_processor = dict(
        image=[
            "Image content analysis",
            "Visual understanding",
            "Image description generation",
            "Image entity extraction",
        ],
        table=[
            "Table structure analysis",
            "Data statistics",
            "Trend identification",
            "Table entity extraction",
        ],
        equation=[
            "Mathematical formula parsing",
            "Variable identification",
            "Formula meaning explanation",
            "Formula entity extraction",
        ],
        generic=[
            "General content analysis",
            "Structured processing",
            "Entity extraction",
        ],
    )

    try:
        return features_by_processor[proc_type]
    except KeyError:
        # Unknown processor type
        return ["Basic processing"]