""" Utility functions for RAGAnything Contains helper functions for content separation, text insertion, and other utilities """ from typing import Dict, List, Any, Tuple from lightrag.utils import logger def separate_content( content_list: List[Dict[str, Any]], ) -> Tuple[str, List[Dict[str, Any]]]: """ Separate text content and multimodal content Args: content_list: Content list from MinerU parsing Returns: (text_content, multimodal_items): Pure text content and multimodal items list """ text_parts = [] multimodal_items = [] for item in content_list: content_type = item.get("type", "text") if content_type == "text": # Text content text = item.get("text", "") if text.strip(): text_parts.append(text) else: # Multimodal content (image, table, equation, etc.) multimodal_items.append(item) # Merge all text content text_content = "\n\n".join(text_parts) logger.info("Content separation complete:") logger.info(f" - Text content length: {len(text_content)} characters") logger.info(f" - Multimodal items count: {len(multimodal_items)}") # Count multimodal types modal_types = {} for item in multimodal_items: modal_type = item.get("type", "unknown") modal_types[modal_type] = modal_types.get(modal_type, 0) + 1 if modal_types: logger.info(f" - Multimodal type distribution: {modal_types}") return text_content, multimodal_items async def insert_text_content( lightrag, input: str | list[str], split_by_character: str | None = None, split_by_character_only: bool = False, ids: str | list[str] | None = None, file_paths: str | list[str] | None = None, ): """ Insert pure text content into LightRAG Args: lightrag: LightRAG instance input: Single document string or list of document strings split_by_character: if split_by_character is not None, split the string by character, if chunk longer than chunk_token_size, it will be split again by token size. split_by_character_only: if split_by_character_only is True, split the string by character only, when split_by_character is None, this parameter is ignored. ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated file_paths: single string of the file path or list of file paths, used for citation """ logger.info("Starting text content insertion into LightRAG...") # Use LightRAG's insert method with all parameters await lightrag.ainsert( input=input, file_paths=file_paths, split_by_character=split_by_character, split_by_character_only=split_by_character_only, ids=ids, ) logger.info("Text content insertion complete") def get_processor_for_type(modal_processors: Dict[str, Any], content_type: str): """ Get appropriate processor based on content type Args: modal_processors: Dictionary of available processors content_type: Content type Returns: Corresponding processor instance """ # Direct mapping to corresponding processor if content_type == "image": return modal_processors.get("image") elif content_type == "table": return modal_processors.get("table") elif content_type == "equation": return modal_processors.get("equation") else: # For other types, use generic processor return modal_processors.get("generic") def get_processor_supports(proc_type: str) -> List[str]: """Get processor supported features""" supports_map = { "image": [ "Image content analysis", "Visual understanding", "Image description generation", "Image entity extraction", ], "table": [ "Table structure analysis", "Data statistics", "Trend identification", "Table entity extraction", ], "equation": [ "Mathematical formula parsing", "Variable identification", "Formula meaning explanation", "Formula entity extraction", ], "generic": [ "General content analysis", "Structured processing", "Entity extraction", ], } return supports_map.get(proc_type, ["Basic processing"])