diff --git a/dsLightRag/Tools/T4_Doc.py b/dsLightRag/Tools/T4_Doc.py index 9022b768..ae77a6c3 100644 --- a/dsLightRag/Tools/T4_Doc.py +++ b/dsLightRag/Tools/T4_Doc.py @@ -1,23 +1,19 @@ -import json -import os - +import ijson from Tools.KG_Config import TOPIC # 文件路径 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_doc_status.json" -# 读取并解析JSON文件 +# 使用ijson流式读取JSON文件 with open(file_path, 'r', encoding='utf-8') as f: - doc_status_data = json.load(f) - -# 遍历文档状态信息 -for doc_id, status_info in doc_status_data.items(): - print(f"文档ID: {doc_id}") - print(f"状态: {status_info['status']}") - print(f"分块数量: {status_info['chunks_count']}") - print(f"内容摘要: {status_info['content_summary'][:100]}...") # 打印前100字符 - print(f"内容长度: {status_info['content_length']}字符") - #print(f"创建时间: {status_info['created_at']}") - #print(f"更新时间: {status_info['updated_at']}") - #print(f"文件路径: {status_info['file_path']}") - print("---") \ No newline at end of file + # 流式迭代文档ID和状态信息 + for doc_id, status_info in ijson.kvitems(f, ''): + print(f"文档ID: {doc_id}") + print(f"状态: {status_info['status']}") + print(f"分块数量: {status_info['chunks_count']}") + print(f"内容摘要: {status_info['content_summary'][:100]}...") # 打印前100字符 + print(f"内容长度: {status_info['content_length']}字符") + #print(f"创建时间: {status_info['created_at']}") + #print(f"更新时间: {status_info['updated_at']}") + #print(f"文件路径: {status_info['file_path']}") + print("---") \ No newline at end of file diff --git a/dsLightRag/Tools/T5_Chunk.py b/dsLightRag/Tools/T5_Chunk.py index 96b85f90..8df16fe7 100644 --- a/dsLightRag/Tools/T5_Chunk.py +++ b/dsLightRag/Tools/T5_Chunk.py @@ -1,24 +1,26 @@ -import json +import ijson +import os from Tools.KG_Config import TOPIC # 文件路径 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_chunks.json" -# 读取并解析JSON文件 -with open(file_path, 'r', encoding='utf-8') as f: - data = json.load(f) - -# 提取所有块信息 -chunks = data.get('data', []) +# 检查文件是否存在 +if not os.path.exists(file_path): + raise FileNotFoundError(f"文件不存在: {file_path}") -# 打印块数量和详细信息 -print(f"共找到 {len(chunks)} 个块:") -for i, chunk in enumerate(chunks, 1): - print(f"块 {i}:") - print(f"ID: {chunk.get('__id__')}") - print(f"创建时间: {chunk.get('__created_at__')}") - print(f"文档ID: {chunk.get('full_doc_id')}") - print(f"文件路径: {chunk.get('file_path')}") - print(f"内容预览: {chunk.get('content', '')}") # 显示前100字符 - print("---") \ No newline at end of file +# 使用ijson流式读取JSON文件 +with open(file_path, 'r', encoding='utf-8') as f: + # 流式迭代所有块 + chunks = ijson.items(f, 'data.item') + chunk_list = list(chunks) + print(f"共找到 {len(chunk_list)} 个块:") + for i, chunk in enumerate(chunk_list, 1): + print(f"块 {i}:") + print(f"ID: {chunk.get('__id__')}") + print(f"创建时间: {chunk.get('__created_at__')}") + print(f"文档ID: {chunk.get('full_doc_id')}") + print(f"文件路径: {chunk.get('file_path')}") + print(f"内容预览: {chunk.get('content', '')}") # 显示前100字符 + print("---") \ No newline at end of file diff --git a/dsLightRag/Tools/T6_Doc_Chunks.py b/dsLightRag/Tools/T6_Doc_Chunks.py index f307ac65..1a0945fd 100644 --- a/dsLightRag/Tools/T6_Doc_Chunks.py +++ b/dsLightRag/Tools/T6_Doc_Chunks.py @@ -1,20 +1,24 @@ -import json +import ijson +import os from Tools.KG_Config import TOPIC # 文件路径 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_text_chunks.json" -# 读取并解析JSON文件 -with open(file_path, 'r', encoding='utf-8') as f: - text_chunks = json.load(f) +# 检查文件是否存在 +if not os.path.exists(file_path): + raise FileNotFoundError(f"文件不存在: {file_path}") -# 遍历所有文本块 -for chunk_id, chunk_info in text_chunks.items(): - print(f"块ID: {chunk_id}") - print(f"所属文档ID: {chunk_info['full_doc_id']}") - print(f"块序号: {chunk_info['chunk_order_index']}") - print(f"Token数量: {chunk_info['tokens']}") - #print(f"内容预览: {chunk_info['content'][:100]}...") # 打印前100字符 - print(f"来源文件: {chunk_info['file_path']}") - print("---") \ No newline at end of file +# 使用ijson流式读取JSON文件 +with open(file_path, 'r', encoding='utf-8') as f: + # 流式迭代所有文本块 + print("正在读取文本块...") + for chunk_id, chunk_info in ijson.kvitems(f, ''): + print(f"块ID: {chunk_id}") + print(f"所属文档ID: {chunk_info['full_doc_id']}") + print(f"块序号: {chunk_info['chunk_order_index']}") + print(f"Token数量: {chunk_info['tokens']}") + #print(f"内容预览: {chunk_info['content'][:100]}...") # 打印前100字符 + print(f"来源文件: {chunk_info['file_path']}") + print("---") \ No newline at end of file diff --git a/dsLightRag/Tools/T7_Entity.py b/dsLightRag/Tools/T7_Entity.py index b6a83b1e..4364c863 100644 --- a/dsLightRag/Tools/T7_Entity.py +++ b/dsLightRag/Tools/T7_Entity.py @@ -1,18 +1,22 @@ -import json +import ijson +import os from Tools.KG_Config import TOPIC # 文件路径 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_entities.json" -# 读取并解析JSON文件 -with open(file_path, 'r', encoding='utf-8') as f: - data = json.load(f) - -# 提取所有实体名称 -entities = [item['entity_name'] for item in data['data']] +# 检查文件是否存在 +if not os.path.exists(file_path): + raise FileNotFoundError(f"文件不存在: {file_path}") -# 打印实体列表 -print("文件中的实体列表:") -for entity in entities: - print(f"- {entity}") \ No newline at end of file +# 使用ijson流式读取JSON文件 +with open(file_path, 'r', encoding='utf-8') as f: + # 流式迭代所有实体名称 + print("正在读取实体...") + entity_names = ijson.items(f, 'data.item.entity_name') + entity_list = list(entity_names) + print(f"共找到 {len(entity_list)} 个实体:") + print("文件中的实体列表:") + for entity in entity_list: + print(f"- {entity}") \ No newline at end of file diff --git a/dsLightRag/Tools/T8_Relationships.py b/dsLightRag/Tools/T8_Relationships.py index a5f8c8a8..6439b991 100644 --- a/dsLightRag/Tools/T8_Relationships.py +++ b/dsLightRag/Tools/T8_Relationships.py @@ -1,4 +1,4 @@ -import json +import ijson import os from Tools.KG_Config import TOPIC @@ -9,15 +9,21 @@ def parse_relationships(file_path): if not os.path.exists(file_path): raise FileNotFoundError(f"文件不存在: {file_path}") - # 读取并解析JSON文件 + # 读取embedding_dim + embedding_dim = "未知维度" with open(file_path, 'r', encoding='utf-8') as f: - data = json.load(f) + embedding_dim = next(ijson.items(f, 'embedding_dim'), "未知维度") + + # 读取关系数据 + relationships = [] + with open(file_path, 'r', encoding='utf-8') as f: + relationships = list(ijson.items(f, 'data.item')) # 提取关键信息 result = { - "embedding_dim": data.get("embedding_dim", "未知维度"), - "relationship_count": len(data.get("data", [])), - "sample_relationships": data.get("data", [])[:3] # 显示前3条示例 + "embedding_dim": embedding_dim, + "relationship_count": len(relationships), + "sample_relationships": relationships[:3] # 显示前3条示例 } return result