'commit'

6 days ago · 153b2dadc1
parent 8878361126
commit 153b2dadc1
5 changed files with 76 additions and 64 deletions
--- a/dsLightRag/Tools/T4_Doc.py
+++ b/dsLightRag/Tools/T4_Doc.py
@@ -1,23 +1,19 @@
-import json
-import os
-
+import ijson
 from Tools.KG_Config import TOPIC

 # 文件路径
 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_doc_status.json"

-# 读取并解析JSON文件
+# 使用ijson流式读取JSON文件
 with open(file_path, 'r', encoding='utf-8') as f:
-    doc_status_data = json.load(f)
-
-# 遍历文档状态信息
-for doc_id, status_info in doc_status_data.items():
-    print(f"文档ID: {doc_id}")
-    print(f"状态: {status_info['status']}")
-    print(f"分块数量: {status_info['chunks_count']}")
-    print(f"内容摘要: {status_info['content_summary'][:100]}...")  # 打印前100字符
-    print(f"内容长度: {status_info['content_length']}字符")
-    #print(f"创建时间: {status_info['created_at']}")
-    #print(f"更新时间: {status_info['updated_at']}")
-    #print(f"文件路径: {status_info['file_path']}")
-    print("---")
+    # 流式迭代文档ID和状态信息
+    for doc_id, status_info in ijson.kvitems(f, ''):
+        print(f"文档ID: {doc_id}")
+        print(f"状态: {status_info['status']}")
+        print(f"分块数量: {status_info['chunks_count']}")
+        print(f"内容摘要: {status_info['content_summary'][:100]}...")  # 打印前100字符
+        print(f"内容长度: {status_info['content_length']}字符")
+        #print(f"创建时间: {status_info['created_at']}")
+        #print(f"更新时间: {status_info['updated_at']}")
+        #print(f"文件路径: {status_info['file_path']}")
+        print("---")
--- a/dsLightRag/Tools/T5_Chunk.py
+++ b/dsLightRag/Tools/T5_Chunk.py
@@ -1,24 +1,26 @@
-import json
+import ijson
+import os

 from Tools.KG_Config import TOPIC

 # 文件路径
 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_chunks.json"

-# 读取并解析JSON文件
-with open(file_path, 'r', encoding='utf-8') as f:
-    data = json.load(f)
-
-# 提取所有块信息
-chunks = data.get('data', [])
+# 检查文件是否存在
+if not os.path.exists(file_path):
+    raise FileNotFoundError(f"文件不存在: {file_path}")

-# 打印块数量和详细信息
-print(f"共找到 {len(chunks)} 个块：")
-for i, chunk in enumerate(chunks, 1):
-    print(f"块 {i}:")
-    print(f"ID: {chunk.get('__id__')}")
-    print(f"创建时间: {chunk.get('__created_at__')}")
-    print(f"文档ID: {chunk.get('full_doc_id')}")
-    print(f"文件路径: {chunk.get('file_path')}")
-    print(f"内容预览: {chunk.get('content', '')}")  # 显示前100字符
-    print("---")
+# 使用ijson流式读取JSON文件
+with open(file_path, 'r', encoding='utf-8') as f:
+    # 流式迭代所有块
+    chunks = ijson.items(f, 'data.item')
+    chunk_list = list(chunks)
+    print(f"共找到 {len(chunk_list)} 个块：")
+    for i, chunk in enumerate(chunk_list, 1):
+        print(f"块 {i}:")
+        print(f"ID: {chunk.get('__id__')}")
+        print(f"创建时间: {chunk.get('__created_at__')}")
+        print(f"文档ID: {chunk.get('full_doc_id')}")
+        print(f"文件路径: {chunk.get('file_path')}")
+        print(f"内容预览: {chunk.get('content', '')}")  # 显示前100字符
+        print("---")
--- a/dsLightRag/Tools/T6_Doc_Chunks.py
+++ b/dsLightRag/Tools/T6_Doc_Chunks.py
@@ -1,20 +1,24 @@
-import json
+import ijson
+import os

 from Tools.KG_Config import TOPIC

 # 文件路径
 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_text_chunks.json"

-# 读取并解析JSON文件
-with open(file_path, 'r', encoding='utf-8') as f:
-    text_chunks = json.load(f)
+# 检查文件是否存在
+if not os.path.exists(file_path):
+    raise FileNotFoundError(f"文件不存在: {file_path}")

-# 遍历所有文本块
-for chunk_id, chunk_info in text_chunks.items():
-    print(f"块ID: {chunk_id}")
-    print(f"所属文档ID: {chunk_info['full_doc_id']}")
-    print(f"块序号: {chunk_info['chunk_order_index']}")
-    print(f"Token数量: {chunk_info['tokens']}")
-    #print(f"内容预览: {chunk_info['content'][:100]}...")  # 打印前100字符
-    print(f"来源文件: {chunk_info['file_path']}")
-    print("---")
+# 使用ijson流式读取JSON文件
+with open(file_path, 'r', encoding='utf-8') as f:
+    # 流式迭代所有文本块
+    print("正在读取文本块...")
+    for chunk_id, chunk_info in ijson.kvitems(f, ''):
+        print(f"块ID: {chunk_id}")
+        print(f"所属文档ID: {chunk_info['full_doc_id']}")
+        print(f"块序号: {chunk_info['chunk_order_index']}")
+        print(f"Token数量: {chunk_info['tokens']}")
+        #print(f"内容预览: {chunk_info['content'][:100]}...")  # 打印前100字符
+        print(f"来源文件: {chunk_info['file_path']}")
+        print("---")
--- a/dsLightRag/Tools/T7_Entity.py
+++ b/dsLightRag/Tools/T7_Entity.py
@@ -1,18 +1,22 @@
-import json
+import ijson
+import os

 from Tools.KG_Config import TOPIC

 # 文件路径
 file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_entities.json"

-# 读取并解析JSON文件
-with open(file_path, 'r', encoding='utf-8') as f:
-    data = json.load(f)
-
-# 提取所有实体名称
-entities = [item['entity_name'] for item in data['data']]
+# 检查文件是否存在
+if not os.path.exists(file_path):
+    raise FileNotFoundError(f"文件不存在: {file_path}")

-# 打印实体列表
-print("文件中的实体列表：")
-for entity in entities:
-    print(f"- {entity}")
+# 使用ijson流式读取JSON文件
+with open(file_path, 'r', encoding='utf-8') as f:
+    # 流式迭代所有实体名称
+    print("正在读取实体...")
+    entity_names = ijson.items(f, 'data.item.entity_name')
+    entity_list = list(entity_names)
+    print(f"共找到 {len(entity_list)} 个实体：")
+    print("文件中的实体列表：")
+    for entity in entity_list:
+        print(f"- {entity}")
--- a/dsLightRag/Tools/T8_Relationships.py
+++ b/dsLightRag/Tools/T8_Relationships.py
@@ -1,4 +1,4 @@
-import json
+import ijson
 import os

 from Tools.KG_Config import TOPIC
@@ -9,15 +9,21 @@ def parse_relationships(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

-    # 读取并解析JSON文件
+    # 读取embedding_dim
+    embedding_dim = "未知维度"
    with open(file_path, 'r', encoding='utf-8') as f:
-        data = json.load(f)
+        embedding_dim = next(ijson.items(f, 'embedding_dim'), "未知维度")
+
+    # 读取关系数据
+    relationships = []
+    with open(file_path, 'r', encoding='utf-8') as f:
+        relationships = list(ijson.items(f, 'data.item'))

    # 提取关键信息
    result = {
-        "embedding_dim": data.get("embedding_dim", "未知维度"),
-        "relationship_count": len(data.get("data", [])),
-        "sample_relationships": data.get("data", [])[:3]  # 显示前3条示例
+        "embedding_dim": embedding_dim,
+        "relationship_count": len(relationships),
+        "sample_relationships": relationships[:3]  # 显示前3条示例
    }
    return result