parent
8878361126
commit
153b2dadc1
@ -1,23 +1,19 @@
|
|||||||
import json
|
import ijson
|
||||||
import os
|
|
||||||
|
|
||||||
from Tools.KG_Config import TOPIC
|
from Tools.KG_Config import TOPIC
|
||||||
|
|
||||||
# 文件路径
|
# 文件路径
|
||||||
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_doc_status.json"
|
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_doc_status.json"
|
||||||
|
|
||||||
# 读取并解析JSON文件
|
# 使用ijson流式读取JSON文件
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
with open(file_path, 'r', encoding='utf-8') as f:
|
||||||
doc_status_data = json.load(f)
|
# 流式迭代文档ID和状态信息
|
||||||
|
for doc_id, status_info in ijson.kvitems(f, ''):
|
||||||
# 遍历文档状态信息
|
print(f"文档ID: {doc_id}")
|
||||||
for doc_id, status_info in doc_status_data.items():
|
print(f"状态: {status_info['status']}")
|
||||||
print(f"文档ID: {doc_id}")
|
print(f"分块数量: {status_info['chunks_count']}")
|
||||||
print(f"状态: {status_info['status']}")
|
print(f"内容摘要: {status_info['content_summary'][:100]}...") # 打印前100字符
|
||||||
print(f"分块数量: {status_info['chunks_count']}")
|
print(f"内容长度: {status_info['content_length']}字符")
|
||||||
print(f"内容摘要: {status_info['content_summary'][:100]}...") # 打印前100字符
|
#print(f"创建时间: {status_info['created_at']}")
|
||||||
print(f"内容长度: {status_info['content_length']}字符")
|
#print(f"更新时间: {status_info['updated_at']}")
|
||||||
#print(f"创建时间: {status_info['created_at']}")
|
#print(f"文件路径: {status_info['file_path']}")
|
||||||
#print(f"更新时间: {status_info['updated_at']}")
|
print("---")
|
||||||
#print(f"文件路径: {status_info['file_path']}")
|
|
||||||
print("---")
|
|
@ -1,24 +1,26 @@
|
|||||||
import json
|
import ijson
|
||||||
|
import os
|
||||||
|
|
||||||
from Tools.KG_Config import TOPIC
|
from Tools.KG_Config import TOPIC
|
||||||
|
|
||||||
# 文件路径
|
# 文件路径
|
||||||
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_chunks.json"
|
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_chunks.json"
|
||||||
|
|
||||||
# 读取并解析JSON文件
|
# 检查文件是否存在
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
if not os.path.exists(file_path):
|
||||||
data = json.load(f)
|
raise FileNotFoundError(f"文件不存在: {file_path}")
|
||||||
|
|
||||||
# 提取所有块信息
|
|
||||||
chunks = data.get('data', [])
|
|
||||||
|
|
||||||
# 打印块数量和详细信息
|
# 使用ijson流式读取JSON文件
|
||||||
print(f"共找到 {len(chunks)} 个块:")
|
with open(file_path, 'r', encoding='utf-8') as f:
|
||||||
for i, chunk in enumerate(chunks, 1):
|
# 流式迭代所有块
|
||||||
print(f"块 {i}:")
|
chunks = ijson.items(f, 'data.item')
|
||||||
print(f"ID: {chunk.get('__id__')}")
|
chunk_list = list(chunks)
|
||||||
print(f"创建时间: {chunk.get('__created_at__')}")
|
print(f"共找到 {len(chunk_list)} 个块:")
|
||||||
print(f"文档ID: {chunk.get('full_doc_id')}")
|
for i, chunk in enumerate(chunk_list, 1):
|
||||||
print(f"文件路径: {chunk.get('file_path')}")
|
print(f"块 {i}:")
|
||||||
print(f"内容预览: {chunk.get('content', '')}") # prints the full content (no 100-character truncation is applied)
|
print(f"ID: {chunk.get('__id__')}")
|
||||||
print("---")
|
print(f"创建时间: {chunk.get('__created_at__')}")
|
||||||
|
print(f"文档ID: {chunk.get('full_doc_id')}")
|
||||||
|
print(f"文件路径: {chunk.get('file_path')}")
|
||||||
|
print(f"内容预览: {chunk.get('content', '')}") # prints the full content (no 100-character truncation is applied)
|
||||||
|
print("---")
|
@ -1,20 +1,24 @@
|
|||||||
import json
|
import ijson
|
||||||
|
import os
|
||||||
|
|
||||||
from Tools.KG_Config import TOPIC
|
from Tools.KG_Config import TOPIC
|
||||||
|
|
||||||
# 文件路径
|
# 文件路径
|
||||||
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_text_chunks.json"
|
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\kv_store_text_chunks.json"
|
||||||
|
|
||||||
# 读取并解析JSON文件
|
# 检查文件是否存在
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
if not os.path.exists(file_path):
|
||||||
text_chunks = json.load(f)
|
raise FileNotFoundError(f"文件不存在: {file_path}")
|
||||||
|
|
||||||
# 遍历所有文本块
|
# 使用ijson流式读取JSON文件
|
||||||
for chunk_id, chunk_info in text_chunks.items():
|
with open(file_path, 'r', encoding='utf-8') as f:
|
||||||
print(f"块ID: {chunk_id}")
|
# 流式迭代所有文本块
|
||||||
print(f"所属文档ID: {chunk_info['full_doc_id']}")
|
print("正在读取文本块...")
|
||||||
print(f"块序号: {chunk_info['chunk_order_index']}")
|
for chunk_id, chunk_info in ijson.kvitems(f, ''):
|
||||||
print(f"Token数量: {chunk_info['tokens']}")
|
print(f"块ID: {chunk_id}")
|
||||||
#print(f"内容预览: {chunk_info['content'][:100]}...") # 打印前100字符
|
print(f"所属文档ID: {chunk_info['full_doc_id']}")
|
||||||
print(f"来源文件: {chunk_info['file_path']}")
|
print(f"块序号: {chunk_info['chunk_order_index']}")
|
||||||
print("---")
|
print(f"Token数量: {chunk_info['tokens']}")
|
||||||
|
#print(f"内容预览: {chunk_info['content'][:100]}...") # 打印前100字符
|
||||||
|
print(f"来源文件: {chunk_info['file_path']}")
|
||||||
|
print("---")
|
@ -1,18 +1,22 @@
|
|||||||
import json
|
import ijson
|
||||||
|
import os
|
||||||
|
|
||||||
from Tools.KG_Config import TOPIC
|
from Tools.KG_Config import TOPIC
|
||||||
|
|
||||||
# 文件路径
|
# 文件路径
|
||||||
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_entities.json"
|
file_path = rf"d:\dsWork\dsProject\dsLightRag\Topic\{TOPIC}\vdb_entities.json"
|
||||||
|
|
||||||
# 读取并解析JSON文件
|
# 检查文件是否存在
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
if not os.path.exists(file_path):
|
||||||
data = json.load(f)
|
raise FileNotFoundError(f"文件不存在: {file_path}")
|
||||||
|
|
||||||
# 提取所有实体名称
|
|
||||||
entities = [item['entity_name'] for item in data['data']]
|
|
||||||
|
|
||||||
# 打印实体列表
|
# 使用ijson流式读取JSON文件
|
||||||
print("文件中的实体列表:")
|
with open(file_path, 'r', encoding='utf-8') as f:
|
||||||
for entity in entities:
|
# 流式迭代所有实体名称
|
||||||
print(f"- {entity}")
|
print("正在读取实体...")
|
||||||
|
entity_names = ijson.items(f, 'data.item.entity_name')
|
||||||
|
entity_list = list(entity_names)
|
||||||
|
print(f"共找到 {len(entity_list)} 个实体:")
|
||||||
|
print("文件中的实体列表:")
|
||||||
|
for entity in entity_list:
|
||||||
|
print(f"- {entity}")
|
Loading…
Reference in new issue