You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
QingLong/AI/AiService/MarkdownToJsonConverter.py

181 lines
6.9 KiB

import asyncio
import json
import re
from CommonUtil import *
class MarkdownToJsonConverter:
    """Convert a Markdown outline into a stream of typed JSON records.

    The outline is read as: one level-1 heading (``# ``) for the cover,
    level-2 headings (``## ``) for sections, level-3 headings (``### ``) for
    sub-sections and ``- `` bullets for the entries of a sub-section.  The
    records produced have the types ``cover``, ``contents``, ``transition``
    and ``content``; their ``text`` fields are filled by asking the chat
    client for a short description of each title.

    ``client`` only needs to expose an awaitable
    ``client.chat.completions.create(...)`` -- presumably an OpenAI-compatible
    async client (confirm against the caller).
    """

    # Title used whenever a bullet or sub-section has no heading to attach to.
    _UNTITLED = "未命名章节"

    # ``[^\S\n]+`` is "whitespace, but never a newline": unlike ``\s+`` it
    # cannot run past the end of the heading line and capture the next line.
    _H1_RE = re.compile(r'^#[^\S\n]+(\S.*)$', re.MULTILINE)
    _H2_RE = re.compile(r'^##[^\S\n]+(\S.*)$', re.MULTILINE)

    # Leading ordinals ("1.", "2、", "3)") and leading "#"/"-" markers.  The
    # look-ahead keeps decimals such as "3.5" from being read as an ordinal.
    _LEADING_MARKERS_RE = re.compile(r'^(?:\s*(?:\d+[.、)](?!\d)|[#-]+))+\s*')

    def __init__(self, client):
        """Store the chat client used by :meth:`generate_description`."""
        self.client = client

    def extract_level1_title(self, markdown_content):
        """Return the text of the first level-1 heading, or ``None``.

        The title is stripped, so a trailing ``\\r`` from CRLF input is not
        part of the result.  Empty or ``None`` input yields ``None``.
        """
        match = self._H1_RE.search(markdown_content or "")
        return match.group(1).strip() if match else None

    def extract_level2_titles(self, markdown_content):
        """Return all level-2 heading texts, de-duplicated in document order.

        Titles are stripped before de-duplication so they agree with the
        titles produced by :meth:`extract_level2_and_level3`.
        """
        titles = (
            found.strip()
            for found in self._H2_RE.findall(markdown_content or "")
        )
        # dict.fromkeys keeps the first occurrence and preserves order.
        return list(dict.fromkeys(titles))

    @classmethod
    def _open_level3(cls, sections, title):
        """Append a new sub-section to the last section and return it.

        If no level-2 section exists yet, a placeholder section is created
        first so that content appearing before any ``## `` heading is kept
        instead of raising ``IndexError``.
        """
        if not sections:
            sections.append({"title": cls._UNTITLED, "children": []})
        node = {"title": title, "items": []}
        sections[-1]["children"].append(node)
        return node

    def extract_level2_and_level3(self, markdown_content):
        """Parse sections, sub-sections and their bullet entries.

        Returns a list of ``{"title": str, "children": [...]}`` dicts, one per
        ``## `` heading, where each child is ``{"title": str, "items": [str]}``
        for a ``### `` heading and the ``- `` bullets that follow it.

        Bullets that are not preceded by a ``### `` heading are collected
        under a sub-section titled ``未命名章节``; content that appears before
        the first ``## `` heading is placed in a section with that same
        placeholder title.  All other lines are ignored.
        """
        sections = []
        open_node = None  # sub-section currently receiving bullets

        for raw_line in (markdown_content or "").splitlines():
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith("## "):
                sections.append({"title": line[3:].strip(), "children": []})
                # Bullets after a new section must not leak into the
                # previous section's last sub-section.
                open_node = None
            elif line.startswith("### "):
                open_node = self._open_level3(sections, line[4:].strip())
            elif line.startswith("- "):
                if open_node is None:
                    open_node = self._open_level3(sections, self._UNTITLED)
                open_node["items"].append(line[2:].strip())

        return sections

    def convert_structure_to_json(self, structure):
        """Flatten the parsed structure into ``transition``/``content`` records.

        Each section becomes one ``transition`` record, immediately followed
        by one ``content`` record per sub-section.  Every ``text`` field is
        initialised to the corresponding title.
        """
        records = []
        for section in structure:
            heading = section["title"]
            records.append({
                "type": "transition",
                "data": {"title": heading, "text": heading},
            })
            for sub_section in section["children"]:
                sub_heading = sub_section["title"]
                records.append({
                    "type": "content",
                    "data": {
                        "title": sub_heading,
                        "text": sub_heading,
                        "items": [
                            {"title": entry, "text": entry}
                            for entry in sub_section["items"]
                        ],
                    },
                })
        return records

    async def _fill_text(self, node):
        """Replace ``node["text"]`` with a generated description of its title.

        Nodes without a non-blank string title are left untouched, so no
        request is made for a missing (``None``) cover title.
        """
        title = node.get("title")
        if isinstance(title, str) and title.strip():
            node["text"] = await self.generate_description(title)

    async def generate_descriptions_for_json_batch(self, json_data, *, delay=0.5):
        """Fill in descriptions record by record and yield each as JSON text.

        For every record, the ``text`` of ``record["data"]`` and of each dict
        in ``record["data"]["items"]`` is replaced by a generated description
        (the records are modified in place).  Items that are not dicts -- the
        ``contents`` record holds plain title strings -- are passed through
        unchanged.

        Args:
            json_data: iterable of record dicts.
            delay: seconds to pause after each yielded record, to pace the
                streamed output.

        Yields:
            One ``json.dumps(record, ensure_ascii=False)`` string per record.
        """
        for record in json_data:
            data = record.get("data") if isinstance(record, dict) else None
            if isinstance(data, dict):
                await self._fill_text(data)
                for entry in data.get("items") or []:
                    # A bare string here would turn `"title" in entry` into a
                    # substring test, so only dict entries are described.
                    if isinstance(entry, dict):
                        await self._fill_text(entry)
            yield json.dumps(record, ensure_ascii=False)
            await asyncio.sleep(delay)

    @classmethod
    def _clean_description(cls, text):
        """Strip leading ordinals and Markdown markers from a description.

        Only markers at the start are removed; digits, dots and hyphens
        inside the sentence (years, version numbers, compound words) are kept.
        """
        return cls._LEADING_MARKERS_RE.sub('', text.strip()).strip()

    async def generate_description(self, title):
        """Ask the chat client for a one-sentence description of ``title``.

        Returns the cleaned description, or ``title`` itself when the client
        returns nothing usable or the call fails (best-effort: errors are
        printed, not raised).
        """
        try:
            # MODEL_NAME is not defined in this class; presumably it comes
            # from the file's `from CommonUtil import *` -- confirm.
            response = await self.client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "你是一个专业的助手,能够根据上下文生成简洁的描述信息。"},
                    {"role": "user", "content": f"请为以下标题生成一句话的描述信息,描述信息应简洁明了,且与标题内容相关,不要使用与标题内容相同的语句,不要包含任何序号(如 1.、2. 等)或 Markdown 语法(如 #、- 等),且描述长度不超过 20 个字:\n- {title}"}
                ],
                max_tokens=20
            )
            if response.choices and response.choices[0].message.content:
                description = self._clean_description(
                    response.choices[0].message.content
                )
                # Cleaning can leave nothing (e.g. the reply was only "1.");
                # fall through to the title in that case.
                if description:
                    return description
            print(f"AI 未返回有效描述信息:{title}")
            return title
        except Exception as e:
            print(f"调用 AI 生成描述信息时出错:{e}")
            return title

    async def convert_markdown_to_json(self, markdown_content):
        """Convert Markdown to JSON records, yielding one JSON string each.

        The stream starts with a ``cover`` record (level-1 title, ``None`` if
        the document has none) and a ``contents`` record (list of level-2
        titles), followed by the ``transition``/``content`` records of the
        body.  Descriptions are generated as the records are streamed.
        """
        records = [
            {
                "type": "cover",
                "data": {
                    "title": self.extract_level1_title(markdown_content),
                    "text": "",
                },
            },
            {
                "type": "contents",
                "data": {"items": self.extract_level2_titles(markdown_content)},
            },
        ]
        outline = self.extract_level2_and_level3(markdown_content)
        records.extend(self.convert_structure_to_json(outline))

        async for line in self.generate_descriptions_for_json_batch(records):
            yield line