You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
181 lines
6.9 KiB
181 lines
6.9 KiB
import asyncio
|
|
import json
|
|
import re
|
|
from CommonUtil import *
|
|
|
|
|
|
class MarkdownToJsonConverter:
    """Turn a Markdown outline into a stream of slide-style JSON records.

    Heading levels map to record types: the first ``#`` heading becomes the
    ``cover`` record, the ``##`` headings become the ``contents`` list plus
    one ``transition`` record each, and every ``###`` heading becomes a
    ``content`` record whose ``- `` bullets are its items.
    """

    # Placeholder title for bullets / "###" headings that have no parent.
    _UNNAMED_SECTION = "未命名章节"

    # "[ \t]+" (not "\s+") keeps a match on one line, and "\S" rejects
    # headings that consist of whitespace only.
    _LEVEL1_RE = re.compile(r'^#[ \t]+(\S.*)$', re.MULTILINE)
    _LEVEL2_RE = re.compile(r'^##[ \t]+(\S.*)$', re.MULTILINE)

    # Leading ordinals ("1.", "2)", "3、"), heading hashes and list bullets.
    # "(?!\d)" protects decimals such as "3.5"; "(?=\s)" protects "-5".
    _LEADING_MARKER_RE = re.compile(
        r'^\s*(?:(?:\d+[.、)）.](?!\d)|#+|[-*+](?=\s))\s*)+'
    )

    def __init__(self, client):
        """Store the chat client used by :meth:`generate_description`.

        Args:
            client: Object exposing an awaitable
                ``chat.completions.create(...)`` call.
        """
        self.client = client

    def extract_level1_title(self, markdown_content):
        """Return the text of the first ``# `` heading, or ``None`` if absent.

        The title is stripped of surrounding whitespace (including a
        trailing ``\\r`` from CRLF input).
        """
        match = self._LEVEL1_RE.search(markdown_content)
        if match is None:
            return None
        return match.group(1).strip()

    def extract_level2_titles(self, markdown_content):
        """Return all distinct ``## `` heading texts in document order.

        Titles are stripped so they agree with the titles produced by
        :meth:`extract_level2_and_level3`.
        """
        titles = (raw.strip() for raw in self._LEVEL2_RE.findall(markdown_content))
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(titles))

    def extract_level2_and_level3(self, markdown_content):
        """Parse ``##`` / ``###`` headings and ``- `` bullets into a tree.

        Returns:
            A list of ``{"title": str, "children": [...]}`` dicts, one per
            ``##`` heading, where each child is
            ``{"title": str, "items": [str, ...]}`` for a ``###`` heading.

        Bullets that appear before any ``###`` heading are grouped under a
        placeholder child, and children that appear before any ``##``
        heading are grouped under a placeholder section instead of raising
        ``IndexError``.
        """
        sections = []
        pending_title = None  # title of the "###" block being collected
        pending_items = []

        def flush_pending():
            """Attach the block being collected to the latest section."""
            nonlocal pending_title, pending_items
            if pending_title is None:
                return
            if not sections:
                # Content showed up before the first "##" heading.
                sections.append({"title": self._UNNAMED_SECTION, "children": []})
            sections[-1]["children"].append(
                {"title": pending_title, "items": pending_items}
            )
            pending_title = None
            pending_items = []

        for raw_line in markdown_content.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith("## "):
                flush_pending()
                sections.append({"title": line[3:].strip(), "children": []})
            elif line.startswith("### "):
                flush_pending()
                pending_title = line[4:].strip()
            elif line.startswith("- "):
                if pending_title is None:
                    # Bullet without a "###" heading above it.
                    pending_title = self._UNNAMED_SECTION
                pending_items.append(line[2:].strip())

        flush_pending()
        return sections

    def convert_structure_to_json(self, structure):
        """Flatten the tree from :meth:`extract_level2_and_level3`.

        Each section yields one ``transition`` record followed by one
        ``content`` record per child. ``text`` fields start out equal to
        the title.
        """
        records = []
        for section in structure:
            section_title = section["title"]
            records.append({
                "type": "transition",
                "data": {"title": section_title, "text": section_title},
            })
            for child in section["children"]:
                child_title = child["title"]
                records.append({
                    "type": "content",
                    "data": {
                        "title": child_title,
                        "text": child_title,
                        "items": [
                            {"title": entry, "text": entry}
                            for entry in child["items"]
                        ],
                    },
                })
        return records

    async def generate_descriptions_for_json_batch(self, json_data, *, delay=0.5):
        """Fill in ``text`` fields and yield each record as a JSON string.

        The records in ``json_data`` are updated in place. Records are
        yielded one at a time, in order, with a pause between them.

        Args:
            json_data: Iterable of record dicts (``cover``, ``contents``,
                ``transition``, ``content``).
            delay: Seconds to sleep after each yielded record to pace the
                streamed output.

        Yields:
            ``json.dumps(record, ensure_ascii=False)`` for every record.
        """
        for item in json_data:
            data = item["data"] if "data" in item else None
            if isinstance(data, dict):
                title = data.get("title")
                # Skip a missing title (e.g. no "#" heading) rather than
                # asking the model to describe "None".
                if title:
                    data["text"] = await self.generate_description(title)

                for sub_item in data.get("items", ()):
                    # The "contents" record stores plain strings here; on a
                    # str, `"title" in sub_item` would be a substring test.
                    if isinstance(sub_item, dict) and "title" in sub_item:
                        sub_item["text"] = await self.generate_description(
                            sub_item["title"]
                        )

            yield json.dumps(item, ensure_ascii=False)
            await asyncio.sleep(delay)  # pace the line-by-line output

    @staticmethod
    def _clean_description(text):
        """Strip leading ordinals / Markdown markers from a model reply.

        Only markers at the start are removed, so digits, periods and
        hyphens inside the sentence ("GPT-4", "3.5", "2024") survive.
        """
        return MarkdownToJsonConverter._LEADING_MARKER_RE.sub('', text).strip()

    async def generate_description(self, title):
        """Ask the chat client for a one-sentence description of ``title``.

        Returns:
            The cleaned description, or ``title`` itself when the call
            fails or yields no usable text (best-effort fallback).
        """
        # Best-effort boundary: any failure while calling the client or
        # reading its response falls back to the title.
        try:
            # NOTE(review): MODEL_NAME is presumably provided by
            # `from CommonUtil import *` — confirm.
            response = await self.client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "你是一个专业的助手,能够根据上下文生成简洁的描述信息。"},
                    {"role": "user", "content": f"请为以下标题生成一句话的描述信息,描述信息应简洁明了,且与标题内容相关,不要使用与标题内容相同的语句,不要包含任何序号(如 1.、2. 等)或 Markdown 语法(如 #、- 等),且描述长度不超过 20 个字:\n- {title}"}
                ],
                max_tokens=20
            )

            if response.choices and response.choices[0].message.content:
                description = self._clean_description(
                    response.choices[0].message.content
                )
                if description:
                    return description

            print(f"AI 未返回有效描述信息:{title}")
            return title
        except Exception as e:
            print(f"调用 AI 生成描述信息时出错:{e}")
            return title

    async def convert_markdown_to_json(self, markdown_content):
        """Convert Markdown text into a stream of JSON record strings.

        Builds the ``cover`` record, the ``contents`` record and the
        ``transition`` / ``content`` records, then streams them through
        :meth:`generate_descriptions_for_json_batch`.

        Yields:
            One JSON string per record, in document order.
        """
        records = [
            # Cover: first level-1 heading (title is None when absent).
            {
                "type": "cover",
                "data": {
                    "title": self.extract_level1_title(markdown_content),
                    "text": "",
                },
            },
            # Table of contents: distinct level-2 headings.
            {
                "type": "contents",
                "data": {"items": self.extract_level2_titles(markdown_content)},
            },
        ]

        # Section bodies: level-2 and level-3 headings with their bullets.
        structure = self.extract_level2_and_level3(markdown_content)
        records.extend(self.convert_structure_to_json(structure))

        async for item in self.generate_descriptions_for_json_batch(records):
            yield item
|