You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

157 lines
5.7 KiB

5 months ago
import json
from openai import OpenAI
5 months ago
from Text2Sql.Util.MarkdownToDocxUtil import markdown_to_docx
5 months ago
from Text2Sql.Util.PostgreSQLUtil import PostgreSQLUtil
from Text2Sql.Util.SaveToExcel import save_to_excel
5 months ago
from Text2Sql.Util.VannaUtil import *
5 months ago
5 months ago
from Util.EchartsUtil import *
5 months ago
'''
经验
1尽量使用宽表少用关联越少越好
2应该有一些固定的组合用法预置出来给出范例让用户可以简单修改后就能使用
3应该有类似于 保存为用例查询历史等功能让用户方便利旧
'''
4 months ago
5 months ago
if __name__ == "__main__":
    # End-to-end demo: train the text-to-SQL helper, turn a natural-language
    # question into SQL, run it against PostgreSQL, then export the result as
    # charts, an Excel sheet, and an LLM-written Markdown/Word report.

    # `re` previously reached this script only through a wildcard import;
    # import it explicitly so the regex step cannot break if that changes.
    import re

    vn = VannaUtil()

    # ---- Training -------------------------------------------------------
    print("开始训练...")

    # Train on the table definitions (DDL).
    with open("Sql/AreaSchoolLessonDDL.sql", "r", encoding="utf-8") as file:
        ddl = file.read()
    vn.train(ddl=ddl)

    # Optionally train on documentation of business terms/definitions.
    # vn.train(documentation="Sql/AreaSchoolLesson.md")

    # Train on example queries. The file is expected to hold entries of the
    # form `/* comment */ statement;`, which the pattern below splits into
    # (comment, statement) pairs.
    with open('Sql/AreaSchoolLessonGenerate.sql', 'r', encoding='utf-8') as file:
        sql_content = file.read()
    sql_pattern = r'/\*(.*?)\*/(.*?);'
    sql_snippets = re.findall(sql_pattern, sql_content, re.DOTALL)
    for comment, statement in sql_snippets:
        vn.train(sql=comment.strip() + '\n' + statement.strip() + '\n')

    # ---- Natural-language question --------------------------------------
    # Alternative sample questions (disabled). They are kept verbatim because
    # the text is what would be sent to the model if re-enabled.
    #
    # Overall view:
    # question = '''
    # 查询:
    # 1、发布时间是2024年度
    # 2、每个行政区每个学校都上传了多少课程数量
    # 3、格式: 行政区划名,学段,排名,学校名称,课程数量
    # '''
    #
    # Restricted to one administrative district:
    # question = '''
    # 查询:
    # 1、发布时间是2024年度
    # 2、二道区每个学校都上传了多少课程数量
    # 3、格式: 行政区划名,学段,排名,学校名称,发布年份,课程数量
    # '''

    # Active question: grouped by school stage and subject.
    # NOTE: prompt bodies are flush-left on purpose; their text is passed to
    # the model unchanged.
    question = '''
查询:
1发布时间是2024年度
2每个学段每个科目上传课程数量按由多到少排序
3字段名: 学段,科目,排名,课程数量
'''
    # Constraints appended to every question.
    common_prompt = '''
返回的信息要求
1行政区划为NULL 或者是空字符的不参加统计
2目标数据库是Postgresql 16
'''
    question = question + common_prompt

    # ---- SQL generation and execution -----------------------------------
    print("开始查询...")
    sql = vn.generate_sql(question)
    print("生成的查询 SQL:\n", sql)

    with PostgreSQLUtil() as db:
        _data = db.execute_query(sql)

    # ---- 1. Bar chart ---------------------------------------------------
    generate_bar_chart(
        _data=_data,
        title="学段+科目课程数量柱状图",
        x_columns=['学段', '科目'],  # columns combined on the X axis
        y_columns=['课程数量'],  # columns plotted on the Y axis
        output_file="d:/lesson_bar_chart.html"
    )

    # ---- 2. Pie chart ---------------------------------------------------
    generate_pie_chart(
        _data=_data,
        title="学段+科目分布",
        category_columns=['学段', '科目'],  # columns combined into the category
        value_column='课程数量',
        output_file="d:/lesson_pie_chart.html"
    )

    # ---- 3. Excel export ------------------------------------------------
    filename = "d:/导出信息.xlsx"
    save_to_excel(_data, filename)

    # ---- 4. Word report -------------------------------------------------
    # Instruction text followed by the query result serialized as JSON.
    prompt = '''
请根据以下 JSON 数据整理出2000字左右的话描述当前数据情况要求
1以Markdown格式返回我将直接通过markdown格式生成Word
2标题统一为长春云校数据分析报告
3内容中不要提到JSON数据统一称数据
4尽量以条目列出这样更清晰
5数据
'''
    prompt = prompt + json.dumps(_data, ensure_ascii=False)

    client = OpenAI(
        api_key=MODEL_API_KEY,
        base_url=MODEL_API_URL,
    )

    # Request the summary as a stream so progress is visible on the console.
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "你是一个数据分析助手,擅长从 JSON 数据中提取关键信息并生成详细的总结。"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=3000,  # upper bound on the generated length
        temperature=0.7,  # sampling temperature
        stream=True  # deliver the answer incrementally
    )

    # Collect the streamed fragments, echoing each one as it arrives.
    summary_parts = []
    for chunk in response:
        # Some chunks carry no choices at all (e.g. usage-only or filter
        # metadata chunks); indexing them would raise IndexError.
        if not chunk.choices:
            continue
        chunk_content = chunk.choices[0].delta.content
        if chunk_content:
            print(chunk_content, end="", flush=True)
            summary_parts.append(chunk_content)
    summary = "".join(summary_parts)

    # Persist the raw Markdown.
    with open("d:/report.md", "w", encoding="utf-8") as file:
        file.write(summary)

    print("\n\n流式输出完成summary 已拼接为完整字符串。")

    # Convert the Markdown summary into a Word document.
    markdown_to_docx(summary, output_file="d:/report.docx")