diff --git a/AI/Text2Sql/Sql/AreaSchoolLesson.md b/AI/Text2Sql/Sql/AreaSchoolLesson.md new file mode 100644 index 00000000..6b0b4666 --- /dev/null +++ b/AI/Text2Sql/Sql/AreaSchoolLesson.md @@ -0,0 +1 @@ +# 行政区划名,行政区,区域名称,区域,一般是同一个概念,查询表t_crawler_lesson中字段gather_regionc \ No newline at end of file diff --git a/AI/Text2Sql/Sql/AreaSchoolLesson.sql b/AI/Text2Sql/Sql/AreaSchoolLessonDDL.sql similarity index 100% rename from AI/Text2Sql/Sql/AreaSchoolLesson.sql rename to AI/Text2Sql/Sql/AreaSchoolLessonDDL.sql diff --git a/AI/Text2Sql/Sql/AreaSchoolLessonGenerate.sql b/AI/Text2Sql/Sql/AreaSchoolLessonGenerate.sql new file mode 100644 index 00000000..2de0f90f --- /dev/null +++ b/AI/Text2Sql/Sql/AreaSchoolLessonGenerate.sql @@ -0,0 +1,24 @@ +/* +查询: +1、发布时间是2024年度 +2、每个学段,每个科目,上传课程数量,按由多到少排序 +3、字段名: 学段,科目,排名,课程数量 +*/ +SELECT stage_name AS 学段, subject_name AS 科目, ROW_NUMBER() OVER (PARTITION BY stage_name ORDER BY COUNT(*) DESC) AS 排名, COUNT(*) AS 课程数量 FROM t_crawler_lesson WHERE publish_time >= '2024-01-01' AND publish_time < '2025-01-01' AND gather_regionc IS NOT NULL AND gather_regionc <> '' GROUP BY stage_name, subject_name ORDER BY stage_name, 课程数量 DESC; + + +/* +查询: +1、发布时间是2024年度 +2、每个行政区每个学校都上传了多少课程数量 +3、格式: 行政区划名,学段,排名,学校名称,课程数量 +*/ +SELECT gather_regionc AS 行政区划名, stage_name AS 学段, ROW_NUMBER() OVER (PARTITION BY gather_regionc ORDER BY COUNT(*) DESC) AS 排名, teacher_school_name AS 学校名称, COUNT(*) AS 课程数量 FROM t_crawler_lesson WHERE publish_time >= '2024-01-01' AND publish_time < '2025-01-01' AND gather_regionc IS NOT NULL AND gather_regionc != '' GROUP BY gather_regionc, stage_name, teacher_school_name ORDER BY gather_regionc, COUNT(*) DESC; + +/* +查询: +1、发布时间是2024年度 +2、二道区每个学校都上传了多少课程数量 +3、格式: 行政区划名,学段,排名,学校名称,发布年份,课程数量 +*/ +SELECT gather_regionc AS 行政区划名, stage_name AS 学段, ROW_NUMBER() OVER (PARTITION BY gather_regionc ORDER BY COUNT(*) DESC) AS 排名, teacher_school_name AS 学校名称, EXTRACT(YEAR FROM publish_time) AS 发布年份, COUNT(*) AS 课程数量 FROM t_crawler_lesson WHERE 
publish_time >= '2024-01-01' AND publish_time < '2025-01-01' AND gather_regionc = '二道区' AND gather_regionc IS NOT NULL AND gather_regionc != '' GROUP BY gather_regionc, stage_name, teacher_school_name, EXTRACT(YEAR FROM publish_time) ORDER BY gather_regionc, 课程数量 DESC; \ No newline at end of file diff --git a/AI/Text2Sql/Test/YunXiao_Deepseek.py b/AI/Text2Sql/Test/YunXiao_Deepseek.py index 4f5620e5..18bdff7f 100644 --- a/AI/Text2Sql/Test/YunXiao_Deepseek.py +++ b/AI/Text2Sql/Test/YunXiao_Deepseek.py @@ -42,8 +42,8 @@ def generate_sql_from_prompt(ddl: str, prompt: str) -> str: raise ValueError("未能生成 SQL 查询") if __name__ == '__main__': - # 读取 Sql/AreaSchoolLesson.sql 文件 - with open("../Sql/AreaSchoolLesson.sql", "r", encoding="utf-8") as file: + # Read the table DDL from Sql/AreaSchoolLessonDDL.sql + with open("../Sql/AreaSchoolLessonDDL.sql", "r", encoding="utf-8") as file: ddl = file.read() # 自然语言描述 diff --git a/AI/Text2Sql/YunXiao.py b/AI/Text2Sql/YunXiao.py index 575c2236..e239722a 100644 --- a/AI/Text2Sql/YunXiao.py +++ b/AI/Text2Sql/YunXiao.py @@ -21,14 +21,28 @@ if __name__ == "__main__": # 开始训练 print("开始训练...") # 打开AreaSchoolLesson.sql文件内容 - with open("Sql/AreaSchoolLesson.sql", "r", encoding="utf-8") as file: + with open("Sql/AreaSchoolLessonDDL.sql", "r", encoding="utf-8") as file: ddl = file.read() # 训练数据 vn.train( ddl=ddl ) - - + # Train on the business-terminology notes: like the DDL above, pass the file's text, not its path + with open("Sql/AreaSchoolLesson.md", "r", encoding="utf-8") as file: + documentation = file.read() + vn.train(documentation=documentation) + + # Train on the hand-written example queries + import re  # local import: this patch does not touch the file's import block + with open('Sql/AreaSchoolLessonGenerate.sql', 'r', encoding='utf-8') as file: + sql_content = file.read() + # Each example is a /* ... */ comment holding the request, followed by the SQL up to its first ';' + sql_pattern = r'/\*(.*?)\*/(.*?);' + sql_snippets = re.findall(sql_pattern, sql_content, re.DOTALL) + + # Store each example as a question/SQL pair so the saved SQL stays valid (assumes vn is a Vanna instance - confirm) + for comment, sql in sql_snippets: + vn.train(question=comment.strip(), sql=sql.strip()) # 自然语言提问 # 整体情况