main
黄海 6 months ago
parent b85af57b4e
commit 12e795b7e5

@ -1,243 +0,0 @@
package Tools.Crawler.Backup;
import Tools.Crawler.Util;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jfinal.kit.PropKit;
import com.jfinal.plugin.activerecord.ActiveRecordPlugin;
import com.jfinal.plugin.activerecord.CaseInsensitiveContainerFactory;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.dialect.PostgreSqlDialect;
import com.jfinal.plugin.hikaricp.HikariCpPlugin;
import com.jfinal.plugin.activerecord.Record;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.List;
// 爬取一次大约需要20分钟
/**
 * One-shot crawler that walks the remote catalogue stage by stage
 * (stage -> subject -> edition/scheme -> book -> chapter tree -> lessons),
 * collects everything in memory and then replaces the contents of the five
 * {@code t_crawler_*} tables with the freshly crawled rows.
 *
 * <p>The old table contents are only discarded once the crawl has finished
 * successfully, and the truncate + insert step runs in a single transaction,
 * so a failure halfway through the crawl leaves the previous data intact.
 */
public class YunXiaoOneByOne {
    /**
     * Scratch buffer that {@link #traverseTree(JSONArray)} appends flattened
     * nodes to. Callers reset it (assign a new array) before each traversal.
     */
    public static JSONArray tempTree = new JSONArray();

    /**
     * Flattens a chapter tree depth-first (parent before children) into
     * {@link #tempTree}. Each emitted object carries {@code nodeId} (source key
     * {@code key}), {@code nodeName} (source key {@code title}), {@code isLeaf}
     * and {@code parentValue}.
     *
     * @param treeArray nodes of one tree level; {@code null} is treated as empty
     */
    public static void traverseTree(JSONArray treeArray) {
        if (treeArray == null) {
            return;
        }
        for (Object item : treeArray) {
            JSONObject node = (JSONObject) item;
            JSONObject flat = new JSONObject();
            flat.put("nodeId", node.getString("key"));
            flat.put("nodeName", node.getString("title"));
            flat.put("isLeaf", node.getBoolean("isLeaf"));
            flat.put("parentValue", node.getString("parentValue"));
            tempTree.add(flat);
            // "children" may be absent, JSON null, or empty; recurse only when it has entries.
            JSONArray children = node.getJSONArray("children");
            if (children != null && !children.isEmpty()) {
                traverseTree(children);
            }
        }
    }

    /**
     * Writes {@code msg} to standard output, prefixed with the current time.
     *
     * @param msg text to print
     */
    public static void print(String msg) {
        System.out.println(DateTime.now() + " " + msg);
    }

    /**
     * Entry point: starts the connection pool and ActiveRecord using the
     * settings in {@code application.properties}, runs the crawl, and stops
     * both plugins again even when the crawl fails.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PropKit.use("application.properties");
        HikariCpPlugin hp = new HikariCpPlugin(PropKit.get("jdbcUrl"), PropKit.get("user"),
                PropKit.get("password").trim(), PropKit.get("driverClassName"));
        hp.start();
        try {
            // Configure the ActiveRecord plugin (lower-case column names, PostgreSQL dialect).
            ActiveRecordPlugin arp = new ActiveRecordPlugin(hp);
            arp.setContainerFactory(new CaseInsensitiveContainerFactory(true));
            arp.setDialect(new PostgreSqlDialect());
            arp.start();
            try {
                crawlAndSave();
            } finally {
                arp.stop();
            }
        } finally {
            hp.stop();
        }
    }

    /** Crawls every stage into in-memory lists, then swaps them into the crawler tables. */
    private static void crawlAndSave() {
        Map<String, String> stages = new HashMap<>();
        stages.put("1", "学前");
        stages.put("2", "小学");
        stages.put("3", "初中");
        stages.put("4", "高中");
        print("开始爬取数据!");

        // Rows collected for each target table.
        List<Record> subjectList = new ArrayList<>();
        List<Record> lessonList = new ArrayList<>();
        List<Record> schemeList = new ArrayList<>();
        List<Record> bookList = new ArrayList<>();
        List<Record> structureList = new ArrayList<>();

        stages.forEach((key, value) -> {
            String respSubject;
            try {
                respSubject = Util.doGet("https://yx.ccsjy.cn/api/business/v1/subject/list/" + key);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so code further up the stack can still observe it.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
            JSONArray jsonArr = JSONObject.parseObject(respSubject).getJSONObject("data").getJSONArray("rows");
            for (int i = 0; i < jsonArr.size(); i++) {
                JSONObject jsonSubject = jsonArr.getJSONObject(i);
                String subjectCode = jsonSubject.getString("subjectCode");
                String subjectName = jsonSubject.getString("subjectName");
                Record rSubject = new Record();
                rSubject.set("subject_id", subjectCode);
                rSubject.set("subject_name", subjectName);
                rSubject.set("stage_id", key);
                subjectList.add(rSubject);

                // Editions (schemes) available for this subject.
                JSONObject argScheme = new JSONObject();
                argScheme.put("subjectCode", subjectCode);
                argScheme.put("systemId", 1);
                String respScheme = Util.doPost("https://yx.ccsjy.cn/api/business/v1/edition/list", argScheme.toString());
                JSONArray jsonArrScheme = JSONObject.parseObject(respScheme).getJSONArray("data");
                for (int j = 0; j < jsonArrScheme.size(); j++) {
                    JSONObject jsonScheme = jsonArrScheme.getJSONObject(j);
                    String businessEditionId = jsonScheme.getString("businessEditionId");
                    String editionName = jsonScheme.getString("editionName");
                    Record rScheme = new Record();
                    rScheme.set("scheme_id", businessEditionId);
                    rScheme.set("scheme_name", editionName);
                    rScheme.set("subject_id", subjectCode);
                    rScheme.set("stage_id", key);
                    rScheme.set("id", UUID.randomUUID().toString());
                    schemeList.add(rScheme);

                    // Books published under this edition.
                    JSONObject argBook = new JSONObject();
                    argBook.put("stageCode", key);
                    argBook.put("subjectCode", subjectCode);
                    argBook.put("businessEditionId", businessEditionId);
                    String respBook = Util.doPost("https://yx.ccsjy.cn/api/business/v1/book/list", argBook.toString());
                    JSONArray jsonArrBook = JSONObject.parseObject(respBook).getJSONArray("data");
                    for (int k = 0; k < jsonArrBook.size(); k++) {
                        JSONObject jsonBook = jsonArrBook.getJSONObject(k);
                        String businessBookId = jsonBook.getString("businessBookId");
                        String bookName = jsonBook.getString("bookName");
                        Record rBook = new Record();
                        rBook.set("book_id", businessBookId);
                        rBook.set("book_name", bookName);
                        rBook.set("scheme_id", businessEditionId);
                        rBook.set("subject_id", subjectCode);
                        rBook.set("stage_id", key);
                        rBook.set("id", UUID.randomUUID().toString());
                        bookList.add(rBook);

                        // Chapter tree of the book, flattened into tempTree.
                        JSONObject argTree = new JSONObject();
                        argTree.put("businessBookId", businessBookId);
                        argTree.put("childrensFlag", 1);
                        argTree.put("parentId", -1);
                        argTree.put("searchKeyword", "");
                        String respTree = Util.doPost("https://yx.ccsjy.cn/api/business/v1/chapter/tree", argTree.toString());
                        JSONArray jsonArrTree = JSONObject.parseObject(respTree).getJSONObject("data").getJSONArray("tree");
                        tempTree = new JSONArray();
                        traverseTree(jsonArrTree);
                        for (int n = 0; n < tempTree.size(); n++) {
                            JSONObject jsonTree = tempTree.getJSONObject(n);
                            String nodeId = jsonTree.getString("nodeId");
                            String nodeName = jsonTree.getString("nodeName");
                            String parentValue = jsonTree.getString("parentValue");
                            // getBoolean yields null when the key is missing; treat that as "not a leaf"
                            // instead of failing on auto-unboxing.
                            int leafFlag = Boolean.TRUE.equals(jsonTree.getBoolean("isLeaf")) ? 1 : 0;
                            Record rStructure = new Record();
                            rStructure.set("node_id", nodeId);
                            rStructure.set("node_name", nodeName);
                            rStructure.set("parent_id", parentValue);
                            rStructure.set("is_leaf", leafFlag);
                            rStructure.set("book_id", businessBookId);
                            rStructure.set("scheme_id", businessEditionId);
                            rStructure.set("subject_id", subjectCode);
                            rStructure.set("stage_id", key);
                            rStructure.set("id", UUID.randomUUID().toString());
                            structureList.add(rStructure);

                            // Lessons attached to this node. NOTE(review): only page 1 with
                            // pageSize 100 is requested, so nodes with more lessons are truncated.
                            JSONObject argSource = new JSONObject();
                            argSource.put("pageNum", 1);
                            argSource.put("pageSize", 100);
                            argSource.put("businessBookId", businessBookId);
                            argSource.put("nodeId", nodeId);
                            argSource.put("stageCode", key);
                            argSource.put("subjectCode", subjectCode);
                            argSource.put("excellentFlag", "");
                            argSource.put("nodeType", 1);
                            argSource.put("sortType", 2);
                            argSource.put("source", "");
                            argSource.put("searchKeyword", "");
                            String respSource = Util.doPost("https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage", argSource.toString());
                            JSONArray jsonArrSource = JSONObject.parseObject(respSource).getJSONObject("data").getJSONArray("rows");
                            print("正在爬取:【" + value + subjectName + editionName + bookName + nodeName + "】下的资源!");
                            for (int m = 0; m < jsonArrSource.size(); m++) {
                                JSONObject jsonSource = jsonArrSource.getJSONObject(m);
                                Record record = new Record();
                                record.set("lesson_id", jsonSource.getString("lessonId"));
                                record.set("lesson_name", jsonSource.getString("lessonName"));
                                record.set("node_id", nodeId);
                                record.set("teacher_id", jsonSource.getString("teacherId"));
                                record.set("teacher_school_id", jsonSource.getString("teacherSchoolId"));
                                record.set("teacher_school_name", jsonSource.getString("teacherSchoolName"));
                                record.set("teacher_name", jsonSource.getString("teacherName"));
                                record.set("book_id", businessBookId);
                                record.set("scheme_id", businessEditionId);
                                record.set("subject_id", subjectCode);
                                record.set("stage_id", key);
                                record.set("id", UUID.randomUUID().toString());
                                lessonList.add(record);
                            }
                            print("已收集资源数量:" + lessonList.size() + "个。");
                        }
                    }
                }
            }
        });

        print("开始保存数据...");
        // Empty and refill the tables atomically, and only now that the crawl has succeeded:
        // truncating up front would leave the tables empty for the whole crawl and
        // permanently empty if the crawl failed.
        Db.tx(() -> {
            Db.update("truncate table t_crawler_subject");
            Db.update("truncate table t_crawler_scheme");
            Db.update("truncate table t_crawler_book");
            Db.update("truncate table t_crawler_structure");
            Db.update("truncate table t_crawler_lesson");
            Db.batchSave("t_crawler_subject", subjectList, 300);
            Db.batchSave("t_crawler_scheme", schemeList, 300);
            Db.batchSave("t_crawler_book", bookList, 300);
            Db.batchSave("t_crawler_structure", structureList, 300);
            Db.batchSave("t_crawler_lesson", lessonList, 300);
            return true;
        });
        print("爬取数据完成!");
    }
}

@ -0,0 +1,13 @@
package Tools.Crawler;
import Tools.Crawler.Util.BookLesson;
import Tools.Crawler.Util.KnowledgeLesson;
import Tools.Crawler.Util.PgInit;
/**
 * Single launcher for the crawler: runs the book-lesson crawl and then the
 * knowledge-lesson crawl in one process.
 */
public class Start {
    /**
     * Runs the three steps strictly in this order: {@code PgInit.Init()},
     * {@code BookLesson.BookLessonStart()}, {@code KnowledgeLesson.KnowledgeLessonStart()}.
     *
     * @param args unused
     */
    public static void main(final String[] args) {
        // Presumably sets up the PostgreSQL connection the crawlers use - confirm in PgInit.
        PgInit.Init();

        // Book-based crawl first ...
        BookLesson.BookLessonStart();

        // ... then the knowledge-point crawl.
        KnowledgeLesson.KnowledgeLessonStart();
    }
}

@ -1,4 +1,4 @@
package Tools.Crawler;
package Tools.Crawler.Util;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
@ -421,7 +421,7 @@ public class BookLesson {
return set;
}
public static void Start() {
public static void BookLessonStart() {
Map<String, String> map = new HashMap<>();
map.put("1", "学前");
map.put("2", "小学");
@ -561,9 +561,4 @@ public class BookLesson {
print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
}
public static void main(String[] args) {
PgInit.Init();
//爬虫开始
Start();
}
}

@ -1,4 +1,4 @@
package Tools.Crawler;
package Tools.Crawler.Util;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
@ -118,7 +118,7 @@ public class KnowledgeLesson {
// 添加进度输出
int current = processedCount.incrementAndGet();
if (current % 10 == 0 || current == totalCount) { // 每处理10个节点输出一次进度
System.out.println(DateTime.now()+" "+String.format("进度: %d/%d (%.2f%%), 当前节点课程数: %d",
System.out.println(DateTime.now() + " " + String.format("进度: %d/%d (%.2f%%), 当前节点课程数: %d",
current, totalCount,
(current * 100.0 / totalCount),
jsonArrSource == null ? 0 : jsonArrSource.size()));
@ -157,7 +157,7 @@ public class KnowledgeLesson {
}
}
} catch (Exception e) {
System.out.println(DateTime.now()+" 处理课程数据出错 [nodeId=" + nodeId + "]: " + e.getMessage());
System.out.println(DateTime.now() + " 处理课程数据出错 [nodeId=" + nodeId + "]: " + e.getMessage());
}
}
@ -176,8 +176,7 @@ public class KnowledgeLesson {
}
}
public static void main(String[] args) {
PgInit.Init();
public static void KnowledgeLessonStart() {
// 清空知识点表
String sql = "truncate table t_crawler_structure_knowledge";
Db.update(sql);
@ -187,7 +186,7 @@ public class KnowledgeLesson {
Map<String, String> subjectMap = loadSubjectMap();
List<Map<String, String>> tasks = loadTasks();
System.out.println(DateTime.now()+" 加载到 " + stageMap.size() + " 个学段, " + subjectMap.size() + " 个学科");
System.out.println(DateTime.now() + " 加载到 " + stageMap.size() + " 个学段, " + subjectMap.size() + " 个学科");
for (Map<String, String> task : tasks) {
String stageCode = task.get("stageCode");
@ -232,10 +231,10 @@ public class KnowledgeLesson {
}
//遍历所有的知识点
// 第二阶段:多线程收集课程信息
System.out.println(DateTime.now()+" 开始收集课程信息...");
System.out.println(DateTime.now() + " 开始收集课程信息...");
List<Record> listKnowledge = Db.find("select * from t_crawler_structure_knowledge");
totalCount = listKnowledge.size(); // 设置总数
System.out.println(DateTime.now()+" 共有 " + totalCount + " 个知识点需要处理");
System.out.println(DateTime.now() + " 共有 " + totalCount + " 个知识点需要处理");
// 重置计数器
processedCount.set(0);

@ -1,4 +1,4 @@
package Tools.Crawler;
package Tools.Crawler.Util;
import com.dsideal.QingLong.Start;
import com.jfinal.kit.PropKit;

@ -1,4 +1,4 @@
package Tools.Crawler;
package Tools.Crawler.Util;
import cn.hutool.http.HttpRequest;

@ -1,6 +1,6 @@
package com.dsideal.QingLong.Cron4j;
import Tools.Crawler.BookLesson;
import Tools.Crawler.Util.BookLesson;
import com.jfinal.plugin.cron4j.ITask;
import lombok.SneakyThrows;
@ -11,7 +11,7 @@ public class LingDianTask implements ITask {
// 这里放被执行的调试任务代码
Tools.FillSubject.syncSubject();
//爬去云校数据
BookLesson.Start();
BookLesson.BookLessonStart();
}
public void stop() {

Loading…
Cancel
Save