main
黄海 6 months ago
parent 202c1a5627
commit 20f0aed9fd

@ -1,22 +1,28 @@
package Tools.Crawler;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.Record;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
public class YunXiaoKnowledge {
// Thread-safe list (wrapped with Collections.synchronizedList) that accumulates the lesson records
// produced by collectLesson before they are saved in bulk.
public static List<Record> lessonList = Collections.synchronizedList(new ArrayList<>());
// NOTE(review): presumably the chunk size used when batch-saving records; its use is not visible here — confirm.
public static int batchSize = 100;
// Number of knowledge nodes processed so far; incremented by collectLesson for progress output.
private static final AtomicInteger processedCount = new AtomicInteger(0);
private static int totalCount = 0; // Total number of knowledge nodes to process; assigned in main before lesson collection starts.
// 定义线程池
private static final ExecutorService executorService = Executors.newFixedThreadPool(
Math.max(4, Runtime.getRuntime().availableProcessors())
@ -54,6 +60,7 @@ public class YunXiaoKnowledge {
return tasks;
}
/**
*
*/
@ -86,61 +93,74 @@ public class YunXiaoKnowledge {
}
}
}
/**
*
*/
public static void collectLesson(String nodeId, String stageCode, String subjectCode) {
String url = "https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage";
JSONObject argBook = new JSONObject();
argBook.put("nodeId", nodeId);
argBook.put("nodeType", 2);
argBook.put("pageNum", 1);
argBook.put("pageSize", 10000);
argBook.put("sortType", 2);
argBook.put("stageCode", stageCode);
argBook.put("subjectCode", subjectCode);
String respBook = YunXiaoBook.doRequestWithRetry(url, argBook.toString(), false, 3);
if (respBook == null) return;
try {
JSONArray jsonArrSource = JSONObject.parseObject(respBook)
.getJSONObject("data")
.getJSONArray("rows");
for (int m = 0; m < jsonArrSource.size(); m++) {
JSONObject jsonSource = jsonArrSource.getJSONObject(m);
LocalDateTime dateTime = LocalDateTime.parse(
jsonSource.getString("publishTime"),
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
);
String url = "https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage";
JSONObject argBook = new JSONObject();
argBook.put("nodeId", nodeId);
argBook.put("nodeType", 2);
argBook.put("pageNum", 1);
argBook.put("pageSize", 10000);
argBook.put("sortType", 2);
argBook.put("stageCode", stageCode);
argBook.put("subjectCode", subjectCode);
String respBook = YunXiaoBook.doRequestWithRetry(url, argBook.toString(), false, 3);
Record record = new Record()
.set("lesson_id", jsonSource.getString("lessonId"))
.set("lesson_name", jsonSource.getString("lessonName"))
.set("node_id", nodeId)
.set("teacher_id", jsonSource.getString("teacherId"))
.set("teacher_school_id", jsonSource.getString("teacherSchoolId"))
.set("teacher_school_name", jsonSource.getString("teacherSchoolName"))
.set("teacher_name", jsonSource.getString("teacherName"))
.set("book_id", null)
.set("scheme_id", null)
.set("subject_id", subjectCode)
.set("grade_code", jsonSource.getString("gradeCode"))
.set("publish_time", dateTime.toLocalDate())
.set("stage_id", stageCode)
.set("preview_count", jsonSource.getIntValue("previewCount"))
.set("learning_person_count", jsonSource.getIntValue("learningPersonCount"))
.set("learning_person_times", jsonSource.getIntValue("learningPersonTimes"))
.set("id", UUID.randomUUID().toString())
.set("node_type", 2);
lessonList.add(record);
if (respBook != null) {
JSONArray jsonArrSource = JSONObject.parseObject(respBook)
.getJSONObject("data")
.getJSONArray("rows");
// 添加进度输出
int current = processedCount.incrementAndGet();
if (current % 10 == 0 || current == totalCount) { // 每处理10个节点输出一次进度
System.out.println(DateTime.now()+" "+String.format("进度: %d/%d (%.2f%%), 当前节点课程数: %d",
current, totalCount,
(current * 100.0 / totalCount),
jsonArrSource == null ? 0 : jsonArrSource.size()));
}
if (jsonArrSource != null && !jsonArrSource.isEmpty()) {
for (int m = 0; m < jsonArrSource.size(); m++) {
JSONObject jsonSource = jsonArrSource.getJSONObject(m);
LocalDateTime dateTime = LocalDateTime.parse(
jsonSource.getString("publishTime"),
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
);
Record record = new Record()
.set("lesson_id", jsonSource.getString("lessonId"))
.set("lesson_name", jsonSource.getString("lessonName"))
.set("node_id", nodeId)
.set("teacher_id", jsonSource.getString("teacherId"))
.set("teacher_school_id", jsonSource.getString("teacherSchoolId"))
.set("teacher_school_name", jsonSource.getString("teacherSchoolName"))
.set("teacher_name", jsonSource.getString("teacherName"))
.set("book_id", null)
.set("scheme_id", null)
.set("subject_id", subjectCode)
.set("grade_code", jsonSource.getString("gradeCode"))
.set("publish_time", dateTime.toLocalDate())
.set("stage_id", stageCode)
.set("preview_count", jsonSource.getIntValue("previewCount"))
.set("learning_person_count", jsonSource.getIntValue("learningPersonCount"))
.set("learning_person_times", jsonSource.getIntValue("learningPersonTimes"))
.set("id", UUID.randomUUID().toString())
.set("node_type", 2);
lessonList.add(record);
}
}
}
} catch (Exception e) {
System.err.println("处理课程数据出错: " + e.getMessage());
System.out.println(DateTime.now()+" 处理课程数据出错 [nodeId=" + nodeId + "]: " + e.getMessage());
}
}
/**
*
*/
@ -155,6 +175,7 @@ public class YunXiaoKnowledge {
Db.batchSave(tableName, batch, batch.size());
}
}
public static void main(String[] args) {
PgInit.Init();
// 清空知识点表
@ -166,7 +187,7 @@ public class YunXiaoKnowledge {
Map<String, String> subjectMap = loadSubjectMap();
List<Map<String, String>> tasks = loadTasks();
System.out.println("加载到 " + stageMap.size() + " 个学段, " + subjectMap.size() + " 个学科");
System.out.println(DateTime.now()+" 加载到 " + stageMap.size() + " 个学段, " + subjectMap.size() + " 个学科");
for (Map<String, String> task : tasks) {
String stageCode = task.get("stageCode");
@ -178,7 +199,7 @@ public class YunXiaoKnowledge {
argBook.put("stageCode", stageCode);
argBook.put("subjectCode", subjectCode);
System.out.printf("开始获取学段[%s-%s]学科[%s-%s]的数据...\n",
System.out.printf(DateTime.now() + " 开始获取学段[%s-%s]学科[%s-%s]的数据...\n",
stageCode, stageMap.get(stageCode),
subjectCode, subjectMap.get(subjectCode));
String url = "https://yx.ccsjy.cn/api/business/v1/knowledge/tree";
@ -193,26 +214,32 @@ public class YunXiaoKnowledge {
collectNodes(tree, allRecords, stageCode, subjectCode, stageMap, subjectMap);
System.out.printf("学段[%s]学科[%s]共收集到 %d 条记录\n",
System.out.printf(DateTime.now() + " 学段[%s]学科[%s]共收集到 %d 条记录\n",
stageMap.get(stageCode),
subjectMap.get(subjectCode),
allRecords.size());
Db.batchSave("t_crawler_structure_knowledge", allRecords, allRecords.size());
} catch (Exception e) {
System.err.println("处理数据时出错: " + e.getMessage());
System.err.println(DateTime.now() + " 处理数据时出错: " + e.getMessage());
e.printStackTrace();
}
} else {
System.err.printf("获取学段[%s]学科[%s]数据失败!\n",
System.err.printf(DateTime.now() + " 获取学段[%s]学科[%s]数据失败!\n",
stageMap.get(stageCode),
subjectMap.get(subjectCode));
}
}
//遍历所有的知识点
// 第二阶段:多线程收集课程信息
System.out.println("开始收集课程信息...");
System.out.println(DateTime.now()+" 开始收集课程信息...");
List<Record> listKnowledge = Db.find("select * from t_crawler_structure_knowledge");
totalCount = listKnowledge.size(); // 设置总数
System.out.println(DateTime.now()+" 共有 " + totalCount + " 个知识点需要处理");
// 重置计数器
processedCount.set(0);
// 创建任务列表
List<CompletableFuture<Void>> futures = new ArrayList<>();
@ -227,16 +254,17 @@ public class YunXiaoKnowledge {
futures.add(future);
}
// 等待所有任务完成
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
// 清理旧数据并保存新数据
System.out.println("开始保存课程数据...");
System.out.println(DateTime.now() + " 开始保存课程数据...");
Db.update("delete from t_crawler_lesson where node_type=2");
batchSaveData(lessonList, "t_crawler_lesson");
// 关闭线程池
executorService.shutdown();
System.out.println("数据处理完成! 共处理 " + lessonList.size() + " 条课程数据");
System.out.println(DateTime.now() + " 数据处理完成! 共处理 " + lessonList.size() + " 条课程数据");
}
}

Loading…
Cancel
Save