You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

541 lines
22 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package Tools.Crawler.Util;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jfinal.kit.Kv;
import com.jfinal.plugin.activerecord.*;
import com.jfinal.plugin.activerecord.Record;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import static Tools.Crawler.Util.Util.doRequestWithRetry;
public class BookLesson {
// Thread-safe lists that accumulate the crawled rows until Start() writes them to the database
private static final CopyOnWriteArrayList<Record> subjectList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> lessonList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> schemeList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> bookList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> structureList = new CopyOnWriteArrayList<>();
// Progress counter: number of lesson resources collected so far
private static final AtomicInteger resourceCount = new AtomicInteger(0);
// Size of each worker thread pool
private static final int ThreadCount = 4;
// Maximum retry count passed to doRequestWithRetry
private static final int MaxRetryCount = 10;
// Number of rows per batch when saving to the database
private static final int batchSize = 300;
// Standard school names, keyed by organization name
public static Map<String, Kv> bzSchoolNameList = new HashMap<>();
// School names that were mapped by hand, keyed by the original (crawled) school name
public static Map<String, Kv> handMatchSchoolList = new HashMap<>();
// Ids of lessons whose school relation has already been matched
public static Set<String> lessonAlreadyMatch = new HashSet<>();
/**
 * Flattens a chapter tree into a list, emitting every node before its children.
 *
 * <p>For each node a new object with {@code nodeId} (taken from {@code key}),
 * {@code nodeName} (taken from {@code title}), {@code isLeaf} and
 * {@code parentValue} is appended to {@code tempTree}.
 *
 * @param treeArray nodes of the current level; {@code null} is treated as empty
 * @param tempTree  output list receiving the flattened nodes
 */
public static void traverseTree(JSONArray treeArray, JSONArray tempTree) {
    if (treeArray == null) {
        return;
    }
    for (Object item : treeArray) {
        JSONObject node = (JSONObject) item;
        JSONObject flat = new JSONObject();
        flat.put("nodeId", node.getString("key"));
        flat.put("nodeName", node.getString("title"));
        flat.put("isLeaf", node.getBoolean("isLeaf"));
        flat.put("parentValue", node.getString("parentValue"));
        tempTree.add(flat);
        // Look the children up once: the key may be present but hold a JSON null,
        // in which case calling isEmpty() on the result would throw.
        JSONArray children = node.getJSONArray("children");
        if (children != null && !children.isEmpty()) {
            traverseTree(children, tempTree);
        }
    }
}
/**
 * Writes a message to standard output, prefixed with the current time.
 *
 * @param msg text printed after the timestamp and a single space
 */
public static void print(String msg) {
    StringBuilder line = new StringBuilder();
    line.append(DateTime.now()).append(" ").append(msg);
    System.out.println(line.toString());
}
/**
 * Records one subject and then crawls the editions listed for it.
 *
 * @param subject subject object; {@code subjectCode} and {@code subjectName} are read from it
 * @param key     stage code, stored as {@code stage_id}
 * @param value   stage display name, passed through to the edition processing
 * @throws InterruptedException if waiting for the edition workers is interrupted
 */
private static void processSubject(JSONObject subject, String key, String value) throws InterruptedException {
    final String code = subject.getString("subjectCode");
    final String name = subject.getString("subjectName");

    Record subjectRow = new Record();
    subjectRow.set("subject_id", code);
    subjectRow.set("subject_name", name);
    subjectRow.set("stage_id", key);
    subjectList.add(subjectRow);

    JSONObject requestBody = new JSONObject();
    requestBody.put("subjectCode", code);
    requestBody.put("systemId", 1);
    String response = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/edition/list",
            requestBody.toString(), false, MaxRetryCount);
    if (response == null) {
        // No response for this subject: nothing further to crawl
        return;
    }
    // Subject recorded; continue with the editions published for it
    JSONArray editions = JSONObject.parseObject(response).getJSONArray("data");
    processSchemes(editions, code, key, value);
}
/**
 * Processes all editions ("schemes") of one subject on a fixed pool of
 * {@code ThreadCount} threads and waits until every one has finished.
 *
 * @param schemes     edition objects; {@code null} or empty means nothing to do
 * @param subjectCode code of the owning subject
 * @param key         stage code
 * @param value       stage display name
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
private static void processSchemes(JSONArray schemes, String subjectCode, String key, String value) throws InterruptedException {
    if (schemes == null || schemes.isEmpty()) {
        return;
    }
    ExecutorService schemeExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch schemeLatch = new CountDownLatch(schemes.size());
        for (int j = 0; j < schemes.size(); j++) {
            JSONObject scheme = schemes.getJSONObject(j);
            schemeExecutor.submit(() -> {
                try {
                    processScheme(scheme, subjectCode, key, value);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the pool thread instead of hiding it in the Future
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // submit() would otherwise store the failure in a Future nobody reads
                    e.printStackTrace();
                } finally {
                    schemeLatch.countDown();
                }
            });
        }
        schemeLatch.await();
    } catch (InterruptedException e) {
        // Stop outstanding workers before propagating the interruption
        schemeExecutor.shutdownNow();
        throw e;
    } finally {
        // Always release the pool threads, including on error paths
        schemeExecutor.shutdown();
    }
}
/**
 * Records one edition ("scheme") and crawls every book listed for it,
 * processing the books on a fixed pool of {@code ThreadCount} threads.
 *
 * @param scheme      edition object; {@code businessEditionId} and {@code editionName} are read from it
 * @param subjectCode code of the owning subject
 * @param key         stage code, stored as {@code stage_id}
 * @param value       stage display name
 * @throws InterruptedException if the calling thread is interrupted while waiting for the book workers
 */
private static void processScheme(JSONObject scheme, String subjectCode, String key, String value) throws InterruptedException {
    String businessEditionId = scheme.getString("businessEditionId");
    String editionName = scheme.getString("editionName");
    Record rScheme = new Record().set("scheme_id", businessEditionId).set("scheme_name", editionName)
            .set("subject_id", subjectCode).set("stage_id", key).set("id", UUID.randomUUID().toString());
    schemeList.add(rScheme);

    JSONObject argBook = new JSONObject();
    argBook.put("stageCode", key);
    argBook.put("subjectCode", subjectCode);
    argBook.put("businessEditionId", businessEditionId);
    // Uses the shared retry limit like every other request in this class
    // (this call previously hard-coded 3).
    String respBook = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/book/list",
            argBook.toString(), false, MaxRetryCount);
    if (respBook == null) {
        return;
    }
    JSONArray books = JSONObject.parseObject(respBook).getJSONArray("data");
    if (books == null || books.isEmpty()) {
        return;
    }

    // Process each book on its own worker
    ExecutorService bookExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch bookLatch = new CountDownLatch(books.size());
        for (int k = 0; k < books.size(); k++) {
            JSONObject book = books.getJSONObject(k);
            bookExecutor.submit(() -> {
                try {
                    // Chapter structure only; knowledge points are not crawled here
                    processBook(book, businessEditionId, subjectCode, key, value, editionName);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // submit() would otherwise store the failure in a Future nobody reads
                    e.printStackTrace();
                } finally {
                    bookLatch.countDown();
                }
            });
        }
        bookLatch.await();
    } catch (InterruptedException e) {
        bookExecutor.shutdownNow();
        throw e;
    } finally {
        // Always release the pool threads, including on error paths
        bookExecutor.shutdown();
    }
}
/**
 * Records one book (volume), fetches its chapter tree, flattens it and
 * processes every node on a fixed pool of {@code ThreadCount} threads.
 *
 * @param book              book object; {@code businessBookId} and {@code bookName} are read from it
 * @param businessEditionId id of the owning edition, stored as {@code scheme_id}
 * @param subjectCode       code of the owning subject
 * @param key               stage code, stored as {@code stage_id}
 * @param value             stage display name
 * @param editionName       display name of the owning edition
 * @throws InterruptedException if the calling thread is interrupted while waiting for the node workers
 */
private static void processBook(JSONObject book, String businessEditionId, String subjectCode,
                                String key, String value, String editionName) throws InterruptedException {
    String businessBookId = book.getString("businessBookId");
    String bookName = book.getString("bookName");
    Record rBook = new Record().set("book_id", businessBookId)
            .set("book_name", bookName).set("scheme_id", businessEditionId)
            .set("subject_id", subjectCode).set("stage_id", key)
            .set("id", UUID.randomUUID().toString());
    bookList.add(rBook);

    JSONObject argTree = new JSONObject();
    argTree.put("businessBookId", businessBookId);
    argTree.put("childrensFlag", 1);
    argTree.put("parentId", -1);
    argTree.put("searchKeyword", "");
    String respTree = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/chapter/tree",
            argTree.toString(), false, MaxRetryCount);
    if (respTree == null) {
        return;
    }
    // Both "data" and "tree" may be missing from the response; treat that as "no chapters"
    JSONObject data = JSONObject.parseObject(respTree).getJSONObject("data");
    JSONArray jsonArrTree = data == null ? null : data.getJSONArray("tree");
    if (jsonArrTree == null) {
        return;
    }
    JSONArray tempTree = new JSONArray();
    traverseTree(jsonArrTree, tempTree);
    if (tempTree.isEmpty()) {
        return;
    }

    // Process each flattened node on its own worker
    ExecutorService nodeExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch nodeLatch = new CountDownLatch(tempTree.size());
        for (int n = 0; n < tempTree.size(); n++) {
            JSONObject jsonTree = tempTree.getJSONObject(n);
            nodeExecutor.submit(() -> {
                try {
                    processNode(jsonTree, businessBookId, businessEditionId, subjectCode,
                            key, value, editionName, bookName);
                } catch (RuntimeException e) {
                    // submit() would otherwise store the failure in a Future nobody reads
                    e.printStackTrace();
                } finally {
                    nodeLatch.countDown();
                }
            });
        }
        nodeLatch.await();
    } catch (InterruptedException e) {
        nodeExecutor.shutdownNow();
        throw e;
    } finally {
        // Always release the pool threads, including on error paths
        nodeExecutor.shutdown();
    }
}
/**
 * Records one flattened chapter node and then collects the lesson resources attached to it.
 *
 * @param jsonTree          flattened node with {@code nodeId}, {@code nodeName}, {@code isLeaf}, {@code parentValue}
 * @param businessBookId    id of the owning book
 * @param businessEditionId id of the owning edition, stored as {@code scheme_id}
 * @param subjectCode       code of the owning subject
 * @param key               stage code, stored as {@code stage_id}
 * @param value             stage display name
 * @param editionName       display name of the owning edition
 * @param bookName          display name of the owning book
 */
private static void processNode(JSONObject jsonTree, String businessBookId, String businessEditionId,
                                String subjectCode, String key, String value, String editionName,
                                String bookName) {
    String nodeId = jsonTree.getString("nodeId");
    String nodeName = jsonTree.getString("nodeName");
    Boolean isLeaf = jsonTree.getBoolean("isLeaf");
    String parentValue = jsonTree.getString("parentValue");
    // isLeaf may be null when the source node carried no such field;
    // unboxing it directly would throw, so a missing flag counts as "not a leaf".
    int leafFlag = Boolean.TRUE.equals(isLeaf) ? 1 : 0;
    Record rStructure = new Record()
            .set("node_id", nodeId).set("node_name", nodeName).set("parent_id", parentValue)
            .set("is_leaf", leafFlag).set("book_id", businessBookId)
            .set("scheme_id", businessEditionId).set("subject_id", subjectCode)
            .set("stage_id", key).set("id", UUID.randomUUID().toString());
    structureList.add(rStructure);
    processResource(nodeId, businessBookId, subjectCode, key, value,
            editionName, bookName, nodeName);
}
/**
 * Fetches the on-demand lessons attached to one chapter node and appends one
 * row per lesson to {@code lessonList}.
 *
 * <p>NOTE(review): only page 1 with a page size of 100 is requested, so any lessons
 * beyond the first 100 of a node are not collected — confirm whether paging is needed.
 *
 * @param nodeId         id of the chapter node
 * @param businessBookId id of the owning book
 * @param subjectCode    code of the owning subject
 * @param key            stage code, stored as {@code stage_id}
 * @param value          stage display name (used in the progress message only)
 * @param editionName    edition display name (progress message, and written to {@code scheme_id})
 * @param bookName       book display name (progress message only)
 * @param nodeName       node display name (progress message only)
 */
private static void processResource(String nodeId, String businessBookId, String subjectCode,
                                    String key, String value, String editionName, String bookName, String nodeName) {
    JSONObject argSource = new JSONObject();
    argSource.put("pageNum", 1);
    argSource.put("pageSize", 100);
    argSource.put("businessBookId", businessBookId);
    argSource.put("nodeId", nodeId);
    argSource.put("stageCode", key);
    argSource.put("subjectCode", subjectCode);
    argSource.put("excellentFlag", "");
    argSource.put("nodeType", 1); // 1: textbook structure, 2: knowledge-point structure
    argSource.put("sortType", 2);
    argSource.put("source", "");
    argSource.put("searchKeyword", "");
    String respSource = doRequestWithRetry("https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage",
            argSource.toString(), false, MaxRetryCount);
    if (respSource == null) {
        return;
    }
    // "data" or "rows" may be missing from the response; treat that as "no lessons"
    JSONObject data = JSONObject.parseObject(respSource).getJSONObject("data");
    JSONArray jsonArrSource = data == null ? null : data.getJSONArray("rows");
    print("正在爬取:【" + value + editionName + bookName + nodeName + "】下的资源!");
    if (jsonArrSource == null) {
        return;
    }
    // Build the formatter once per call instead of once per row
    DateTimeFormatter publishTimeFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
    for (int m = 0; m < jsonArrSource.size(); m++) {
        JSONObject jsonSource = jsonArrSource.getJSONObject(m);
        LocalDate date = parsePublishDate(jsonSource.getString("publishTime"), publishTimeFormat);
        Record record = new Record()
                .set("lesson_id", jsonSource.getString("lessonId"))
                .set("lesson_name", jsonSource.getString("lessonName"))
                .set("node_id", nodeId)
                .set("teacher_id", jsonSource.getString("teacherId"))
                .set("teacher_school_id", jsonSource.getString("teacherSchoolId"))
                .set("teacher_school_name", jsonSource.getString("teacherSchoolName"))
                .set("teacher_name", jsonSource.getString("teacherName"))
                .set("book_id", businessBookId)
                // NOTE(review): the other tables store the edition id in scheme_id, but this
                // method only receives the edition *name* — confirm which one is intended.
                .set("scheme_id", editionName)
                .set("subject_id", subjectCode)
                .set("grade_code", jsonSource.getString("gradeCode"))
                .set("publish_time", date)
                .set("stage_id", key)
                // View and learning counters
                .set("preview_count", jsonSource.getIntValue("previewCount"))
                .set("learning_person_count", jsonSource.getIntValue("learningPersonCount"))
                .set("learning_person_times", jsonSource.getIntValue("learningPersonTimes"))
                .set("id", UUID.randomUUID().toString());
        lessonList.add(record);
        int count = resourceCount.incrementAndGet();
        if (count % 100 == 0) {
            print("已收集资源数量:" + count + "个。");
        }
    }
}

/**
 * Parses a publish timestamp and returns its date part, or {@code null} when the
 * text is missing or does not match the format, so that one bad row does not
 * abort the rest of the node.
 */
private static LocalDate parsePublishDate(String text, DateTimeFormatter format) {
    if (text == null || text.trim().isEmpty()) {
        return null;
    }
    try {
        return LocalDateTime.parse(text, format).toLocalDate();
    } catch (java.time.format.DateTimeParseException e) {
        print("无法解析发布时间:" + text);
        return null;
    }
}
/**
 * Loads the standard school names via the {@code YunXiao.getBzSchoolNameList} SQL template.
 *
 * @return map from {@code organization_name} to a {@code Kv} holding
 *         {@code organization_no}, {@code gather_regionc} and {@code school_running_type};
 *         when a name occurs more than once the last row wins
 */
public static Map<String, Kv> getBzSchoolNameList() {
    List<Record> rows = Db.find(Db.getSqlPara("YunXiao.getBzSchoolNameList"));
    Map<String, Kv> byName = new HashMap<>();
    rows.forEach(row -> {
        String standardName = row.getStr("organization_name");
        Kv details = Kv.create()
                .set("organization_no", row.getStr("organization_no"))
                .set("gather_regionc", row.getStr("gather_regionc"))
                .set("school_running_type", row.getStr("school_running_type"));
        byName.put(standardName, details);
    });
    return byName;
}
/**
 * Loads the school names that were mapped by hand ({@code match_type=2}) from
 * {@code t_crawler_lesson_school}.
 *
 * @return map from {@code original_school_name} to a {@code Kv} holding
 *         {@code organization_name}, {@code organization_no}, {@code gather_regionc}
 *         and {@code school_running_type}; when a name occurs more than once the last row wins
 */
public static Map<String, Kv> getHandMatchSchoolList() {
    String sql = "select * from t_crawler_lesson_school where match_type=2";
    List<Record> list = Db.find(sql);
    Map<String, Kv> map = new HashMap<>();
    for (Record record : list) {
        String original_school_name = record.getStr("original_school_name");
        Kv kv = Kv.create();
        kv.set("organization_name", record.getStr("organization_name"));
        kv.set("organization_no", record.getStr("organization_no"));
        kv.set("gather_regionc", record.getStr("gather_regionc"));
        // Start() reads this key from the hand-matched entries; without it every
        // hand-matched lesson is written back with a null school_running_type.
        kv.set("school_running_type", record.getStr("school_running_type"));
        map.put(original_school_name, kv);
    }
    return map;
}
/**
 * Loads the ids of lessons that already have a matched school
 * ({@code match_type>0}) in {@code t_crawler_lesson_school}.
 *
 * @return set of {@code lesson_id} values
 */
public static Set<String> getLessonAlreadyMatch() {
    return Db.find("select * from t_crawler_lesson_school where match_type>0")
            .stream()
            .map(row -> row.getStr("lesson_id"))
            .collect(Collectors.toCollection(HashSet::new));
}
/**
 * Runs a full crawl: loads the school lookup tables, crawls every stage in
 * parallel, replaces the contents of the crawler tables with the collected
 * rows, and records the lesson-to-school relations for lessons that have not
 * been matched before.
 *
 * <p>The database is left untouched when the crawl is interrupted or when no
 * lesson was collected, so a failed run does not wipe the existing tables.
 */
public static void Start() {
    Map<String, String> map = new HashMap<>();
    map.put("1", "学前");
    map.put("2", "小学");
    map.put("3", "初中");
    map.put("4", "高中");

    // The accumulators are static; reset them so a repeated call does not
    // save the rows of the previous run a second time.
    subjectList.clear();
    lessonList.clear();
    schemeList.clear();
    bookList.clear();
    structureList.clear();
    resourceCount.set(0);

    // Initialise the three global lookups
    bzSchoolNameList = getBzSchoolNameList();
    handMatchSchoolList = getHandMatchSchoolList();
    lessonAlreadyMatch = getLessonAlreadyMatch();
    print("开始爬取数据!");
    long startTime = System.currentTimeMillis();

    // One task per stage
    ExecutorService executorService = Executors.newFixedThreadPool(10);
    CountDownLatch mainLatch = new CountDownLatch(map.size());
    try {
        map.forEach((key, value) -> {
            executorService.submit(() -> {
                try {
                    crawlStage(key, value);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    mainLatch.countDown();
                }
            });
        });
        mainLatch.await();
    } catch (InterruptedException e) {
        // An interrupted crawl is incomplete: stop the workers and keep the existing tables
        Thread.currentThread().interrupt();
        executorService.shutdownNow();
        print("爬取被中断,已放弃写入数据库。");
        return;
    } finally {
        executorService.shutdown();
    }

    if (lessonList.isEmpty()) {
        // Nothing was collected (e.g. every request failed); do not truncate the tables
        print("未爬取到任何资源,跳过清表与保存。");
        return;
    }

    persistCrawledData();

    print("开始记录课程与学校的关系...");
    Db.batchSave("t_crawler_lesson_school", buildLessonSchoolRecords(), batchSize);

    print("爬取数据完成!");
    print("总共收集资源:" + lessonList.size() + "个");
    print("总共收集节点:" + structureList.size() + "个");
    long endTime = System.currentTimeMillis();
    print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
}

/** Crawls every subject of one stage, processing the subjects on a fixed pool of ThreadCount threads. */
private static void crawlStage(String key, String value) throws InterruptedException {
    String respSubject = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/subject/list/" + key,
            null, true, MaxRetryCount);
    if (respSubject == null) {
        return;
    }
    // "data" or "rows" may be missing from the response; treat that as "no subjects"
    JSONObject data = JSONObject.parseObject(respSubject).getJSONObject("data");
    JSONArray subjects = data == null ? null : data.getJSONArray("rows");
    if (subjects == null || subjects.isEmpty()) {
        return;
    }
    ExecutorService subjectExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch subjectLatch = new CountDownLatch(subjects.size());
        for (int i = 0; i < subjects.size(); i++) {
            JSONObject subject = subjects.getJSONObject(i);
            subjectExecutor.submit(() -> {
                try {
                    processSubject(subject, key, value);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // submit() would otherwise store the failure in a Future nobody reads
                    e.printStackTrace();
                } finally {
                    subjectLatch.countDown();
                }
            });
        }
        subjectLatch.await();
    } catch (InterruptedException e) {
        subjectExecutor.shutdownNow();
        throw e;
    } finally {
        subjectExecutor.shutdown();
    }
}

/** Empties the crawler tables and writes the rows collected by the current run. */
private static void persistCrawledData() {
    String[] tables = {"t_crawler_subject", "t_crawler_scheme", "t_crawler_book",
            "t_crawler_structure", "t_crawler_lesson"};
    for (String table : tables) {
        Db.update("truncate table " + table);
    }
    // Unmatched relations are rebuilt on every run; matched ones are kept
    Db.update("delete from t_crawler_lesson_school where match_type=0;");

    print("开始保存数据...");
    Db.batchSave("t_crawler_subject", subjectList, batchSize);
    Db.batchSave("t_crawler_scheme", schemeList, batchSize);
    Db.batchSave("t_crawler_book", bookList, batchSize);
    Db.batchSave("t_crawler_structure", structureList, batchSize);
    Db.batchSave("t_crawler_lesson", lessonList, batchSize);
}

/**
 * Builds one lesson-to-school relation per not-yet-matched lesson, de-duplicated by lesson id.
 * match_type is 1 for an exact standard-name hit, 2 for a hand-mapped name, 0 when still unmatched.
 */
private static List<Record> buildLessonSchoolRecords() {
    // The first record per lesson_id wins; result is ordered by lesson_id, nulls first
    Comparator<Record> byLessonId = Comparator.comparing(
            (Record r) -> r.getStr("lesson_id"),
            Comparator.nullsFirst(Comparator.<String>naturalOrder()));
    Set<Record> unique = new TreeSet<>(byLessonId);

    for (Record record : lessonList) {
        String lesson_id = record.getStr("lesson_id");
        if (lessonAlreadyMatch.contains(lesson_id)) {
            continue; // relation already recorded by an earlier run
        }
        String original_school_name = record.getStr("teacher_school_name"); // may be null
        String organization_name = "";
        String organization_no = "";
        String gather_regionc = "";
        String school_running_type = "";
        int match_type;
        if (bzSchoolNameList.containsKey(original_school_name)) {
            // Exact hit on a standard name
            Kv kv = bzSchoolNameList.get(original_school_name);
            organization_name = original_school_name;
            organization_no = kv.getStr("organization_no");
            gather_regionc = kv.getStr("gather_regionc");
            school_running_type = kv.getStr("school_running_type");
            match_type = 1;
        } else if (handMatchSchoolList.containsKey(original_school_name)) {
            // Name that was mapped by hand
            Kv kv = handMatchSchoolList.get(original_school_name);
            organization_name = kv.getStr("organization_name");
            organization_no = kv.getStr("organization_no");
            gather_regionc = kv.getStr("gather_regionc");
            school_running_type = kv.getStr("school_running_type");
            match_type = 2;
        } else {
            match_type = 0; // still to be matched
        }
        Record rWrite = new Record();
        rWrite.set("lesson_id", lesson_id);
        rWrite.set("original_school_name", original_school_name);
        rWrite.set("organization_name", organization_name);
        rWrite.set("organization_no", organization_no);
        rWrite.set("gather_regionc", gather_regionc);
        rWrite.set("match_type", match_type);
        rWrite.set("teacher_name", record.getStr("teacher_name"));
        rWrite.set("school_running_type", school_running_type);
        unique.add(rWrite);
    }
    return new ArrayList<>(unique);
}
}