You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

533 lines
22 KiB

6 months ago
package Tools.Crawler.Util;
7 months ago
7 months ago
import cn.hutool.core.date.DateTime;
7 months ago
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
7 months ago
import com.jfinal.kit.Kv;
import com.jfinal.plugin.activerecord.*;
7 months ago
import com.jfinal.plugin.activerecord.Record;
7 months ago
6 months ago
import java.time.LocalDate;
6 months ago
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
7 months ago
import java.util.*;
7 months ago
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
7 months ago
import java.util.stream.Collectors;
7 months ago
6 months ago
import static Tools.Crawler.Util.Util.doRequestWithRetry;
6 months ago
public class BookLesson {
7 months ago
// 使用线程安全的集合
private static final CopyOnWriteArrayList<Record> subjectList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> lessonList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> schemeList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> bookList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> structureList = new CopyOnWriteArrayList<>();
// 用于记录进度
private static final AtomicInteger resourceCount = new AtomicInteger(0);
7 months ago
7 months ago
//线程池大小
7 months ago
private static final int ThreadCount = 4;
7 months ago
//最大重试次数
7 months ago
private static final int MaxRetryCount = 10;
7 months ago
//每次写入的数量
private static final int batchSize = 300;
7 months ago
//标准名称
public static Map<String, Kv> bzSchoolNameList = new HashMap<>();
//手工映射过名称
public static Map<String, Kv> handMatchSchoolList = new HashMap<>();
//哪些课程已经匹配过
public static Set<String> lessonAlreadyMatch = new HashSet<>();
7 months ago
public static void traverseTree(JSONArray treeArray, JSONArray tempTree) {
7 months ago
for (Object item : treeArray) {
JSONObject node = (JSONObject) item;
JSONObject jsonObj = new JSONObject();
jsonObj.put("nodeId", node.getString("key"));
jsonObj.put("nodeName", node.getString("title"));
jsonObj.put("isLeaf", node.getBoolean("isLeaf"));
jsonObj.put("parentValue", node.getString("parentValue"));
tempTree.add(jsonObj);
if (node.containsKey("children") && !node.getJSONArray("children").isEmpty()) {
7 months ago
traverseTree(node.getJSONArray("children"), tempTree);
7 months ago
}
}
}
7 months ago
/**
*
*
* @param msg
*/
7 months ago
public static void print(String msg) {
System.out.println(DateTime.now() + " " + msg);
}
7 months ago
/**
 * Records one subject and then crawls the editions ("schemes") listed for it.
 *
 * @param subject subject entry; {@code subjectCode} and {@code subjectName} are read from it
 * @param key     stage id, stored as {@code stage_id}
 * @param value   stage name, only passed down to the next level
 * @throws InterruptedException if waiting for the edition workers is interrupted
 */
private static void processSubject(JSONObject subject, String key, String value) throws InterruptedException {
    final String code = subject.getString("subjectCode");
    final String name = subject.getString("subjectName");

    Record subjectRow = new Record();
    subjectRow.set("subject_id", code);
    subjectRow.set("subject_name", name);
    subjectRow.set("stage_id", key);
    subjectList.add(subjectRow);

    JSONObject query = new JSONObject();
    query.put("subjectCode", code);
    query.put("systemId", 1);
    String body = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/edition/list",
            query.toString(), false, MaxRetryCount);
    if (body == null) {
        return;
    }
    // With the subject recorded, walk the editions published for it.
    JSONArray editions = JSONObject.parseObject(body).getJSONArray("data");
    processSchemes(editions, code, key, value);
}
/**
 * Processes every edition ("scheme") of a subject on a dedicated fixed-size thread pool
 * and blocks until all of them have finished.
 *
 * @param schemes     edition entries; {@code null} or empty means there is nothing to do
 * @param subjectCode subject the editions belong to
 * @param key         stage id
 * @param value       stage name
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
private static void processSchemes(JSONArray schemes, String subjectCode, String key, String value) throws InterruptedException {
    if (schemes == null || schemes.isEmpty()) {
        return;
    }
    ExecutorService schemeExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch schemeLatch = new CountDownLatch(schemes.size());
        for (int j = 0; j < schemes.size(); j++) {
            JSONObject scheme = schemes.getJSONObject(j);
            schemeExecutor.submit(() -> {
                try {
                    processScheme(scheme, subjectCode, key, value);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible instead of wrapping it in an exception
                    // that would vanish inside the discarded Future.
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // Nobody reads the Future returned by submit(), so report failures here.
                    e.printStackTrace();
                } finally {
                    schemeLatch.countDown();
                }
            });
        }
        schemeLatch.await();
    } finally {
        // Runs on the interrupted path too, so the pool threads are always released.
        schemeExecutor.shutdown();
    }
}
/**
*
*
* @param scheme
* @param subjectCode
* @param key
* @param value
*/
private static void processScheme(JSONObject scheme, String subjectCode, String key, String value) throws InterruptedException {
String businessEditionId = scheme.getString("businessEditionId");
String editionName = scheme.getString("editionName");
Record rScheme = new Record().set("scheme_id", businessEditionId).set("scheme_name", editionName).set("subject_id", subjectCode)
.set("stage_id", key).set("id", UUID.randomUUID().toString());
schemeList.add(rScheme);
JSONObject argBook = new JSONObject();
argBook.put("stageCode", key);
argBook.put("subjectCode", subjectCode);
argBook.put("businessEditionId", businessEditionId);
String respBook = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/book/list", argBook.toString(), false, 3);
if (respBook != null) {
JSONArray books = JSONObject.parseObject(respBook).getJSONArray("data");
// 使用线程池处理每本书
ExecutorService bookExecutor = Executors.newFixedThreadPool(ThreadCount);
CountDownLatch bookLatch = new CountDownLatch(books.size());
for (int k = 0; k < books.size(); k++) {
JSONObject book = books.getJSONObject(k);
bookExecutor.submit(() -> {
try {
6 months ago
//章节目录
7 months ago
processBook(book, businessEditionId, subjectCode, key, value, editionName);
6 months ago
//知识点
7 months ago
} catch (InterruptedException e) {
throw new RuntimeException(e);
} finally {
bookLatch.countDown();
}
});
}
bookLatch.await();
bookExecutor.shutdown();
}
}
/**
 * Records one book, downloads its chapter tree, flattens it and processes every node
 * on a dedicated thread pool, blocking until all nodes are done.
 *
 * @param book              book entry; {@code businessBookId} and {@code bookName} are read
 * @param businessEditionId id of the edition the book belongs to
 * @param subjectCode       subject the book belongs to
 * @param key               stage id
 * @param value             stage name, only used in progress messages further down
 * @param editionName       edition name, passed down to the node level
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
private static void processBook(JSONObject book, String businessEditionId, String subjectCode,
        String key, String value, String editionName) throws InterruptedException {
    String businessBookId = book.getString("businessBookId");
    String bookName = book.getString("bookName");
    Record rBook = new Record().set("book_id", businessBookId)
            .set("book_name", bookName).set("scheme_id", businessEditionId)
            .set("subject_id", subjectCode).set("stage_id", key)
            .set("id", UUID.randomUUID().toString());
    bookList.add(rBook);

    JSONObject argTree = new JSONObject();
    argTree.put("businessBookId", businessBookId);
    argTree.put("childrensFlag", 1);
    argTree.put("parentId", -1);
    argTree.put("searchKeyword", "");
    String respTree = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/chapter/tree",
            argTree.toString(), false, MaxRetryCount);
    if (respTree == null) {
        return;
    }
    // A response without "data" or "tree" means there are no chapters to walk.
    JSONObject parsed = JSONObject.parseObject(respTree);
    JSONObject data = parsed == null ? null : parsed.getJSONObject("data");
    JSONArray jsonArrTree = data == null ? null : data.getJSONArray("tree");
    if (jsonArrTree == null) {
        return;
    }
    JSONArray tempTree = new JSONArray();
    traverseTree(jsonArrTree, tempTree);
    if (tempTree.isEmpty()) {
        return;
    }

    // One pool per book; each task handles one chapter node.
    ExecutorService nodeExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch nodeLatch = new CountDownLatch(tempTree.size());
        for (int n = 0; n < tempTree.size(); n++) {
            JSONObject jsonTree = tempTree.getJSONObject(n);
            nodeExecutor.submit(() -> {
                try {
                    processNode(jsonTree, businessBookId, businessEditionId, subjectCode,
                            key, value, editionName, bookName);
                } catch (RuntimeException e) {
                    // Nobody reads the Future returned by submit(), so report failures here.
                    e.printStackTrace();
                } finally {
                    nodeLatch.countDown();
                }
            });
        }
        nodeLatch.await();
    } finally {
        // Runs on the interrupted path too, so the pool threads are always released.
        nodeExecutor.shutdown();
    }
}
/**
 * Records one chapter node and then collects the lessons attached to it.
 *
 * @param jsonTree          flattened node with {@code nodeId}, {@code nodeName},
 *                          {@code isLeaf} and {@code parentValue}
 * @param businessBookId    id of the book the node belongs to
 * @param businessEditionId id of the edition, stored as {@code scheme_id}
 * @param subjectCode       subject code
 * @param key               stage id
 * @param value             stage name, used in progress messages
 * @param editionName       edition name, passed on to the lesson level
 * @param bookName          book name, used in progress messages
 */
private static void processNode(JSONObject jsonTree, String businessBookId, String businessEditionId,
        String subjectCode, String key, String value, String editionName,
        String bookName) {
    String nodeId = jsonTree.getString("nodeId");
    String nodeName = jsonTree.getString("nodeName");
    // isLeaf can be absent; a missing flag is stored as 0 instead of failing on unboxing.
    int leafFlag = Boolean.TRUE.equals(jsonTree.getBoolean("isLeaf")) ? 1 : 0;
    String parentValue = jsonTree.getString("parentValue");
    Record rStructure = new Record()
            .set("node_id", nodeId).set("node_name", nodeName).set("parent_id", parentValue)
            .set("is_leaf", leafFlag).set("book_id", businessBookId)
            .set("scheme_id", businessEditionId).set("subject_id", subjectCode)
            .set("stage_id", key).set("id", UUID.randomUUID().toString());
    structureList.add(rStructure);
    processResource(nodeId, businessBookId, subjectCode, key, value,
            editionName, bookName, nodeName);
}
/**
*
*
* @param nodeId
* @param businessBookId
* @param subjectCode
* @param key
* @param value
* @param editionName
* @param bookName
* @param nodeName
*/
private static void processResource(String nodeId, String businessBookId, String subjectCode,
String key, String value, String editionName, String bookName, String nodeName) {
JSONObject argSource = new JSONObject();
argSource.put("pageNum", 1);
argSource.put("pageSize", 100);
argSource.put("businessBookId", businessBookId);
argSource.put("nodeId", nodeId);
argSource.put("stageCode", key);
argSource.put("subjectCode", subjectCode);
argSource.put("excellentFlag", "");
6 months ago
argSource.put("nodeType", 1);//教材体系 2:知识点体系
7 months ago
argSource.put("sortType", 2);
argSource.put("source", "");
argSource.put("searchKeyword", "");
String respSource = doRequestWithRetry("https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage",
argSource.toString(), false, MaxRetryCount);
if (respSource != null) {
JSONObject jsonObjSource = JSONObject.parseObject(respSource);
JSONArray jsonArrSource = jsonObjSource.getJSONObject("data").getJSONArray("rows");
print("正在爬取:【" + value + "" + editionName + "" + bookName + "" + nodeName + "】下的资源!");
for (int m = 0; m < jsonArrSource.size(); m++) {
JSONObject jsonSource = jsonArrSource.getJSONObject(m);
6 months ago
6 months ago
// 解析日期时间字符串
6 months ago
String publishTimeStr = jsonSource.getString("publishTime");
6 months ago
LocalDateTime dateTime = LocalDateTime.parse(publishTimeStr,
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
LocalDate date = dateTime.toLocalDate();
6 months ago
7 months ago
Record record = new Record()
.set("lesson_id", jsonSource.getString("lessonId"))
.set("lesson_name", jsonSource.getString("lessonName"))
.set("node_id", nodeId)
.set("teacher_id", jsonSource.getString("teacherId"))
.set("teacher_school_id", jsonSource.getString("teacherSchoolId"))
.set("teacher_school_name", jsonSource.getString("teacherSchoolName"))
.set("teacher_name", jsonSource.getString("teacherName"))
.set("book_id", businessBookId)
.set("scheme_id", editionName)
.set("subject_id", subjectCode)
6 months ago
.set("grade_code", jsonSource.getString("gradeCode"))
6 months ago
.set("publish_time", date)
7 months ago
.set("stage_id", key)
7 months ago
//增加学习次数
7 months ago
.set("preview_count", jsonSource.getIntValue("previewCount"))
.set("learning_person_count", jsonSource.getIntValue("learningPersonCount"))
.set("learning_person_times", jsonSource.getIntValue("learningPersonTimes"))
7 months ago
.set("id", UUID.randomUUID().toString());
lessonList.add(record);
int count = resourceCount.incrementAndGet();
if (count % 100 == 0) {
print("已收集资源数量:" + count + "个。");
}
}
}
}
7 months ago
7 months ago
/**
 * Loads the standard school names using the SQL template
 * {@code YunXiao.getBzSchoolNameList}.
 *
 * @return map keyed by {@code organization_name}; each value holds that row's
 *         {@code organization_no} and {@code gather_regionc}
 */
public static Map<String, Kv> getBzSchoolNameList() {
    List<Record> rows = Db.find(Db.getSqlPara("YunXiao.getBzSchoolNameList"));
    Map<String, Kv> byName = new HashMap<>();
    rows.forEach(row -> {
        Kv info = Kv.create();
        info.set("organization_no", row.getStr("organization_no"));
        info.set("gather_regionc", row.getStr("gather_regionc"));
        byName.put(row.getStr("organization_name"), info);
    });
    return byName;
}
/**
 * Loads the rows of {@code t_crawler_lesson_school} with {@code match_type=2}.
 *
 * @return map keyed by {@code original_school_name}; each value holds that row's
 *         {@code organization_name}, {@code organization_no} and {@code gather_regionc}
 */
public static Map<String, Kv> getHandMatchSchoolList() {
    List<Record> rows = Db.find("select * from t_crawler_lesson_school where match_type=2");
    Map<String, Kv> byOriginalName = new HashMap<>();
    rows.forEach(row -> {
        Kv target = Kv.create();
        target.set("organization_name", row.getStr("organization_name"));
        target.set("organization_no", row.getStr("organization_no"));
        target.set("gather_regionc", row.getStr("gather_regionc"));
        byOriginalName.put(row.getStr("original_school_name"), target);
    });
    return byOriginalName;
}
/**
*
*
* @return
*/
public static Set<String> getLessonAlreadyMatch() {
6 months ago
String sql = "select * from t_crawler_lesson_school where match_type>0";
7 months ago
List<Record> list = Db.find(sql);
Set<String> set = new HashSet<>();
for (Record record : list) {
set.add(record.getStr("lesson_id"));
}
return set;
}
6 months ago
public static void Start() {
7 months ago
Map<String, String> map = new HashMap<>();
map.put("1", "学前");
map.put("2", "小学");
map.put("3", "初中");
map.put("4", "高中");
7 months ago
//初始化三个全局量
bzSchoolNameList = getBzSchoolNameList();
handMatchSchoolList = getHandMatchSchoolList();
lessonAlreadyMatch = getLessonAlreadyMatch();
7 months ago
print("开始爬取数据!");
7 months ago
//记录开始时间
long startTime = System.currentTimeMillis();
// 创建线程池
ExecutorService executorService = Executors.newFixedThreadPool(10);
CountDownLatch mainLatch = new CountDownLatch(map.size());
7 months ago
map.forEach((key, value) -> {
7 months ago
executorService.submit(() -> {
try {
String respSubject = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/subject/list/" + key,
null, true, MaxRetryCount);
if (respSubject != null) {
JSONObject jsonObj = JSONObject.parseObject(respSubject);
JSONArray subjects = jsonObj.getJSONObject("data").getJSONArray("rows");
ExecutorService subjectExecutor = Executors.newFixedThreadPool(ThreadCount);
CountDownLatch subjectLatch = new CountDownLatch(subjects.size());
for (int i = 0; i < subjects.size(); i++) {
JSONObject subject = subjects.getJSONObject(i);
subjectExecutor.submit(() -> {
try {
processSubject(subject, key, value);
} catch (InterruptedException e) {
throw new RuntimeException(e);
} finally {
subjectLatch.countDown();
}
});
7 months ago
}
7 months ago
subjectLatch.await();
subjectExecutor.shutdown();
7 months ago
}
7 months ago
} catch (Exception e) {
e.printStackTrace();
} finally {
mainLatch.countDown();
7 months ago
}
7 months ago
});
7 months ago
});
7 months ago
try {
mainLatch.await();
} catch (InterruptedException e) {
e.printStackTrace();
}
executorService.shutdown();
7 months ago
// 清空表
String[] tables = {"t_crawler_subject", "t_crawler_scheme", "t_crawler_book",
"t_crawler_structure", "t_crawler_lesson"};
for (String table : tables) {
Db.update("truncate table " + table);
}
6 months ago
String sql = "delete from t_crawler_lesson_school where match_type=0;";
Db.update(sql);
7 months ago
// 保存数据
7 months ago
print("开始保存数据...");
7 months ago
Db.batchSave("t_crawler_subject", subjectList, batchSize);
Db.batchSave("t_crawler_scheme", schemeList, batchSize);
Db.batchSave("t_crawler_book", bookList, batchSize);
Db.batchSave("t_crawler_structure", structureList, batchSize);
7 months ago
Db.batchSave("t_crawler_lesson", lessonList, batchSize);
print("开始记录课程与学校的关系...");
//保存课程与学校的关联表
List<Record> writeList = new ArrayList<>();
for (Record record : lessonList) {
String original_school_name = record.getStr("teacher_school_name");//原始学校名称
6 months ago
if (original_school_name.equals("东北师范大学东安实验学校")) {
System.out.println("Here!");
}
7 months ago
String teacher_name = record.getStr("teacher_name");
7 months ago
String organization_name = "";
String organization_no = "";
String gather_regionc = "";
7 months ago
int match_type;
7 months ago
String lesson_id = record.getStr("lesson_id");
if (lessonAlreadyMatch.contains(lesson_id)) continue;//如果记录过此课程的学校关系,本次就不再记录了
6 months ago
7 months ago
if (bzSchoolNameList.containsKey(original_school_name)) {
7 months ago
//100%命中的名称
7 months ago
Kv kv = bzSchoolNameList.get(original_school_name);
7 months ago
organization_name = original_school_name;
7 months ago
organization_no = kv.getStr("organization_no");
gather_regionc = kv.getStr("gather_regionc");
match_type = 1;
6 months ago
} else if (handMatchSchoolList.containsKey(original_school_name)) {//手动映射过的名称
Kv kv = handMatchSchoolList.get(original_school_name);
7 months ago
organization_name = kv.getStr("organization_name");
organization_no = kv.getStr("organization_no");
gather_regionc = kv.getStr("gather_regionc");
match_type = 2;
} else {
match_type = 0;//待匹配
}
Record rWrite = new Record();
rWrite.set("lesson_id", lesson_id);
rWrite.set("original_school_name", original_school_name);
rWrite.set("organization_name", organization_name);
rWrite.set("organization_no", organization_no);
rWrite.set("gather_regionc", gather_regionc);
rWrite.set("match_type", match_type);
7 months ago
rWrite.set("teacher_name", teacher_name);
7 months ago
writeList.add(rWrite);
}
//对writeList根据lesson_id去重
writeList = writeList.stream().collect(
Collectors.collectingAndThen(Collectors.toCollection(() -> new TreeSet<>(
Comparator.comparing(r -> r.getStr("lesson_id"))
)
), ArrayList::new));
Db.batchSave("t_crawler_lesson_school", writeList, batchSize);
7 months ago
print("爬取数据完成!");
7 months ago
print("总共收集资源:" + lessonList.size() + "个");
print("总共收集节点:" + structureList.size() + "个");
//记录结束时间,并输出两者的差值是多少分钟多少少
long endTime = System.currentTimeMillis();
print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
7 months ago
}
7 months ago
7 months ago
}