package Tools.Crawler.Util;

import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jfinal.kit.Kv;
import com.jfinal.plugin.activerecord.*;
import com.jfinal.plugin.activerecord.Record;

import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import static Tools.Crawler.Util.Util.doRequestWithRetry;

/**
 * Crawls the subject / edition / book / chapter / lesson hierarchy from the
 * {@code yx.ccsjy.cn} API, stores it in the {@code t_crawler_*} tables and records which
 * school each lesson belongs to in {@code t_crawler_lesson_school}.
 *
 * <p>Entry points are {@link #Start()} (full crawl) and {@link #fixPatch()} (back-fill of
 * teacher / school names from {@code t_crawler_fixdata}).
 */
public class BookLesson {
    // Crawl results; written concurrently by worker threads, read by Start() when saving.
    private static final CopyOnWriteArrayList<Record> subjectList = new CopyOnWriteArrayList<>();
    private static final CopyOnWriteArrayList<Record> lessonList = new CopyOnWriteArrayList<>();
    private static final CopyOnWriteArrayList<Record> schemeList = new CopyOnWriteArrayList<>();
    private static final CopyOnWriteArrayList<Record> bookList = new CopyOnWriteArrayList<>();
    private static final CopyOnWriteArrayList<Record> structureList = new CopyOnWriteArrayList<>();

    // Number of lesson rows collected so far; used only for progress output.
    private static final AtomicInteger resourceCount = new AtomicInteger(0);

    // Size of each per-level worker pool.
    private static final int THREAD_COUNT = 4;
    // Retry count passed to doRequestWithRetry for most requests.
    private static final int MAX_RETRY_COUNT = 10;
    // Number of rows per JDBC batch.
    private static final int BATCH_SIZE = 300;
    // Format of the "publishTime" field in lesson rows; DateTimeFormatter is thread-safe.
    private static final DateTimeFormatter PUBLISH_TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    // Standard school name -> school attributes.
    public static Map<String, Kv> bzSchoolNameList = new HashMap<>();
    // Original (crawled) school name -> manually mapped school attributes.
    public static Map<String, Kv> handMatchSchoolList = new HashMap<>();
    // lesson_id values whose school relation has already been matched.
    public static Set<String> lessonAlreadyMatch = new HashSet<>();

    /**
     * A unit of work applied to one JSON element; may block on nested worker pools.
     */
    @FunctionalInterface
    private interface JsonTask {
        void run(JSONObject item) throws InterruptedException;
    }

    /**
     * Loads the three lookup caches from the database. Each cache is only loaded while it
     * is still empty, so repeated calls do not re-query populated caches.
     */
    public static void init() {
        if (bzSchoolNameList.isEmpty()) {
            bzSchoolNameList = getBzSchoolNameList();
        }
        if (handMatchSchoolList.isEmpty()) {
            handMatchSchoolList = getHandMatchSchoolList();
        }
        if (lessonAlreadyMatch.isEmpty()) {
            lessonAlreadyMatch = getLessonAlreadyMatch();
        }
    }

    /**
     * Flattens a chapter tree depth-first (parent before children) into {@code tempTree}.
     * Each appended element carries {@code nodeId}, {@code nodeName}, {@code isLeaf} and
     * {@code parentValue}, copied from the node's {@code key}, {@code title},
     * {@code isLeaf} and {@code parentValue} fields.
     *
     * @param treeArray nodes of the current level; {@code null} is treated as empty
     * @param tempTree  output list that receives the flattened nodes
     */
    public static void traverseTree(JSONArray treeArray, JSONArray tempTree) {
        if (treeArray == null) {
            return;
        }
        for (Object item : treeArray) {
            JSONObject node = (JSONObject) item;
            JSONObject flat = new JSONObject();
            flat.put("nodeId", node.getString("key"));
            flat.put("nodeName", node.getString("title"));
            flat.put("isLeaf", node.getBoolean("isLeaf"));
            flat.put("parentValue", node.getString("parentValue"));
            tempTree.add(flat);

            JSONArray children = node.getJSONArray("children");
            if (children != null && !children.isEmpty()) {
                traverseTree(children, tempTree);
            }
        }
    }

    /**
     * Prints a message to standard output, prefixed with the current time.
     *
     * @param msg text to print
     */
    public static void print(String msg) {
        System.out.println(DateTime.now() + " " + msg);
    }

    /**
     * Runs {@code task} once per element of {@code items} on a fresh fixed-size pool and
     * blocks until every task has finished.
     *
     * <p>The pool is always shut down, also when the wait is interrupted. Failures inside a
     * task are printed instead of being silently dropped by the executor, and do not stop
     * the sibling tasks.
     *
     * @param items elements to process; {@code null} or empty means nothing to do
     * @param task  work to perform for each element
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    private static void forEachConcurrently(JSONArray items, JsonTask task) throws InterruptedException {
        if (items == null || items.isEmpty()) {
            return;
        }
        ExecutorService pool = Executors.newFixedThreadPool(THREAD_COUNT);
        try {
            CountDownLatch latch = new CountDownLatch(items.size());
            for (int i = 0; i < items.size(); i++) {
                JSONObject item = items.getJSONObject(i);
                pool.submit(() -> {
                    try {
                        task.run(item);
                    } catch (InterruptedException e) {
                        // Keep the interrupt visible to the worker thread.
                        Thread.currentThread().interrupt();
                    } catch (RuntimeException e) {
                        e.printStackTrace();
                    } finally {
                        latch.countDown();
                    }
                });
            }
            latch.await();
        } finally {
            pool.shutdown();
        }
    }

    /**
     * Records one subject and crawls the editions below it.
     *
     * @param subject subject row returned by the subject list API
     * @param key     stage id
     * @param value   stage display name (used for progress output only)
     */
    private static void processSubject(JSONObject subject, String key, String value) throws InterruptedException {
        String subjectCode = subject.getString("subjectCode");
        String subjectName = subject.getString("subjectName");

        subjectList.add(new Record()
                .set("subject_id", subjectCode)
                .set("subject_name", subjectName)
                .set("stage_id", key));

        JSONObject argScheme = new JSONObject();
        argScheme.put("subjectCode", subjectCode);
        argScheme.put("systemId", 1);
        String respScheme = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/edition/list",
                argScheme.toString(), false, MAX_RETRY_COUNT);
        if (respScheme != null) {
            processSchemes(JSONObject.parseObject(respScheme).getJSONArray("data"), subjectCode, key, value);
        }
    }

    /**
     * Crawls every edition of a subject concurrently.
     *
     * @param schemes     edition rows; {@code null} or empty means nothing to do
     * @param subjectCode owning subject
     * @param key         stage id
     * @param value       stage display name
     */
    private static void processSchemes(JSONArray schemes, String subjectCode, String key, String value)
            throws InterruptedException {
        forEachConcurrently(schemes, scheme -> processScheme(scheme, subjectCode, key, value));
    }

    /**
     * Records one edition and crawls the books below it concurrently.
     *
     * @param scheme      edition row
     * @param subjectCode owning subject
     * @param key         stage id
     * @param value       stage display name
     */
    private static void processScheme(JSONObject scheme, String subjectCode, String key, String value)
            throws InterruptedException {
        String businessEditionId = scheme.getString("businessEditionId");
        String editionName = scheme.getString("editionName");

        schemeList.add(new Record()
                .set("scheme_id", businessEditionId)
                .set("scheme_name", editionName)
                .set("subject_id", subjectCode)
                .set("stage_id", key)
                .set("id", UUID.randomUUID().toString()));

        JSONObject argBook = new JSONObject();
        argBook.put("stageCode", key);
        argBook.put("subjectCode", subjectCode);
        argBook.put("businessEditionId", businessEditionId);
        // NOTE(review): this request retries 3 times while all others use MAX_RETRY_COUNT;
        // kept as found - confirm whether the lower limit is intentional.
        String respBook = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/book/list",
                argBook.toString(), false, 3);
        if (respBook != null) {
            JSONArray books = JSONObject.parseObject(respBook).getJSONArray("data");
            // Chapter structure of each book.
            forEachConcurrently(books,
                    book -> processBook(book, businessEditionId, subjectCode, key, value, editionName));
        }
    }

    /**
     * Records one book, fetches its chapter tree and crawls every node concurrently.
     *
     * @param book              book row
     * @param businessEditionId owning edition
     * @param subjectCode       owning subject
     * @param key               stage id
     * @param value             stage display name
     * @param editionName       edition display name
     */
    private static void processBook(JSONObject book, String businessEditionId, String subjectCode, String key,
                                    String value, String editionName) throws InterruptedException {
        String businessBookId = book.getString("businessBookId");
        String bookName = book.getString("bookName");

        bookList.add(new Record()
                .set("book_id", businessBookId)
                .set("book_name", bookName)
                .set("scheme_id", businessEditionId)
                .set("subject_id", subjectCode)
                .set("stage_id", key)
                .set("id", UUID.randomUUID().toString()));

        JSONObject argTree = new JSONObject();
        argTree.put("businessBookId", businessBookId);
        argTree.put("childrensFlag", 1);
        argTree.put("parentId", -1);
        argTree.put("searchKeyword", "");
        String respTree = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/chapter/tree",
                argTree.toString(), false, MAX_RETRY_COUNT);
        if (respTree == null) {
            return;
        }

        JSONObject data = JSONObject.parseObject(respTree).getJSONObject("data");
        JSONArray tree = data == null ? null : data.getJSONArray("tree");
        JSONArray flatNodes = new JSONArray();
        traverseTree(tree, flatNodes);

        forEachConcurrently(flatNodes, node -> processNode(node, businessBookId, businessEditionId,
                subjectCode, key, value, editionName, bookName));
    }

    /**
     * Records one chapter node and crawls the lessons attached to it.
     *
     * @param jsonTree          flattened node produced by {@link #traverseTree}
     * @param businessBookId    owning book
     * @param businessEditionId owning edition
     * @param subjectCode       owning subject
     * @param key               stage id
     * @param value             stage display name
     * @param editionName       edition display name
     * @param bookName          book display name
     */
    private static void processNode(JSONObject jsonTree, String businessBookId, String businessEditionId,
                                    String subjectCode, String key, String value, String editionName,
                                    String bookName) {
        String nodeId = jsonTree.getString("nodeId");
        String nodeName = jsonTree.getString("nodeName");
        Boolean isLeaf = jsonTree.getBoolean("isLeaf");
        String parentValue = jsonTree.getString("parentValue");

        structureList.add(new Record()
                .set("node_id", nodeId)
                .set("node_name", nodeName)
                .set("parent_id", parentValue)
                // A missing isLeaf flag is stored as 0 instead of failing on unboxing.
                .set("is_leaf", Boolean.TRUE.equals(isLeaf) ? 1 : 0)
                .set("book_id", businessBookId)
                .set("scheme_id", businessEditionId)
                .set("subject_id", subjectCode)
                .set("stage_id", key)
                .set("id", UUID.randomUUID().toString()));

        processResource(nodeId, businessBookId, subjectCode, key, value, editionName, bookName, nodeName);
    }

    /**
     * Fetches the on-demand lessons of one chapter node and appends them to the lesson list.
     *
     * <p>NOTE(review): only page 1 with a page size of 100 is requested, so a node with
     * more lessons than that is not fully collected - confirm whether paging is needed.
     *
     * @param nodeId         chapter node
     * @param businessBookId owning book
     * @param subjectCode    owning subject
     * @param key            stage id
     * @param value          stage display name
     * @param editionName    edition display name
     * @param bookName       book display name
     * @param nodeName       node display name
     */
    private static void processResource(String nodeId, String businessBookId, String subjectCode, String key,
                                        String value, String editionName, String bookName, String nodeName) {
        JSONObject argSource = new JSONObject();
        argSource.put("pageNum", 1);
        argSource.put("pageSize", 100);
        argSource.put("businessBookId", businessBookId);
        argSource.put("nodeId", nodeId);
        argSource.put("stageCode", key);
        argSource.put("subjectCode", subjectCode);
        argSource.put("excellentFlag", "");
        argSource.put("nodeType", 1); // textbook structure; 2 = knowledge-point structure
        argSource.put("sortType", 2);
        argSource.put("source", "");
        argSource.put("searchKeyword", "");
        String respSource = doRequestWithRetry(
                "https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage",
                argSource.toString(), false, MAX_RETRY_COUNT);
        if (respSource == null) {
            return;
        }

        JSONObject data = JSONObject.parseObject(respSource).getJSONObject("data");
        JSONArray rows = data == null ? null : data.getJSONArray("rows");
        print("正在爬取:【" + value + "," + editionName + "," + bookName + "," + nodeName + "】下的资源!");
        if (rows == null) {
            return;
        }

        for (int m = 0; m < rows.size(); m++) {
            JSONObject jsonSource = rows.getJSONObject(m);

            // Only the date part of the publish time is stored. A missing or malformed value
            // still throws here (as before); the failure is reported by the worker wrapper.
            LocalDate publishDate = LocalDateTime
                    .parse(jsonSource.getString("publishTime"), PUBLISH_TIME_FORMAT)
                    .toLocalDate();

            Record record = new Record()
                    .set("lesson_id", jsonSource.getString("lessonId"))
                    .set("lesson_name", jsonSource.getString("lessonName"))
                    .set("node_id", nodeId)
                    .set("teacher_id", jsonSource.getString("teacherId"))
                    .set("teacher_school_id", jsonSource.getString("teacherSchoolId"))
                    .set("teacher_school_name", jsonSource.getString("teacherSchoolName"))
                    .set("teacher_name", jsonSource.getString("teacherName"))
                    .set("book_id", businessBookId)
                    // NOTE(review): scheme_id receives the edition *name* here, whereas the
                    // scheme/book/structure rows store the edition id. Kept as found because
                    // existing data and queries may rely on it - confirm.
                    .set("scheme_id", editionName)
                    .set("subject_id", subjectCode)
                    .set("grade_code", jsonSource.getString("gradeCode"))
                    .set("publish_time", publishDate)
                    .set("stage_id", key)
                    // Usage counters.
                    .set("preview_count", jsonSource.getIntValue("previewCount"))
                    .set("learning_person_count", jsonSource.getIntValue("learningPersonCount"))
                    .set("learning_person_times", jsonSource.getIntValue("learningPersonTimes"))
                    .set("id", UUID.randomUUID().toString());
            lessonList.add(record);

            int count = resourceCount.incrementAndGet();
            if (count % 100 == 0) {
                print("已收集资源数量:" + count + "个。");
            }
        }
    }

    /**
     * Loads the standard school names via the {@code YunXiao.getBzSchoolNameList} SQL
     * template.
     *
     * @return map from {@code organization_name} to a {@link Kv} holding
     *         {@code organization_no}, {@code gather_regionc} and {@code school_running_type}
     */
    public static Map<String, Kv> getBzSchoolNameList() {
        SqlPara sqlPara = Db.getSqlPara("YunXiao.getBzSchoolNameList");
        List<Record> list = Db.find(sqlPara);
        Map<String, Kv> map = new HashMap<>();
        for (Record record : list) {
            Kv kv = Kv.create();
            kv.set("organization_no", record.getStr("organization_no"));
            kv.set("gather_regionc", record.getStr("gather_regionc"));
            kv.set("school_running_type", record.getStr("school_running_type"));
            map.put(record.getStr("organization_name"), kv);
        }
        return map;
    }

    /**
     * Loads the manually mapped school names ({@code match_type=2}) from
     * {@code t_crawler_lesson_school}.
     *
     * @return map from {@code original_school_name} to a {@link Kv} holding
     *         {@code organization_name}, {@code organization_no}, {@code gather_regionc}
     *         and {@code school_running_type}
     */
    public static Map<String, Kv> getHandMatchSchoolList() {
        String sql = "select * from t_crawler_lesson_school where match_type=2";
        List<Record> list = Db.find(sql);
        Map<String, Kv> map = new HashMap<>();
        for (Record record : list) {
            Kv kv = Kv.create();
            kv.set("organization_name", record.getStr("organization_name"));
            kv.set("organization_no", record.getStr("organization_no"));
            kv.set("gather_regionc", record.getStr("gather_regionc"));
            // Start() and fixPatch() read this key from hand-matched entries; without it
            // they always wrote null for manually mapped schools.
            kv.set("school_running_type", record.getStr("school_running_type"));
            map.put(record.getStr("original_school_name"), kv);
        }
        return map;
    }

    /**
     * Loads the ids of lessons whose school relation is already matched
     * ({@code match_type>0}).
     *
     * @return set of {@code lesson_id} values
     */
    public static Set<String> getLessonAlreadyMatch() {
        String sql = "select * from t_crawler_lesson_school where match_type>0";
        List<Record> list = Db.find(sql);
        Set<String> set = new HashSet<>();
        for (Record record : list) {
            set.add(record.getStr("lesson_id"));
        }
        return set;
    }

    /**
     * Runs a full crawl: collects all stages concurrently, replaces the contents of the
     * {@code t_crawler_*} tables with the result and records lesson/school relations for
     * lessons that are not matched yet.
     *
     * <p>The database is left untouched when the crawl is interrupted or returns no
     * subjects at all, so a failed crawl does not wipe previously stored data.
     */
    public static void Start() {
        Map<String, String> stages = new HashMap<>();
        stages.put("1", "学前");
        stages.put("2", "小学");
        stages.put("3", "初中");
        stages.put("4", "高中");

        init();

        // Results are kept in static lists; reset them so a repeated call in the same JVM
        // does not append to (and re-save) the previous run's rows.
        subjectList.clear();
        schemeList.clear();
        bookList.clear();
        structureList.clear();
        lessonList.clear();
        resourceCount.set(0);

        print("开始爬取数据!");
        long startTime = System.currentTimeMillis();

        if (!crawlAllStages(stages)) {
            print("爬取被中断,未写入数据库!");
            return;
        }
        if (subjectList.isEmpty()) {
            print("未爬取到任何数据,跳过入库,保留原有数据!");
            return;
        }

        saveCrawledData();

        print("开始记录课程与学校的关系...");
        Db.batchSave("t_crawler_lesson_school", buildLessonSchoolRelations(), BATCH_SIZE);

        print("爬取数据完成!");
        print("总共收集资源:" + lessonList.size() + "个");
        print("总共收集节点:" + structureList.size() + "个");
        long endTime = System.currentTimeMillis();
        print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
    }

    /**
     * Crawls every stage on its own thread and waits for all of them.
     *
     * @param stages stage id to stage display name
     * @return {@code true} when all stage tasks finished, {@code false} when the wait was
     *         interrupted
     */
    private static boolean crawlAllStages(Map<String, String> stages) {
        ExecutorService stagePool = Executors.newFixedThreadPool(stages.size());
        CountDownLatch stageLatch = new CountDownLatch(stages.size());
        try {
            stages.forEach((key, value) -> {
                stagePool.submit(() -> {
                    try {
                        crawlStage(key, value);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    } catch (Exception e) {
                        e.printStackTrace();
                    } finally {
                        stageLatch.countDown();
                    }
                });
            });
            stageLatch.await();
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            stagePool.shutdownNow();
            return false;
        } finally {
            stagePool.shutdown();
        }
    }

    /**
     * Fetches the subject list of one stage and crawls each subject concurrently.
     *
     * @param key   stage id, appended to the subject list URL
     * @param value stage display name
     */
    private static void crawlStage(String key, String value) throws InterruptedException {
        String respSubject = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/subject/list/" + key,
                null, true, MAX_RETRY_COUNT);
        if (respSubject == null) {
            return;
        }
        JSONObject data = JSONObject.parseObject(respSubject).getJSONObject("data");
        JSONArray subjects = data == null ? null : data.getJSONArray("rows");
        forEachConcurrently(subjects, subject -> processSubject(subject, key, value));
    }

    /**
     * Empties the crawler tables, removes unmatched lesson/school rows and stores the
     * freshly crawled rows.
     */
    private static void saveCrawledData() {
        String[] tables = {"t_crawler_subject", "t_crawler_scheme", "t_crawler_book",
                "t_crawler_structure", "t_crawler_lesson"};
        for (String table : tables) {
            Db.update("truncate table " + table);
        }
        Db.update("delete from t_crawler_lesson_school where match_type=0;");

        print("开始保存数据...");
        Db.batchSave("t_crawler_subject", subjectList, BATCH_SIZE);
        Db.batchSave("t_crawler_scheme", schemeList, BATCH_SIZE);
        Db.batchSave("t_crawler_book", bookList, BATCH_SIZE);
        Db.batchSave("t_crawler_structure", structureList, BATCH_SIZE);
        Db.batchSave("t_crawler_lesson", lessonList, BATCH_SIZE);
    }

    /**
     * Builds one lesson/school relation row per crawled lesson that is not matched yet.
     * {@code match_type} is 1 for an exact standard-name hit, 2 for a manual mapping and 0
     * when the school still has to be matched.
     *
     * @return relation rows, unique by {@code lesson_id} (first occurrence wins) and ordered
     *         by {@code lesson_id}
     */
    private static List<Record> buildLessonSchoolRelations() {
        // Reload instead of trusting the cache filled by init(): rows saved by an earlier
        // run in the same JVM would otherwise be inserted a second time.
        lessonAlreadyMatch = getLessonAlreadyMatch();

        List<Record> writeList = new ArrayList<>();
        for (Record record : lessonList) {
            String lessonId = record.getStr("lesson_id");
            if (lessonAlreadyMatch.contains(lessonId)) {
                continue; // relation already recorded earlier
            }

            String originalSchoolName = record.getStr("teacher_school_name");
            String organizationName = "";
            String organizationNo = "";
            String gatherRegionc = "";
            String schoolRunningType = "";
            int matchType;

            if (bzSchoolNameList.containsKey(originalSchoolName)) {
                // Exact hit on a standard school name.
                Kv kv = bzSchoolNameList.get(originalSchoolName);
                organizationName = originalSchoolName;
                organizationNo = kv.getStr("organization_no");
                gatherRegionc = kv.getStr("gather_regionc");
                schoolRunningType = kv.getStr("school_running_type");
                matchType = 1;
            } else if (handMatchSchoolList.containsKey(originalSchoolName)) {
                // Name that was mapped by hand before.
                Kv kv = handMatchSchoolList.get(originalSchoolName);
                organizationName = kv.getStr("organization_name");
                organizationNo = kv.getStr("organization_no");
                gatherRegionc = kv.getStr("gather_regionc");
                schoolRunningType = kv.getStr("school_running_type");
                matchType = 2;
            } else {
                matchType = 0; // still to be matched
            }

            writeList.add(new Record()
                    .set("lesson_id", lessonId)
                    .set("original_school_name", originalSchoolName)
                    .set("organization_name", organizationName)
                    .set("organization_no", organizationNo)
                    .set("gather_regionc", gatherRegionc)
                    .set("match_type", matchType)
                    .set("teacher_name", record.getStr("teacher_name"))
                    .set("school_running_type", schoolRunningType));
        }

        // De-duplicate by lesson_id; a null id is tolerated rather than failing the sort.
        Comparator<Record> byLessonId = Comparator.comparing(
                (Record r) -> r.getStr("lesson_id"),
                Comparator.nullsFirst(Comparator.<String>naturalOrder()));
        Set<Record> unique = new TreeSet<>(byLessonId);
        unique.addAll(writeList);
        return new ArrayList<>(unique);
    }

    /**
     * Applies the corrections stored in {@code t_crawler_fixdata}: first to lesson/school
     * relations that have no organization name, then to lessons whose teacher name is
     * {@code '--'}.
     */
    public static void fixPatch() {
        init();
        print("开始打补丁...");

        // Corrections indexed by lesson_id.
        List<Record> fixList = Db.find("select * from t_crawler_fixdata where teacher_name<>'--'");
        Map<String, Record> fixMap = new HashMap<>();
        for (Record record : fixList) {
            fixMap.put(record.getStr("lesson_id"), record);
        }

        // Part 1: lesson/school relations without an organization name.
        List<Record> toFixList = Db.find(
                "select * from t_crawler_lesson_school where (organization_name='' or organization_name is null)");
        List<Record> writeList = new ArrayList<>();
        for (Record toFixRecord : toFixList) {
            Record fixRecord = fixMap.get(toFixRecord.getStr("lesson_id"));
            if (fixRecord == null) {
                continue;
            }
            String teacherSchoolName = fixRecord.getStr("teacher_school_name");
            toFixRecord.set("original_school_name", teacherSchoolName);

            if (bzSchoolNameList.containsKey(teacherSchoolName)) {
                Kv kv = bzSchoolNameList.get(teacherSchoolName);
                toFixRecord.set("organization_name", teacherSchoolName);
                toFixRecord.set("organization_no", kv.getStr("organization_no"));
                toFixRecord.set("gather_regionc", kv.getStr("gather_regionc"));
                toFixRecord.set("school_running_type", kv.getStr("school_running_type"));
                toFixRecord.set("match_type", 1);
            } else if (handMatchSchoolList.containsKey(teacherSchoolName)) {
                Kv kv = handMatchSchoolList.get(teacherSchoolName);
                toFixRecord.set("organization_name", kv.getStr("organization_name"));
                toFixRecord.set("organization_no", kv.getStr("organization_no"));
                toFixRecord.set("gather_regionc", kv.getStr("gather_regionc"));
                toFixRecord.set("school_running_type", kv.getStr("school_running_type"));
                toFixRecord.set("match_type", 2);
            } else {
                toFixRecord.set("match_type", 0);
            }
            toFixRecord.set("teacher_name", fixRecord.getStr("teacher_name"));
            writeList.add(toFixRecord);
        }
        Db.batchUpdate("t_crawler_lesson_school", "lesson_id", writeList, BATCH_SIZE);

        // Part 2: lessons whose teacher name is still the '--' placeholder.
        List<Record> lessonsToFix = Db.find("select * from t_crawler_lesson where teacher_name='--'");
        writeList = new ArrayList<>();
        for (Record record : lessonsToFix) {
            Record fixRecord = fixMap.get(record.getStr("lesson_id"));
            if (fixRecord == null) {
                continue;
            }
            record.set("teacher_school_name", fixRecord.getStr("teacher_school_name"));
            record.set("teacher_name", fixRecord.getStr("teacher_name"));
            writeList.add(record);
        }
        Db.batchUpdate("t_crawler_lesson", "lesson_id", writeList, BATCH_SIZE);
        print("打补丁完成!");
    }
}