main
黄海 7 months ago
parent 13965fcb6d
commit 0d23b4ebce

@ -0,0 +1,238 @@
package Tools.Crawler.Backup;
import Tools.Crawler.Util;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jfinal.kit.PropKit;
import com.jfinal.plugin.activerecord.ActiveRecordPlugin;
import com.jfinal.plugin.activerecord.CaseInsensitiveContainerFactory;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.dialect.PostgreSqlDialect;
import com.jfinal.plugin.hikaricp.HikariCpPlugin;
import com.jfinal.plugin.activerecord.Record;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.List;
/**
 * Sequential ("one by one") crawler for the yx.ccsjy.cn catalogue.
 *
 * <p>For every school stage it walks subjects, editions (schemes), books, chapter nodes and
 * lessons with blocking HTTP calls, collects one {@link Record} per item, and finally replaces
 * the contents of the {@code t_crawler_*} tables with the collected rows.
 *
 * <p>A full crawl takes roughly 20 minutes (original author's note).
 */
public class YunXiaoOneByOne {
    /** Common prefix of every crawled endpoint. */
    private static final String API_ROOT = "https://yx.ccsjy.cn/api";

    /** Number of rows sent per JDBC batch when saving. */
    private static final int SAVE_BATCH_SIZE = 300;

    /**
     * Flattened chapter tree that {@link #traverseTree(JSONArray)} appends to. During a crawl it
     * holds the nodes of the book processed most recently.
     */
    public static JSONArray tempTree = new JSONArray();

    /** Rows collected during one crawl, grouped by destination table. */
    private static final class CrawlData {
        final List<Record> subjects = new ArrayList<>();
        final List<Record> schemes = new ArrayList<>();
        final List<Record> books = new ArrayList<>();
        final List<Record> structures = new ArrayList<>();
        final List<Record> lessons = new ArrayList<>();
    }

    /**
     * Flattens a nested chapter tree into {@link #tempTree}, parents before their children.
     *
     * @param treeArray nodes of one tree level; {@code null} is treated as empty
     */
    public static void traverseTree(JSONArray treeArray) {
        collectNodes(treeArray, tempTree);
    }

    /**
     * Depth-first flattening of {@code treeArray} into {@code out}. Each emitted object carries
     * {@code nodeId}, {@code nodeName}, {@code isLeaf} and {@code parentValue}.
     */
    private static void collectNodes(JSONArray treeArray, JSONArray out) {
        if (treeArray == null) {
            return;
        }
        for (Object item : treeArray) {
            JSONObject node = (JSONObject) item;
            JSONObject flat = new JSONObject();
            flat.put("nodeId", node.getString("key"));
            flat.put("nodeName", node.getString("title"));
            flat.put("isLeaf", node.getBoolean("isLeaf"));
            flat.put("parentValue", node.getString("parentValue"));
            out.add(flat);
            // "children" may be absent, explicitly null, or empty; only recurse when it has entries.
            JSONArray children = node.getJSONArray("children");
            if (children != null && !children.isEmpty()) {
                collectNodes(children, out);
            }
        }
    }

    /**
     * Prints a message to standard output, prefixed with the current timestamp.
     *
     * @param msg text to print
     */
    public static void print(String msg) {
        System.out.println(DateTime.now() + " " + msg);
    }

    /**
     * Entry point: starts the database plugins, crawls all stages, then replaces the crawler
     * tables with the freshly collected rows.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PropKit.use("application.properties");
        HikariCpPlugin hp = new HikariCpPlugin(PropKit.get("jdbcUrl"), PropKit.get("user"),
                PropKit.get("password").trim(), PropKit.get("driverClassName"));
        hp.start();
        // Configure the ActiveRecord plugin with lower-case, case-insensitive column names.
        ActiveRecordPlugin arp = new ActiveRecordPlugin(hp);
        arp.setContainerFactory(new CaseInsensitiveContainerFactory(true));
        arp.setDialect(new PostgreSqlDialect());
        arp.start();
        try {
            Map<String, String> stages = new HashMap<>();
            stages.put("1", "学前");
            stages.put("2", "小学");
            stages.put("3", "初中");
            stages.put("4", "高中");

            print("开始爬取数据!");
            CrawlData data = new CrawlData();
            stages.forEach((stageId, stageName) -> crawlStage(stageId, stageName, data));

            print("开始保存数据...");
            saveAll(data);
            print("爬取数据完成!");
        } finally {
            // Release the ActiveRecord configuration and the connection pool.
            arp.stop();
            hp.stop();
        }
    }

    /**
     * Replaces the contents of all crawler tables inside one transaction. The tables are only
     * truncated after the crawl has succeeded, so a failed crawl or a failed save leaves the
     * previous data in place.
     */
    private static void saveAll(CrawlData data) {
        Db.tx(() -> {
            replaceTable("t_crawler_subject", data.subjects);
            replaceTable("t_crawler_scheme", data.schemes);
            replaceTable("t_crawler_book", data.books);
            replaceTable("t_crawler_structure", data.structures);
            replaceTable("t_crawler_resource", data.lessons);
            return true;
        });
    }

    /** Empties {@code table} and batch-inserts {@code rows} into it. */
    private static void replaceTable(String table, List<Record> rows) {
        Db.update("truncate table " + table);
        Db.batchSave(table, rows, SAVE_BATCH_SIZE);
    }

    /**
     * Parses a response and returns the array found at {@code data} (when {@code nestedKey} is
     * null) or at {@code data.<nestedKey>}. Returns an empty array when any level is missing so
     * that callers can iterate without null checks.
     */
    private static JSONArray dataArray(String response, String nestedKey) {
        JSONObject root = JSONObject.parseObject(response);
        if (root == null) {
            return new JSONArray();
        }
        JSONArray result;
        if (nestedKey == null) {
            result = root.getJSONArray("data");
        } else {
            JSONObject payload = root.getJSONObject("data");
            result = payload == null ? null : payload.getJSONArray(nestedKey);
        }
        return result == null ? new JSONArray() : result;
    }

    /** Crawls every subject of one stage. */
    private static void crawlStage(String stageId, String stageName, CrawlData data) {
        String resp = Util.doGet(API_ROOT + "/business/v1/subject/list/" + stageId);
        JSONArray subjects = dataArray(resp, "rows");
        for (int i = 0; i < subjects.size(); i++) {
            JSONObject subject = subjects.getJSONObject(i);
            String subjectName = subject.getString("subjectName");
            Record rSubject = new Record()
                    .set("subject_id", subject.getString("subjectCode"))
                    .set("subject_name", subjectName)
                    .set("stage_id", stageId);
            data.subjects.add(rSubject);
            crawlSchemes(rSubject, stageName + subjectName, data);
        }
    }

    /** Crawls every edition (scheme) of one subject; {@code label} is the progress-log prefix. */
    private static void crawlSchemes(Record subject, String label, CrawlData data) {
        JSONObject arg = new JSONObject();
        arg.put("subjectCode", subject.getStr("subject_id"));
        arg.put("systemId", 1);
        String resp = Util.doPost(API_ROOT + "/business/v1/edition/list", arg.toString());
        JSONArray schemes = dataArray(resp, null);
        for (int j = 0; j < schemes.size(); j++) {
            JSONObject scheme = schemes.getJSONObject(j);
            String editionName = scheme.getString("editionName");
            Record rScheme = new Record()
                    .set("scheme_id", scheme.getString("businessEditionId"))
                    .set("scheme_name", editionName)
                    .set("subject_id", subject.getStr("subject_id"))
                    .set("stage_id", subject.getStr("stage_id"))
                    .set("id", UUID.randomUUID().toString());
            data.schemes.add(rScheme);
            crawlBooks(rScheme, label + editionName, data);
        }
    }

    /** Crawls every book of one edition. */
    private static void crawlBooks(Record scheme, String label, CrawlData data) {
        JSONObject arg = new JSONObject();
        arg.put("stageCode", scheme.getStr("stage_id"));
        arg.put("subjectCode", scheme.getStr("subject_id"));
        arg.put("businessEditionId", scheme.getStr("scheme_id"));
        String resp = Util.doPost(API_ROOT + "/business/v1/book/list", arg.toString());
        JSONArray books = dataArray(resp, null);
        for (int k = 0; k < books.size(); k++) {
            JSONObject book = books.getJSONObject(k);
            String bookName = book.getString("bookName");
            Record rBook = new Record()
                    .set("book_id", book.getString("businessBookId"))
                    .set("book_name", bookName)
                    .set("scheme_id", scheme.getStr("scheme_id"))
                    .set("subject_id", scheme.getStr("subject_id"))
                    .set("stage_id", scheme.getStr("stage_id"))
                    .set("id", UUID.randomUUID().toString());
            data.books.add(rBook);
            crawlChapters(rBook, label + bookName, data);
        }
    }

    /** Fetches the chapter tree of one book, flattens it and crawls the lessons of every node. */
    private static void crawlChapters(Record book, String label, CrawlData data) {
        JSONObject arg = new JSONObject();
        arg.put("businessBookId", book.getStr("book_id"));
        arg.put("childrensFlag", 1);
        arg.put("parentId", -1);
        arg.put("searchKeyword", "");
        String resp = Util.doPost(API_ROOT + "/business/v1/chapter/tree", arg.toString());

        JSONArray nodes = new JSONArray();
        collectNodes(dataArray(resp, "tree"), nodes);
        // Keep the public field in sync for any code that still reads it.
        tempTree = nodes;

        for (int n = 0; n < nodes.size(); n++) {
            JSONObject node = nodes.getJSONObject(n);
            String nodeName = node.getString("nodeName");
            // A missing isLeaf flag is stored as "not a leaf" instead of failing on unboxing.
            int isLeaf = Boolean.TRUE.equals(node.getBoolean("isLeaf")) ? 1 : 0;
            Record rStructure = new Record()
                    .set("node_id", node.getString("nodeId"))
                    .set("node_name", nodeName)
                    .set("parent_id", node.getString("parentValue"))
                    .set("is_leaf", isLeaf)
                    .set("book_id", book.getStr("book_id"))
                    .set("scheme_id", book.getStr("scheme_id"))
                    .set("subject_id", book.getStr("subject_id"))
                    .set("stage_id", book.getStr("stage_id"))
                    .set("id", UUID.randomUUID().toString());
            data.structures.add(rStructure);
            crawlLessons(rStructure, label + nodeName, data);
        }
    }

    /** Fetches the lessons attached to one chapter node. */
    private static void crawlLessons(Record node, String label, CrawlData data) {
        JSONObject arg = new JSONObject();
        // NOTE(review): only page 1 with up to 100 rows is requested; if the endpoint paginates,
        // nodes with more lessons are truncated - confirm against the API.
        arg.put("pageNum", 1);
        arg.put("pageSize", 100);
        arg.put("businessBookId", node.getStr("book_id"));
        arg.put("nodeId", node.getStr("node_id"));
        arg.put("stageCode", node.getStr("stage_id"));
        arg.put("subjectCode", node.getStr("subject_id"));
        arg.put("excellentFlag", "");
        arg.put("nodeType", 1);
        arg.put("sortType", 2);
        arg.put("source", "");
        arg.put("searchKeyword", "");
        String resp = Util.doPost(API_ROOT + "/cloud-school/v1/cloudLesson/getOnDemandLessonPage", arg.toString());
        JSONArray lessons = dataArray(resp, "rows");

        print("正在爬取:【" + label + "】下的资源!");
        for (int m = 0; m < lessons.size(); m++) {
            JSONObject lesson = lessons.getJSONObject(m);
            Record record = new Record()
                    .set("lesson_id", lesson.getString("lessonId"))
                    .set("lesson_name", lesson.getString("lessonName"))
                    .set("node_id", node.getStr("node_id"))
                    .set("teacher_id", lesson.getString("teacherId"))
                    .set("teacher_school_id", lesson.getString("teacherSchoolId"))
                    .set("teacher_school_name", lesson.getString("teacherSchoolName"))
                    .set("teacher_name", lesson.getString("teacherName"))
                    .set("book_id", node.getStr("book_id"))
                    .set("scheme_id", node.getStr("scheme_id"))
                    .set("subject_id", node.getStr("subject_id"))
                    .set("stage_id", node.getStr("stage_id"))
                    .set("id", UUID.randomUUID().toString());
            data.lessons.add(record);
        }
        print("已收集资源数量:" + data.lessons.size() + "个。");
    }
}

@ -3,25 +3,41 @@ package Tools.Crawler;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.dsideal.QingLong.Start;
import com.jfinal.kit.PropKit;
import com.jfinal.plugin.activerecord.ActiveRecordPlugin;
import com.jfinal.plugin.activerecord.CaseInsensitiveContainerFactory;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.Record;
import com.jfinal.plugin.activerecord.dialect.PostgreSqlDialect;
import com.jfinal.plugin.hikaricp.HikariCpPlugin;
import com.jfinal.plugin.activerecord.Record;
import java.util.ArrayList;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.List;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
// 爬取一次大约需要20分钟
public class YunXiao {
public static JSONArray tempTree = new JSONArray();
// 使用线程安全的集合
private static final CopyOnWriteArrayList<Record> subjectList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> lessonList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> schemeList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> bookList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> structureList = new CopyOnWriteArrayList<>();
// 用于记录进度
private static final AtomicInteger resourceCount = new AtomicInteger(0);
public static void traverseTree(JSONArray treeArray) {
//线程池大小
private static final int ThreadCount = 4;
//最大重试次数
private static final int MaxRetryCount = 3;
//每次写入的数量
private static final int batchSize = 300;
public static void traverseTree(JSONArray treeArray, JSONArray tempTree) {
for (Object item : treeArray) {
JSONObject node = (JSONObject) item;
JSONObject jsonObj = new JSONObject();
@ -31,43 +47,333 @@ public class YunXiao {
jsonObj.put("parentValue", node.getString("parentValue"));
tempTree.add(jsonObj);
// 判断是否有children并且不为空
if (node.containsKey("children") && !node.getJSONArray("children").isEmpty()) {
JSONArray children = node.getJSONArray("children");
traverseTree(children);
traverseTree(node.getJSONArray("children"), tempTree);
}
}
}
/**
 * Writes a progress line to standard output: the current timestamp, one space, then the message.
 *
 * @param msg text to print after the timestamp
 */
public static void print(String msg) {
    // Timestamp first, then the content.
    StringBuilder line = new StringBuilder();
    line.append(DateTime.now()).append(" ").append(msg);
    System.out.println(line.toString());
}
/**
 * Performs a GET or POST through {@code Util} and retries when the call throws.
 *
 * <p>Between failed attempts the thread sleeps for one second. When the last allowed attempt
 * fails, the failure is logged and rethrown wrapped in a {@link RuntimeException}.
 *
 * @param url        endpoint to call
 * @param jsonBody   request body for POST; ignored for GET
 * @param isGet      {@code true} for GET, {@code false} for POST
 * @param maxRetries maximum number of attempts
 * @return the response text, or {@code null} when {@code maxRetries} is not positive or the
 *         thread is interrupted while waiting to retry
 */
private static String doRequestWithRetry(String url, String jsonBody, boolean isGet, int maxRetries) {
    for (int attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            return isGet ? Util.doGet(url) : Util.doPost(url, jsonBody);
        } catch (Exception e) {
            if (attempt == maxRetries) {
                print("请求失败: " + url + " 重试次数: " + attempt);
                throw new RuntimeException("Failed after " + maxRetries + " retries", e);
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException ie) {
                // Preserve the interrupt for the caller and give up on further attempts.
                Thread.currentThread().interrupt();
                return null;
            }
        }
    }
    return null;
}
/**
 * Records one subject and then processes the editions (schemes) listed for it.
 *
 * @param subject subject object; {@code subjectCode} and {@code subjectName} are read from it
 * @param key     stage id the subject belongs to
 * @param value   stage display name, passed down for progress messages
 * @throws InterruptedException if interrupted while waiting for the edition tasks
 */
private static void processSubject(JSONObject subject, String key, String value) throws InterruptedException {
    String subjectCode = subject.getString("subjectCode");
    String subjectName = subject.getString("subjectName");
    Record rSubject = new Record().set("subject_id", subjectCode).set("subject_name", subjectName).set("stage_id", key);
    subjectList.add(rSubject);

    JSONObject argScheme = new JSONObject();
    argScheme.put("subjectCode", subjectCode);
    argScheme.put("systemId", 1);
    String respScheme = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/edition/list",
            argScheme.toString(), false, MaxRetryCount);
    if (respScheme == null) {
        return;
    }

    // A response without a "data" array would otherwise surface as a NullPointerException
    // further down; report it and skip the subject's editions instead.
    JSONObject parsed = JSONObject.parseObject(respScheme);
    JSONArray schemes = parsed == null ? null : parsed.getJSONArray("data");
    if (schemes == null) {
        print("科目【" + subjectName + "】未返回版本数据,已跳过。");
        return;
    }
    // After the subject itself, process the editions below it.
    processSchemes(schemes, subjectCode, key, value);
}
/**
 * Processes all editions (schemes) of one subject on a dedicated fixed-size thread pool and
 * blocks until every edition has been handled.
 *
 * @param schemes     edition objects; {@code null} or empty means nothing to do
 * @param subjectCode id of the owning subject
 * @param key         stage id
 * @param value       stage display name, passed down for progress messages
 * @throws InterruptedException if interrupted while waiting; pending tasks are cancelled
 */
private static void processSchemes(JSONArray schemes, String subjectCode, String key, String value) throws InterruptedException {
    if (schemes == null || schemes.isEmpty()) {
        return;
    }
    ExecutorService schemeExecutor = Executors.newFixedThreadPool(ThreadCount);
    CountDownLatch schemeLatch = new CountDownLatch(schemes.size());
    boolean finished = false;
    try {
        for (int j = 0; j < schemes.size(); j++) {
            JSONObject scheme = schemes.getJSONObject(j);
            schemeExecutor.submit(() -> {
                try {
                    processScheme(scheme, subjectCode, key, value);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the pool instead of hiding it in a wrapper.
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // The Future is discarded, so an uncaught failure would vanish silently.
                    print("处理版本失败: " + scheme.getString("editionName") + " - " + e);
                } finally {
                    schemeLatch.countDown();
                }
            });
        }
        schemeLatch.await();
        finished = true;
    } finally {
        // Always release the worker threads; cancel outstanding work if we did not finish.
        if (finished) {
            schemeExecutor.shutdown();
        } else {
            schemeExecutor.shutdownNow();
        }
    }
}
/**
 * Records one edition (scheme) and processes all of its books on a dedicated thread pool,
 * blocking until every book has been handled.
 *
 * @param scheme      edition object; {@code businessEditionId} and {@code editionName} are read
 * @param subjectCode id of the owning subject
 * @param key         stage id
 * @param value       stage display name, passed down for progress messages
 * @throws InterruptedException if interrupted while waiting; pending tasks are cancelled
 */
private static void processScheme(JSONObject scheme, String subjectCode, String key, String value) throws InterruptedException {
    String businessEditionId = scheme.getString("businessEditionId");
    String editionName = scheme.getString("editionName");
    Record rScheme = new Record().set("scheme_id", businessEditionId).set("scheme_name", editionName).set("subject_id", subjectCode)
            .set("stage_id", key).set("id", UUID.randomUUID().toString());
    schemeList.add(rScheme);

    JSONObject argBook = new JSONObject();
    argBook.put("stageCode", key);
    argBook.put("subjectCode", subjectCode);
    argBook.put("businessEditionId", businessEditionId);
    // Use the shared retry limit like every other request in this class.
    String respBook = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/book/list",
            argBook.toString(), false, MaxRetryCount);
    if (respBook == null) {
        return;
    }
    JSONObject parsed = JSONObject.parseObject(respBook);
    JSONArray books = parsed == null ? null : parsed.getJSONArray("data");
    if (books == null || books.isEmpty()) {
        return;
    }

    // One pool per edition, one task per book.
    ExecutorService bookExecutor = Executors.newFixedThreadPool(ThreadCount);
    CountDownLatch bookLatch = new CountDownLatch(books.size());
    boolean finished = false;
    try {
        for (int k = 0; k < books.size(); k++) {
            JSONObject book = books.getJSONObject(k);
            bookExecutor.submit(() -> {
                try {
                    processBook(book, businessEditionId, subjectCode, key, value, editionName);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the pool instead of hiding it in a wrapper.
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // The Future is discarded, so an uncaught failure would vanish silently.
                    print("处理教材失败: " + book.getString("bookName") + " - " + e);
                } finally {
                    bookLatch.countDown();
                }
            });
        }
        bookLatch.await();
        finished = true;
    } finally {
        // Always release the worker threads; cancel outstanding work if we did not finish.
        if (finished) {
            bookExecutor.shutdown();
        } else {
            bookExecutor.shutdownNow();
        }
    }
}
/**
 * Records one book, fetches and flattens its chapter tree, and processes every chapter node on
 * a dedicated thread pool, blocking until all nodes have been handled.
 *
 * @param book              book object; {@code businessBookId} and {@code bookName} are read
 * @param businessEditionId id of the owning edition
 * @param subjectCode       id of the owning subject
 * @param key               stage id
 * @param value             stage display name, used in progress messages
 * @param editionName       edition display name, used in progress messages
 * @throws InterruptedException if interrupted while waiting; pending tasks are cancelled
 */
private static void processBook(JSONObject book, String businessEditionId, String subjectCode,
                                String key, String value, String editionName) throws InterruptedException {
    String businessBookId = book.getString("businessBookId");
    String bookName = book.getString("bookName");
    Record rBook = new Record().set("book_id", businessBookId)
            .set("book_name", bookName).set("scheme_id", businessEditionId)
            .set("subject_id", subjectCode).set("stage_id", key)
            .set("id", UUID.randomUUID().toString());
    bookList.add(rBook);

    JSONObject argTree = new JSONObject();
    argTree.put("businessBookId", businessBookId);
    argTree.put("childrensFlag", 1);
    argTree.put("parentId", -1);
    argTree.put("searchKeyword", "");
    String respTree = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/chapter/tree",
            argTree.toString(), false, MaxRetryCount);
    if (respTree == null) {
        return;
    }

    // Tolerate responses that lack "data" or "data.tree" instead of failing with an NPE.
    JSONObject parsed = JSONObject.parseObject(respTree);
    JSONObject payload = parsed == null ? null : parsed.getJSONObject("data");
    JSONArray jsonArrTree = payload == null ? null : payload.getJSONArray("tree");
    if (jsonArrTree == null || jsonArrTree.isEmpty()) {
        return;
    }
    JSONArray tempTree = new JSONArray();
    traverseTree(jsonArrTree, tempTree);
    if (tempTree.isEmpty()) {
        return;
    }

    // One pool per book, one task per chapter node.
    ExecutorService nodeExecutor = Executors.newFixedThreadPool(ThreadCount);
    CountDownLatch nodeLatch = new CountDownLatch(tempTree.size());
    boolean finished = false;
    try {
        for (int n = 0; n < tempTree.size(); n++) {
            JSONObject jsonTree = tempTree.getJSONObject(n);
            nodeExecutor.submit(() -> {
                try {
                    processNode(jsonTree, businessBookId, businessEditionId, subjectCode,
                            key, value, editionName, bookName);
                } catch (RuntimeException e) {
                    // The Future is discarded, so an uncaught failure would vanish silently.
                    print("处理章节失败: " + jsonTree.getString("nodeName") + " - " + e);
                } finally {
                    nodeLatch.countDown();
                }
            });
        }
        nodeLatch.await();
        finished = true;
    } finally {
        // Always release the worker threads; cancel outstanding work if we did not finish.
        if (finished) {
            nodeExecutor.shutdown();
        } else {
            nodeExecutor.shutdownNow();
        }
    }
}
/**
 * Records one flattened chapter node and then collects the lessons attached to it.
 *
 * @param jsonTree          flattened node with {@code nodeId}, {@code nodeName}, {@code isLeaf}
 *                          and {@code parentValue}
 * @param businessBookId    id of the owning book
 * @param businessEditionId id of the owning edition
 * @param subjectCode       id of the owning subject
 * @param key               stage id
 * @param value             stage display name, used in progress messages
 * @param editionName       edition display name, used in progress messages
 * @param bookName          book display name, used in progress messages
 */
private static void processNode(JSONObject jsonTree, String businessBookId, String businessEditionId,
                                String subjectCode, String key, String value, String editionName,
                                String bookName) {
    String nodeId = jsonTree.getString("nodeId");
    String nodeName = jsonTree.getString("nodeName");
    Boolean isLeaf = jsonTree.getBoolean("isLeaf");
    String parentValue = jsonTree.getString("parentValue");
    Record rStructure = new Record()
            .set("node_id", nodeId).set("node_name", nodeName).set("parent_id", parentValue)
            // A missing isLeaf flag is stored as 0 instead of failing on unboxing.
            .set("is_leaf", Boolean.TRUE.equals(isLeaf) ? 1 : 0).set("book_id", businessBookId)
            .set("scheme_id", businessEditionId).set("subject_id", subjectCode)
            .set("stage_id", key).set("id", UUID.randomUUID().toString());
    structureList.add(rStructure);
    processResource(nodeId, businessBookId, businessEditionId, subjectCode, key, value,
            editionName, bookName, nodeName);
}

/**
 * Fetches the lessons of one chapter node and records each of them.
 *
 * @param nodeId            id of the chapter node
 * @param businessBookId    id of the owning book
 * @param businessEditionId id of the owning edition; stored as {@code scheme_id} so lesson rows
 *                          reference the same id as the scheme, book and structure rows
 * @param subjectCode       id of the owning subject
 * @param key               stage id
 * @param value             stage display name, used in progress messages
 * @param editionName       edition display name, used in progress messages
 * @param bookName          book display name, used in progress messages
 * @param nodeName          node display name, used in progress messages
 */
private static void processResource(String nodeId, String businessBookId, String businessEditionId,
                                    String subjectCode, String key, String value, String editionName,
                                    String bookName, String nodeName) {
    JSONObject argSource = new JSONObject();
    // NOTE(review): only page 1 with up to 100 rows is requested; if the endpoint paginates,
    // nodes with more lessons are truncated - confirm against the API.
    argSource.put("pageNum", 1);
    argSource.put("pageSize", 100);
    argSource.put("businessBookId", businessBookId);
    argSource.put("nodeId", nodeId);
    argSource.put("stageCode", key);
    argSource.put("subjectCode", subjectCode);
    argSource.put("excellentFlag", "");
    argSource.put("nodeType", 1);
    argSource.put("sortType", 2);
    argSource.put("source", "");
    argSource.put("searchKeyword", "");
    String respSource = doRequestWithRetry("https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage",
            argSource.toString(), false, MaxRetryCount);
    if (respSource == null) {
        return;
    }

    print("正在爬取:【" + value + "" + editionName + "" + bookName + "" + nodeName + "】下的资源!");

    // Tolerate responses that lack "data" or "data.rows" instead of failing with an NPE.
    JSONObject jsonObjSource = JSONObject.parseObject(respSource);
    JSONObject payload = jsonObjSource == null ? null : jsonObjSource.getJSONObject("data");
    JSONArray jsonArrSource = payload == null ? null : payload.getJSONArray("rows");
    if (jsonArrSource == null) {
        return;
    }
    for (int m = 0; m < jsonArrSource.size(); m++) {
        JSONObject jsonSource = jsonArrSource.getJSONObject(m);
        Record record = new Record()
                .set("lesson_id", jsonSource.getString("lessonId"))
                .set("lesson_name", jsonSource.getString("lessonName"))
                .set("node_id", nodeId)
                .set("teacher_id", jsonSource.getString("teacherId"))
                .set("teacher_school_id", jsonSource.getString("teacherSchoolId"))
                .set("teacher_school_name", jsonSource.getString("teacherSchoolName"))
                .set("teacher_name", jsonSource.getString("teacherName"))
                .set("book_id", businessBookId)
                // Previously the edition *name* was written here, breaking joins on scheme_id.
                .set("scheme_id", businessEditionId)
                .set("subject_id", subjectCode)
                .set("stage_id", key)
                .set("id", UUID.randomUUID().toString());
        lessonList.add(record);
        int count = resourceCount.incrementAndGet();
        if (count % 100 == 0) {
            print("已收集资源数量:" + count + "个。");
        }
    }
}
public static void main(String[] args) {
// 初始化数据库连接
PropKit.use("application.properties");
HikariCpPlugin hp = new HikariCpPlugin(PropKit.get("jdbcUrl"), PropKit.get("user"),
PropKit.get("password").trim(), PropKit.get("driverClassName"));
hp.start();
// 配置ActiveRecord插件
ActiveRecordPlugin arp = new ActiveRecordPlugin(hp);
//配置默认小写
arp.setContainerFactory(new CaseInsensitiveContainerFactory(true));
arp.setDialect(new PostgreSqlDialect());
String basePath = Start.class.getClassLoader().getResource(".").getPath();
File sqlDir = new File(basePath + "/Sql");
File[] sqlFiles = sqlDir.listFiles();
File[] var10 = sqlFiles != null ? sqlFiles : new File[0];
int var11 = var10.length;
for (int i = 0; i < var11; ++i) {
File sqlFile = var10[i];
if (sqlFile.getName().indexOf(".sql") > 0) {
arp.addSqlTemplate("/Sql/" + sqlFile.getName());
}
}
arp.start();
//清空爬虫表
String truncatSql = "truncate table t_crawler_subject";
Db.update(truncatSql);
truncatSql = "truncate table t_crawler_scheme";
Db.update(truncatSql);
truncatSql = "truncate table t_crawler_book";
Db.update(truncatSql);
truncatSql = "truncate table t_crawler_structure";
Db.update(truncatSql);
truncatSql = "truncate table t_crawler_resource";
Db.update(truncatSql);
// 清空表
String[] tables = {"t_crawler_subject", "t_crawler_scheme", "t_crawler_book",
"t_crawler_structure", "t_crawler_resource"};
for (String table : tables) {
Db.update("truncate table " + table);
}
Map<String, String> map = new HashMap<>();
map.put("1", "学前");
@ -76,162 +382,69 @@ public class YunXiao {
map.put("4", "高中");
print("开始爬取数据!");
//资源写入的数据集
List<Record> subjectList = new ArrayList<>();
List<Record> lessonList = new ArrayList<>();
List<Record> schemeList = new ArrayList<>();
List<Record> bookList = new ArrayList<>();
List<Record> structureList = new ArrayList<>();
//记录开始时间
long startTime = System.currentTimeMillis();
// 创建线程池
ExecutorService executorService = Executors.newFixedThreadPool(10);
CountDownLatch mainLatch = new CountDownLatch(map.size());
map.forEach((key, value) -> {
String respSubject = Util.doGet("https://yx.ccsjy.cn/api/business/v1/subject/list/" + key);
JSONObject jsonObj = JSONObject.parseObject(respSubject);
JSONArray jsonArr = jsonObj.getJSONObject("data").getJSONArray("rows");
for (int i = 0; i < jsonArr.size(); i++) {
JSONObject jsonSubject = jsonArr.getJSONObject(i);
String subjectCode = jsonSubject.getString("subjectCode");
String subjectName = jsonSubject.getString("subjectName");
Record rSubject = new Record();
rSubject.set("subject_id", subjectCode);
rSubject.set("subject_name", subjectName);
rSubject.set("stage_id", key);
subjectList.add(rSubject);
JSONObject argScheme = new JSONObject();
argScheme.put("subjectCode", subjectCode);
argScheme.put("systemId", 1);
String respScheme = Util.doPost("https://yx.ccsjy.cn/api/business/v1/edition/list", argScheme.toString());
JSONObject jsonObjScheme = JSONObject.parseObject(respScheme);
JSONArray jsonArrScheme = jsonObjScheme.getJSONArray("data");
for (int j = 0; j < jsonArrScheme.size(); j++) {
JSONObject jsonScheme = jsonArrScheme.getJSONObject(j);
String businessEditionId = jsonScheme.getString("businessEditionId");
String editionName = jsonScheme.getString("editionName");
Record rScheme = new Record();
rScheme.set("scheme_id", businessEditionId);
rScheme.set("scheme_name", editionName);
rScheme.set("subject_id", subjectCode);
rScheme.set("stage_id", key);
rScheme.set("id", UUID.randomUUID().toString());
schemeList.add(rScheme);
JSONObject argBook = new JSONObject();
argBook.put("stageCode", key);
argBook.put("subjectCode", subjectCode);
argBook.put("businessEditionId", businessEditionId);
String respBook = Util.doPost("https://yx.ccsjy.cn/api/business/v1/book/list", argBook.toString());
JSONObject jsonObjBook = JSONObject.parseObject(respBook);
JSONArray jsonArrBook = jsonObjBook.getJSONArray("data");
for (int k = 0; k < jsonArrBook.size(); k++) {
JSONObject jsonBook = jsonArrBook.getJSONObject(k);
String businessBookId = jsonBook.getString("businessBookId");
String bookName = jsonBook.getString("bookName");
Record rBook = new Record();
rBook.set("book_id", businessBookId);
rBook.set("book_name", bookName);
rBook.set("scheme_id", businessEditionId);
rBook.set("subject_id", subjectCode);
rBook.set("stage_id", key);
rBook.set("id", UUID.randomUUID().toString());
bookList.add(rBook);
JSONObject argTree = new JSONObject();
argTree.put("businessBookId", businessBookId);
argTree.put("childrensFlag", 1);
argTree.put("parentId", -1);
argTree.put("searchKeyword", "");
String respTree = Util.doPost("https://yx.ccsjy.cn/api/business/v1/chapter/tree", argTree.toString());
JSONObject jsonObjTree = JSONObject.parseObject(respTree);
JSONArray jsonArrTree = jsonObjTree.getJSONObject("data").getJSONArray("tree");
tempTree = new JSONArray();
traverseTree(jsonArrTree);
for (int n = 0; n < tempTree.size(); n++) {
JSONObject jsonTree = tempTree.getJSONObject(n);
String nodeId = jsonTree.getString("nodeId");
String nodeName = jsonTree.getString("nodeName");
Boolean isLeaf = jsonTree.getBoolean("isLeaf");
String parentValue = jsonTree.getString("parentValue");
int is_leaf = 0;
if (isLeaf) {
is_leaf = 1;
}
Record rStructure = new Record();
rStructure.set("node_id", nodeId);
rStructure.set("node_name", nodeName);
rStructure.set("parent_id", parentValue);
rStructure.set("is_leaf", is_leaf);
rStructure.set("book_id", businessBookId);
rStructure.set("scheme_id", businessEditionId);
rStructure.set("subject_id", subjectCode);
rStructure.set("stage_id", key);
rStructure.set("id", UUID.randomUUID().toString());
structureList.add(rStructure);
JSONObject argSource = new JSONObject();
argSource.put("pageNum", 1);
argSource.put("pageSize", 100);
argSource.put("businessBookId", businessBookId);
argSource.put("nodeId", nodeId);
argSource.put("stageCode", key);
argSource.put("subjectCode", subjectCode);
argSource.put("excellentFlag", "");
argSource.put("nodeType", 1);
argSource.put("sortType", 2);
argSource.put("source", "");
argSource.put("searchKeyword", "");
String respSource = Util.doPost("https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage", argSource.toString());
JSONObject jsonObjSource = JSONObject.parseObject(respSource);
JSONArray jsonArrSource = jsonObjSource.getJSONObject("data").getJSONArray("rows");
print("正在爬取:【" + value + "" + subjectName + "" + editionName + "" + bookName + "" + nodeName + "】下的资源!");
for (int m = 0; m < jsonArrSource.size(); m++) {
JSONObject jsonSource = jsonArrSource.getJSONObject(m);
String lessonId = jsonSource.getString("lessonId");
String lessonName = jsonSource.getString("lessonName");
String teacherSchoolId = jsonSource.getString("teacherSchoolId");
String teacherSchoolName = jsonSource.getString("teacherSchoolName");
String teacherId = jsonSource.getString("teacherId");
String teacherName = jsonSource.getString("teacherName");
Record record = new Record();
record.set("lesson_id", lessonId);
record.set("lesson_name", lessonName);
record.set("node_id", nodeId);
record.set("teacher_id", teacherId);
record.set("teacher_school_id", teacherSchoolId);
record.set("teacher_school_name", teacherSchoolName);
record.set("teacher_name", teacherName);
record.set("book_id", businessBookId);
record.set("scheme_id", businessEditionId);
record.set("subject_id", subjectCode);
record.set("stage_id", key);
record.set("id", UUID.randomUUID().toString());
lessonList.add(record);
}
print("已收集资源数量:"+lessonList.size()+"个。");
executorService.submit(() -> {
try {
String respSubject = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/subject/list/" + key,
null, true, MaxRetryCount);
if (respSubject != null) {
JSONObject jsonObj = JSONObject.parseObject(respSubject);
JSONArray subjects = jsonObj.getJSONObject("data").getJSONArray("rows");
ExecutorService subjectExecutor = Executors.newFixedThreadPool(ThreadCount);
CountDownLatch subjectLatch = new CountDownLatch(subjects.size());
for (int i = 0; i < subjects.size(); i++) {
JSONObject subject = subjects.getJSONObject(i);
subjectExecutor.submit(() -> {
try {
processSubject(subject, key, value);
} catch (InterruptedException e) {
throw new RuntimeException(e);
} finally {
subjectLatch.countDown();
}
});
}
subjectLatch.await();
subjectExecutor.shutdown();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
mainLatch.countDown();
}
}
});
});
try {
mainLatch.await();
} catch (InterruptedException e) {
e.printStackTrace();
}
executorService.shutdown();
// 保存数据
print("开始保存数据...");
Db.batchSave("t_crawler_subject", subjectList, 300);
Db.batchSave("t_crawler_scheme", schemeList, 300);
Db.batchSave("t_crawler_book", bookList, 300);
Db.batchSave("t_crawler_structure", structureList, 300);
Db.batchSave("t_crawler_resource", lessonList, 300);
Db.batchSave("t_crawler_subject", subjectList, batchSize);
Db.batchSave("t_crawler_scheme", schemeList, batchSize);
Db.batchSave("t_crawler_book", bookList, batchSize);
Db.batchSave("t_crawler_structure", structureList, batchSize);
Db.batchSave("t_crawler_resource", lessonList, batchSize);
print("爬取数据完成!");
print("总共收集资源:" + lessonList.size() + "个");
print("总共收集节点:" + structureList.size() + "个");
// Record the end time and print the elapsed crawl time in seconds
long endTime = System.currentTimeMillis();
print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
}
}
}

@ -1,435 +0,0 @@
package Tools.Crawler;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.jfinal.kit.PropKit;
import com.jfinal.plugin.activerecord.ActiveRecordPlugin;
import com.jfinal.plugin.activerecord.CaseInsensitiveContainerFactory;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.Record;
import com.jfinal.plugin.activerecord.dialect.PostgreSqlDialect;
import com.jfinal.plugin.hikaricp.HikariCpPlugin;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
public class YunXiaoYouHua {
// 使用线程安全的集合
private static final CopyOnWriteArrayList<Record> subjectList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> lessonList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> schemeList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> bookList = new CopyOnWriteArrayList<>();
private static final CopyOnWriteArrayList<Record> structureList = new CopyOnWriteArrayList<>();
// 用于记录进度
private static final AtomicInteger resourceCount = new AtomicInteger(0);
//线程池大小
private static final int ThreadCount = 4;
//最大重试次数
private static final int MaxRetryCount = 3;
//每次写入的数量
private static final int batchSize = 300;
/**
 * Flattens a nested chapter tree into {@code tempTree}, depth-first with every parent emitted
 * before its children. Each emitted object carries {@code nodeId}, {@code nodeName},
 * {@code isLeaf} and {@code parentValue} copied from the source node's {@code key},
 * {@code title}, {@code isLeaf} and {@code parentValue}.
 *
 * @param treeArray nodes of one tree level; {@code null} is treated as empty
 * @param tempTree  accumulator that receives the flattened nodes
 */
public static void traverseTree(JSONArray treeArray, JSONArray tempTree) {
    if (treeArray == null) {
        return;
    }
    for (Object item : treeArray) {
        JSONObject node = (JSONObject) item;
        JSONObject flat = new JSONObject();
        flat.put("nodeId", node.getString("key"));
        flat.put("nodeName", node.getString("title"));
        flat.put("isLeaf", node.getBoolean("isLeaf"));
        flat.put("parentValue", node.getString("parentValue"));
        tempTree.add(flat);
        // "children" may be absent, explicitly null, or empty; a present-but-null value used to
        // throw a NullPointerException on isEmpty().
        JSONArray children = node.getJSONArray("children");
        if (children != null && !children.isEmpty()) {
            traverseTree(children, tempTree);
        }
    }
}
/**
 * Writes a progress line to standard output: the current timestamp, one space, then the message.
 *
 * @param msg text to print after the timestamp
 */
public static void print(String msg) {
    StringBuilder line = new StringBuilder();
    line.append(DateTime.now()).append(" ").append(msg);
    System.out.println(line.toString());
}
/**
 * Performs a GET or POST through {@code Util} and retries when the call throws.
 *
 * <p>Between failed attempts the thread sleeps for one second. When the last allowed attempt
 * fails, the failure is logged and rethrown wrapped in a {@link RuntimeException}.
 *
 * @param url        endpoint to call
 * @param jsonBody   request body for POST; ignored for GET
 * @param isGet      {@code true} for GET, {@code false} for POST
 * @param maxRetries maximum number of attempts
 * @return the response text, or {@code null} when {@code maxRetries} is not positive or the
 *         thread is interrupted while waiting to retry
 */
private static String doRequestWithRetry(String url, String jsonBody, boolean isGet, int maxRetries) {
    for (int attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            return isGet ? Util.doGet(url) : Util.doPost(url, jsonBody);
        } catch (Exception e) {
            if (attempt == maxRetries) {
                print("请求失败: " + url + " 重试次数: " + attempt);
                throw new RuntimeException("Failed after " + maxRetries + " retries", e);
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException ie) {
                // Preserve the interrupt for the caller and give up on further attempts.
                Thread.currentThread().interrupt();
                return null;
            }
        }
    }
    return null;
}
/**
 * Records one subject and then processes the editions (schemes) listed for it.
 *
 * @param subject subject object; {@code subjectCode} and {@code subjectName} are read from it
 * @param key     stage id the subject belongs to
 * @param value   stage display name, passed down for progress messages
 * @throws InterruptedException if interrupted while waiting for the edition tasks
 */
private static void processSubject(JSONObject subject, String key, String value) throws InterruptedException {
    String subjectCode = subject.getString("subjectCode");
    String subjectName = subject.getString("subjectName");
    Record rSubject = new Record().set("subject_id", subjectCode).set("subject_name", subjectName).set("stage_id", key);
    subjectList.add(rSubject);

    JSONObject argScheme = new JSONObject();
    argScheme.put("subjectCode", subjectCode);
    argScheme.put("systemId", 1);
    String respScheme = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/edition/list",
            argScheme.toString(), false, MaxRetryCount);
    if (respScheme == null) {
        return;
    }

    // A response without a "data" array would otherwise surface as a NullPointerException
    // further down; report it and skip the subject's editions instead.
    JSONObject parsed = JSONObject.parseObject(respScheme);
    JSONArray schemes = parsed == null ? null : parsed.getJSONArray("data");
    if (schemes == null) {
        print("科目【" + subjectName + "】未返回版本数据,已跳过。");
        return;
    }
    // After the subject itself, process the editions below it.
    processSchemes(schemes, subjectCode, key, value);
}
/**
 * Processes all editions (schemes) of one subject on a dedicated fixed-size thread pool and
 * blocks until every edition has been handled.
 *
 * @param schemes     edition objects; {@code null} or empty means nothing to do
 * @param subjectCode id of the owning subject
 * @param key         stage id
 * @param value       stage display name, passed down for progress messages
 * @throws InterruptedException if interrupted while waiting; pending tasks are cancelled
 */
private static void processSchemes(JSONArray schemes, String subjectCode, String key, String value) throws InterruptedException {
    if (schemes == null || schemes.isEmpty()) {
        return;
    }
    ExecutorService schemeExecutor = Executors.newFixedThreadPool(ThreadCount);
    CountDownLatch schemeLatch = new CountDownLatch(schemes.size());
    boolean finished = false;
    try {
        for (int j = 0; j < schemes.size(); j++) {
            JSONObject scheme = schemes.getJSONObject(j);
            schemeExecutor.submit(() -> {
                try {
                    processScheme(scheme, subjectCode, key, value);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the pool instead of hiding it in a wrapper.
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // The Future is discarded, so an uncaught failure would vanish silently.
                    print("处理版本失败: " + scheme.getString("editionName") + " - " + e);
                } finally {
                    schemeLatch.countDown();
                }
            });
        }
        schemeLatch.await();
        finished = true;
    } finally {
        // Always release the worker threads; cancel outstanding work if we did not finish.
        if (finished) {
            schemeExecutor.shutdown();
        } else {
            schemeExecutor.shutdownNow();
        }
    }
}
/**
 * Queues one edition ("scheme") record, fetches the books published under that edition and
 * processes each book concurrently, blocking until all of them have finished.
 *
 * <p>Nothing further happens when the book request yields no response or no book list. The
 * per-call thread pool is always shut down, even when the wait is interrupted.
 *
 * @param scheme      JSON object providing {@code businessEditionId} and {@code editionName}
 * @param subjectCode code of the subject the edition belongs to
 * @param key         stage code, stored as {@code stage_id} and sent as {@code stageCode}
 * @param value       stage name, passed down to the later processing steps
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
private static void processScheme(JSONObject scheme, String subjectCode, String key, String value) throws InterruptedException {
    String businessEditionId = scheme.getString("businessEditionId");
    String editionName = scheme.getString("editionName");
    Record rScheme = new Record().set("scheme_id", businessEditionId).set("scheme_name", editionName).set("subject_id", subjectCode)
            .set("stage_id", key).set("id", UUID.randomUUID().toString());
    schemeList.add(rScheme);

    JSONObject argBook = new JSONObject();
    argBook.put("stageCode", key);
    argBook.put("subjectCode", subjectCode);
    argBook.put("businessEditionId", businessEditionId);
    // Use the shared retry limit like every other request in this class (was a literal 3).
    String respBook = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/book/list", argBook.toString(), false, MaxRetryCount);
    if (respBook == null) {
        return;
    }
    JSONObject parsed = JSONObject.parseObject(respBook);
    JSONArray books = parsed == null ? null : parsed.getJSONArray("data");
    if (books == null || books.isEmpty()) {
        return;
    }

    // Process every book on its own pool and wait for all of them.
    ExecutorService bookExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch bookLatch = new CountDownLatch(books.size());
        for (int k = 0; k < books.size(); k++) {
            JSONObject book = books.getJSONObject(k);
            bookExecutor.submit(() -> {
                try {
                    processBook(book, businessEditionId, subjectCode, key, value, editionName);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the pool instead of discarding it.
                    Thread.currentThread().interrupt();
                } catch (RuntimeException e) {
                    // Nobody reads the Future returned by submit(), so report the failure here.
                    print("处理教材失败: businessEditionId=" + businessEditionId + ",原因: " + e);
                } finally {
                    bookLatch.countDown();
                }
            });
        }
        bookLatch.await();
    } finally {
        bookExecutor.shutdown();
    }
}
/**
 * Queues one book record, downloads the book's chapter tree, flattens it with
 * {@code traverseTree} and processes every node concurrently, blocking until all have finished.
 *
 * <p>Nothing further happens when the tree request yields no response or the response carries
 * no {@code data.tree} array. The per-call thread pool is always shut down, even when the wait
 * is interrupted.
 *
 * @param book              JSON object providing {@code businessBookId} and {@code bookName}
 * @param businessEditionId id of the edition the book belongs to, stored as {@code scheme_id}
 * @param subjectCode       code of the subject the book belongs to
 * @param key               stage code, stored as {@code stage_id}
 * @param value             stage name, passed down to the later processing steps
 * @param editionName       edition name, passed down to the later processing steps
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
private static void processBook(JSONObject book, String businessEditionId, String subjectCode,
        String key, String value, String editionName) throws InterruptedException {
    String businessBookId = book.getString("businessBookId");
    String bookName = book.getString("bookName");
    Record rBook = new Record().set("book_id", businessBookId)
            .set("book_name", bookName).set("scheme_id", businessEditionId)
            .set("subject_id", subjectCode).set("stage_id", key)
            .set("id", UUID.randomUUID().toString());
    bookList.add(rBook);

    JSONObject argTree = new JSONObject();
    argTree.put("businessBookId", businessBookId);
    argTree.put("childrensFlag", 1);
    argTree.put("parentId", -1);
    argTree.put("searchKeyword", "");
    String respTree = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/chapter/tree",
            argTree.toString(), false, MaxRetryCount);
    if (respTree == null) {
        return;
    }
    // Walk the response defensively: a payload without data/tree means "no chapters".
    JSONObject parsed = JSONObject.parseObject(respTree);
    JSONObject data = parsed == null ? null : parsed.getJSONObject("data");
    JSONArray jsonArrTree = data == null ? null : data.getJSONArray("tree");
    if (jsonArrTree == null || jsonArrTree.isEmpty()) {
        return;
    }
    JSONArray tempTree = new JSONArray();
    traverseTree(jsonArrTree, tempTree);

    // Process every flattened node on its own pool and wait for all of them.
    ExecutorService nodeExecutor = Executors.newFixedThreadPool(ThreadCount);
    try {
        CountDownLatch nodeLatch = new CountDownLatch(tempTree.size());
        for (int n = 0; n < tempTree.size(); n++) {
            JSONObject jsonTree = tempTree.getJSONObject(n);
            nodeExecutor.submit(() -> {
                try {
                    processNode(jsonTree, businessBookId, businessEditionId, subjectCode,
                            key, value, editionName, bookName);
                } catch (RuntimeException e) {
                    // Nobody reads the Future returned by submit(), so report the failure here.
                    print("处理章节节点失败: businessBookId=" + businessBookId + ",原因: " + e);
                } finally {
                    nodeLatch.countDown();
                }
            });
        }
        nodeLatch.await();
    } finally {
        nodeExecutor.shutdown();
    }
}
/**
 * Queues one chapter-structure record for a flattened tree node and then collects the lessons
 * attached to that node.
 *
 * @param jsonTree          flattened node providing {@code nodeId}, {@code nodeName},
 *                          {@code isLeaf} and {@code parentValue}
 * @param businessBookId    id of the book the node belongs to
 * @param businessEditionId id of the edition, stored as {@code scheme_id}
 * @param subjectCode       code of the subject
 * @param key               stage code, stored as {@code stage_id}
 * @param value             stage name, used in progress output
 * @param editionName       edition name, passed on to {@code processResource}
 * @param bookName          book name, passed on to {@code processResource}
 */
private static void processNode(JSONObject jsonTree, String businessBookId, String businessEditionId,
        String subjectCode, String key, String value, String editionName,
        String bookName) {
    String nodeId = jsonTree.getString("nodeId");
    String nodeName = jsonTree.getString("nodeName");
    // getBoolean yields null when the flag is missing; unboxing that would throw and drop the
    // whole node, so a missing flag is treated as "not a leaf".
    boolean isLeaf = Boolean.TRUE.equals(jsonTree.getBoolean("isLeaf"));
    String parentValue = jsonTree.getString("parentValue");
    Record rStructure = new Record()
            .set("node_id", nodeId).set("node_name", nodeName).set("parent_id", parentValue)
            .set("is_leaf", isLeaf ? 1 : 0).set("book_id", businessBookId)
            .set("scheme_id", businessEditionId).set("subject_id", subjectCode)
            .set("stage_id", key).set("id", UUID.randomUUID().toString());
    structureList.add(rStructure);
    processResource(nodeId, businessBookId, subjectCode, key, value,
            editionName, bookName, nodeName);
}
/**
 * Requests the on-demand lessons attached to one chapter node ({@code pageNum} 1,
 * {@code pageSize} 100) and queues one record per returned row in {@code lessonList}.
 *
 * <p>Nothing is queued when the request yields no response or the response carries no
 * {@code data.rows} array.
 *
 * @param nodeId         id of the chapter node whose lessons are requested
 * @param businessBookId id of the book the node belongs to
 * @param subjectCode    code of the subject
 * @param key            stage code, sent as {@code stageCode} and stored as {@code stage_id}
 * @param value          stage name, used only in progress output
 * @param editionName    edition name, used in progress output and stored in {@code scheme_id}
 * @param bookName       book name, used only in progress output
 * @param nodeName       node name, used only in progress output
 */
private static void processResource(String nodeId, String businessBookId, String subjectCode,
        String key, String value, String editionName, String bookName, String nodeName) {
    JSONObject argSource = new JSONObject();
    // NOTE(review): only the first page is requested; nodes with more than 100 lessons would
    // presumably be truncated -- confirm against the API's paging contract.
    argSource.put("pageNum", 1);
    argSource.put("pageSize", 100);
    argSource.put("businessBookId", businessBookId);
    argSource.put("nodeId", nodeId);
    argSource.put("stageCode", key);
    argSource.put("subjectCode", subjectCode);
    argSource.put("excellentFlag", "");
    argSource.put("nodeType", 1);
    argSource.put("sortType", 2);
    argSource.put("source", "");
    argSource.put("searchKeyword", "");
    String respSource = doRequestWithRetry("https://yx.ccsjy.cn/api/cloud-school/v1/cloudLesson/getOnDemandLessonPage",
            argSource.toString(), false, MaxRetryCount);
    if (respSource == null) {
        return;
    }
    // Walk the response defensively: a payload without data/rows means "no lessons".
    JSONObject jsonObjSource = JSONObject.parseObject(respSource);
    JSONObject data = jsonObjSource == null ? null : jsonObjSource.getJSONObject("data");
    JSONArray jsonArrSource = data == null ? null : data.getJSONArray("rows");
    if (jsonArrSource == null) {
        return;
    }
    print("正在爬取:【" + value + "" + editionName + "" + bookName + "" + nodeName + "】下的资源!");
    for (int m = 0; m < jsonArrSource.size(); m++) {
        JSONObject jsonSource = jsonArrSource.getJSONObject(m);
        // NOTE(review): scheme_id receives the edition NAME here, while the scheme, book and
        // structure records store the edition id in scheme_id. The id is not passed to this
        // method, so the value is left unchanged -- confirm which one is intended.
        Record record = new Record()
                .set("lesson_id", jsonSource.getString("lessonId"))
                .set("lesson_name", jsonSource.getString("lessonName"))
                .set("node_id", nodeId)
                .set("teacher_id", jsonSource.getString("teacherId"))
                .set("teacher_school_id", jsonSource.getString("teacherSchoolId"))
                .set("teacher_school_name", jsonSource.getString("teacherSchoolName"))
                .set("teacher_name", jsonSource.getString("teacherName"))
                .set("book_id", businessBookId)
                .set("scheme_id", editionName)
                .set("subject_id", subjectCode)
                .set("stage_id", key)
                .set("id", UUID.randomUUID().toString());
        lessonList.add(record);
        // Report progress once per hundred collected lessons.
        int count = resourceCount.incrementAndGet();
        if (count % 100 == 0) {
            print("已收集资源数量:" + count + "个。");
        }
    }
}
/**
 * Entry point: connects to the database, truncates the five crawler tables, crawls the four
 * school stages in parallel and finally batch-saves everything that was collected.
 *
 * <p>If the main thread is interrupted while the crawl is running, the stage workers are
 * cancelled and nothing is saved, because the result lists would still be written to by
 * running workers.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // Initialise the database connection pool and the ActiveRecord plugin.
    PropKit.use("application.properties");
    HikariCpPlugin hp = new HikariCpPlugin(PropKit.get("jdbcUrl"), PropKit.get("user"),
            PropKit.get("password").trim(), PropKit.get("driverClassName"));
    hp.start();
    ActiveRecordPlugin arp = new ActiveRecordPlugin(hp);
    arp.setContainerFactory(new CaseInsensitiveContainerFactory(true));
    arp.setDialect(new PostgreSqlDialect());
    arp.start();

    // Empty the target tables before collecting fresh data.
    String[] tables = {"t_crawler_subject", "t_crawler_scheme", "t_crawler_book",
            "t_crawler_structure", "t_crawler_resource"};
    for (String table : tables) {
        Db.update("truncate table " + table);
    }

    // Stage code -> stage name.
    Map<String, String> map = new HashMap<>();
    map.put("1", "学前");
    map.put("2", "小学");
    map.put("3", "初中");
    map.put("4", "高中");
    print("开始爬取数据!");
    // Remember when the crawl started so the elapsed time can be reported.
    long startTime = System.currentTimeMillis();

    // One worker per stage; each worker fans out further per subject.
    ExecutorService executorService = Executors.newFixedThreadPool(map.size());
    CountDownLatch mainLatch = new CountDownLatch(map.size());
    try {
        map.forEach((key, value) -> executorService.submit(() -> {
            try {
                String respSubject = doRequestWithRetry("https://yx.ccsjy.cn/api/business/v1/subject/list/" + key,
                        null, true, MaxRetryCount);
                if (respSubject == null) {
                    return;
                }
                // Walk the response defensively: no data/rows means "no subjects".
                JSONObject jsonObj = JSONObject.parseObject(respSubject);
                JSONObject data = jsonObj == null ? null : jsonObj.getJSONObject("data");
                JSONArray subjects = data == null ? null : data.getJSONArray("rows");
                if (subjects == null || subjects.isEmpty()) {
                    return;
                }
                ExecutorService subjectExecutor = Executors.newFixedThreadPool(ThreadCount);
                try {
                    CountDownLatch subjectLatch = new CountDownLatch(subjects.size());
                    for (int i = 0; i < subjects.size(); i++) {
                        JSONObject subject = subjects.getJSONObject(i);
                        subjectExecutor.submit(() -> {
                            try {
                                processSubject(subject, key, value);
                            } catch (InterruptedException e) {
                                // Keep the interrupt visible to the pool instead of discarding it.
                                Thread.currentThread().interrupt();
                            } catch (RuntimeException e) {
                                // Nobody reads the Future returned by submit(), so report it here.
                                print("处理科目失败: " + value + ",原因: " + e);
                            } finally {
                                subjectLatch.countDown();
                            }
                        });
                    }
                    subjectLatch.await();
                } finally {
                    // Always release the pool, even when the wait above is interrupted.
                    subjectExecutor.shutdown();
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } catch (Exception e) {
                print("爬取学段失败: " + value + ",原因: " + e);
                e.printStackTrace();
            } finally {
                mainLatch.countDown();
            }
        }));
        mainLatch.await();
    } catch (InterruptedException e) {
        // Workers may still be appending to the result lists, so do not save them.
        Thread.currentThread().interrupt();
        executorService.shutdownNow();
        print("爬取被中断,未保存数据。");
        return;
    } finally {
        executorService.shutdown();
    }

    // Persist everything that was collected.
    print("开始保存数据...");
    Db.batchSave("t_crawler_subject", subjectList, batchSize);
    Db.batchSave("t_crawler_scheme", schemeList, batchSize);
    Db.batchSave("t_crawler_book", bookList, batchSize);
    Db.batchSave("t_crawler_structure", structureList, batchSize);
    Db.batchSave("t_crawler_resource", lessonList, batchSize);
    print("爬取数据完成!");
    print("总共收集资源:" + lessonList.size() + "个");
    print("总共收集节点:" + structureList.size() + "个");
    // Record the end time and report the elapsed time in whole seconds.
    long endTime = System.currentTimeMillis();
    print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
}
}
Loading…
Cancel
Save