|
|
|
@ -4,20 +4,18 @@ import cn.hutool.core.date.DateTime;
|
|
|
|
|
import com.alibaba.fastjson.JSONArray;
|
|
|
|
|
import com.alibaba.fastjson.JSONObject;
|
|
|
|
|
import com.dsideal.QingLong.Start;
|
|
|
|
|
import com.jfinal.kit.Kv;
|
|
|
|
|
import com.jfinal.kit.PropKit;
|
|
|
|
|
import com.jfinal.plugin.activerecord.ActiveRecordPlugin;
|
|
|
|
|
import com.jfinal.plugin.activerecord.CaseInsensitiveContainerFactory;
|
|
|
|
|
import com.jfinal.plugin.activerecord.Db;
|
|
|
|
|
import com.jfinal.plugin.activerecord.*;
|
|
|
|
|
import com.jfinal.plugin.activerecord.Record;
|
|
|
|
|
import com.jfinal.plugin.activerecord.dialect.PostgreSqlDialect;
|
|
|
|
|
import com.jfinal.plugin.hikaricp.HikariCpPlugin;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.UUID;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
import java.util.concurrent.*;
|
|
|
|
|
import java.util.concurrent.atomic.AtomicInteger;
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
|
|
public class YunXiao {
|
|
|
|
|
// 使用线程安全的集合
|
|
|
|
@ -37,6 +35,13 @@ public class YunXiao {
|
|
|
|
|
// Number of records written per batch; passed as the batch size to Db.batchSave.
|
|
|
|
|
private static final int batchSize = 300;
|
|
|
|
|
|
|
|
|
|
// Standard school names: filled in main() from getBzSchoolNameList(), keyed by organization_name.
|
|
|
|
|
public static Map<String, Kv> bzSchoolNameList = new HashMap<>();
|
|
|
|
|
// Manually mapped school names: filled in main() from getHandMatchSchoolList(), keyed by original_school_name.
|
|
|
|
|
public static Map<String, Kv> handMatchSchoolList = new HashMap<>();
|
|
|
|
|
// lesson_id values that already have a lesson-to-school row; filled in main() from getLessonAlreadyMatch().
|
|
|
|
|
public static Set<String> lessonAlreadyMatch = new HashSet<>();
|
|
|
|
|
|
|
|
|
|
public static void traverseTree(JSONArray treeArray, JSONArray tempTree) {
|
|
|
|
|
for (Object item : treeArray) {
|
|
|
|
|
JSONObject node = (JSONObject) item;
|
|
|
|
@ -344,6 +349,64 @@ public class YunXiao {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 获取标准学校名称列表
|
|
|
|
|
*
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public static Map<String, Kv> getBzSchoolNameList() {
|
|
|
|
|
SqlPara sqlPara = Db.getSqlPara("YunXiao.getBzSchoolNameList");
|
|
|
|
|
List<Record> list = Db.find(sqlPara);
|
|
|
|
|
Map<String, Kv> map = new HashMap<>();
|
|
|
|
|
for (Record record : list) {
|
|
|
|
|
String bz_school_name = record.getStr("organization_name");
|
|
|
|
|
Kv kv = Kv.create();
|
|
|
|
|
kv.set("organization_no", record.getStr("organization_no"));
|
|
|
|
|
kv.set("gather_regionc", record.getStr("gather_regionc"));
|
|
|
|
|
map.put(bz_school_name, kv);
|
|
|
|
|
}
|
|
|
|
|
return map;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 获取手动匹配学校名称列表
|
|
|
|
|
*
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public static Map<String, Kv> getHandMatchSchoolList() {
|
|
|
|
|
String sql = "select * from t_crawler_lesson_school where match_type=2";
|
|
|
|
|
List<Record> list = Db.find(sql);
|
|
|
|
|
Map<String, Kv> map = new HashMap<>();
|
|
|
|
|
for (Record record : list) {
|
|
|
|
|
String original_school_name = record.getStr("original_school_name");
|
|
|
|
|
Kv kv = Kv.create();
|
|
|
|
|
String organization_name = record.getStr("organization_name");
|
|
|
|
|
String organization_no = record.getStr("organization_no");
|
|
|
|
|
String gather_regionc = record.getStr("gather_regionc");
|
|
|
|
|
kv.set("organization_name", organization_name);
|
|
|
|
|
kv.set("organization_no", organization_no);
|
|
|
|
|
kv.set("gather_regionc", gather_regionc);
|
|
|
|
|
map.put(original_school_name, kv);
|
|
|
|
|
}
|
|
|
|
|
return map;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 获取已匹配的课程
|
|
|
|
|
*
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public static Set<String> getLessonAlreadyMatch() {
|
|
|
|
|
String sql = "select * from t_crawler_lesson_school";
|
|
|
|
|
List<Record> list = Db.find(sql);
|
|
|
|
|
Set<String> set = new HashSet<>();
|
|
|
|
|
for (Record record : list) {
|
|
|
|
|
set.add(record.getStr("lesson_id"));
|
|
|
|
|
}
|
|
|
|
|
return set;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
|
// 初始化数据库连接
|
|
|
|
|
PropKit.use("application.properties");
|
|
|
|
@ -370,7 +433,7 @@ public class YunXiao {
|
|
|
|
|
|
|
|
|
|
// 清空表
|
|
|
|
|
String[] tables = {"t_crawler_subject", "t_crawler_scheme", "t_crawler_book",
|
|
|
|
|
"t_crawler_structure", "t_crawler_resource"};
|
|
|
|
|
"t_crawler_structure", "t_crawler_lesson"};
|
|
|
|
|
for (String table : tables) {
|
|
|
|
|
Db.update("truncate table " + table);
|
|
|
|
|
}
|
|
|
|
@ -381,6 +444,11 @@ public class YunXiao {
|
|
|
|
|
map.put("3", "初中");
|
|
|
|
|
map.put("4", "高中");
|
|
|
|
|
|
|
|
|
|
//初始化三个全局量
|
|
|
|
|
bzSchoolNameList = getBzSchoolNameList();
|
|
|
|
|
handMatchSchoolList = getHandMatchSchoolList();
|
|
|
|
|
lessonAlreadyMatch = getLessonAlreadyMatch();
|
|
|
|
|
|
|
|
|
|
print("开始爬取数据!");
|
|
|
|
|
|
|
|
|
|
//记录开始时间
|
|
|
|
@ -438,8 +506,52 @@ public class YunXiao {
|
|
|
|
|
Db.batchSave("t_crawler_scheme", schemeList, batchSize);
|
|
|
|
|
Db.batchSave("t_crawler_book", bookList, batchSize);
|
|
|
|
|
Db.batchSave("t_crawler_structure", structureList, batchSize);
|
|
|
|
|
Db.batchSave("t_crawler_resource", lessonList, batchSize);
|
|
|
|
|
|
|
|
|
|
Db.batchSave("t_crawler_lesson", lessonList, batchSize);
|
|
|
|
|
|
|
|
|
|
print("开始记录课程与学校的关系...");
|
|
|
|
|
//保存课程与学校的关联表
|
|
|
|
|
List<Record> writeList = new ArrayList<>();
|
|
|
|
|
for (Record record : lessonList) {
|
|
|
|
|
String original_school_name = record.getStr("teacher_school_name");//原始学校名称
|
|
|
|
|
String organization_name = "";
|
|
|
|
|
String organization_no = "";
|
|
|
|
|
String gather_regionc = "";
|
|
|
|
|
int match_type = 1;
|
|
|
|
|
String lesson_id = record.getStr("lesson_id");
|
|
|
|
|
if (lessonAlreadyMatch.contains(lesson_id)) continue;//如果记录过此课程的学校关系,本次就不再记录了
|
|
|
|
|
if (bzSchoolNameList.containsKey(record.getStr("school_name"))) {
|
|
|
|
|
//100%命中的名称
|
|
|
|
|
Kv kv = bzSchoolNameList.get(record.getStr("school_name"));
|
|
|
|
|
organization_name = kv.getStr("organization_name");
|
|
|
|
|
organization_no = kv.getStr("organization_no");
|
|
|
|
|
gather_regionc = kv.getStr("gather_regionc");
|
|
|
|
|
match_type = 1;
|
|
|
|
|
} else if (handMatchSchoolList.containsKey(record.getStr("school_name"))) {//手动映射过的名称
|
|
|
|
|
Kv kv = handMatchSchoolList.get(record.getStr("school_name"));
|
|
|
|
|
organization_name = kv.getStr("organization_name");
|
|
|
|
|
organization_no = kv.getStr("organization_no");
|
|
|
|
|
gather_regionc = kv.getStr("gather_regionc");
|
|
|
|
|
match_type = 2;
|
|
|
|
|
} else {
|
|
|
|
|
match_type = 0;//待匹配
|
|
|
|
|
}
|
|
|
|
|
Record rWrite = new Record();
|
|
|
|
|
rWrite.set("lesson_id", lesson_id);
|
|
|
|
|
rWrite.set("original_school_name", original_school_name);
|
|
|
|
|
rWrite.set("organization_name", organization_name);
|
|
|
|
|
rWrite.set("organization_no", organization_no);
|
|
|
|
|
rWrite.set("gather_regionc", gather_regionc);
|
|
|
|
|
rWrite.set("match_type", match_type);
|
|
|
|
|
writeList.add(rWrite);
|
|
|
|
|
}
|
|
|
|
|
//对writeList根据lesson_id去重
|
|
|
|
|
writeList = writeList.stream().collect(
|
|
|
|
|
Collectors.collectingAndThen(Collectors.toCollection(() -> new TreeSet<>(
|
|
|
|
|
Comparator.comparing(r -> r.getStr("lesson_id"))
|
|
|
|
|
)
|
|
|
|
|
), ArrayList::new));
|
|
|
|
|
|
|
|
|
|
Db.batchSave("t_crawler_lesson_school", writeList, batchSize);
|
|
|
|
|
print("爬取数据完成!");
|
|
|
|
|
print("总共收集资源:" + lessonList.size() + "个");
|
|
|
|
|
print("总共收集节点:" + structureList.size() + "个");
|
|
|
|
@ -447,4 +559,4 @@ public class YunXiao {
|
|
|
|
|
long endTime = System.currentTimeMillis();
|
|
|
|
|
print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|