main
黄海 7 months ago
parent 0d23b4ebce
commit 1324cbdd50

@ -67,7 +67,7 @@ public class YunXiaoOneByOne {
Db.update(truncatSql);
truncatSql = "truncate table t_crawler_structure";
Db.update(truncatSql);
truncatSql = "truncate table t_crawler_resource";
truncatSql = "truncate table t_crawler_lesson";
Db.update(truncatSql);
Map<String, String> map = new HashMap<>();
@ -232,7 +232,7 @@ public class YunXiaoOneByOne {
Db.batchSave("t_crawler_scheme", schemeList, 300);
Db.batchSave("t_crawler_book", bookList, 300);
Db.batchSave("t_crawler_structure", structureList, 300);
Db.batchSave("t_crawler_resource", lessonList, 300);
Db.batchSave("t_crawler_lesson", lessonList, 300);
print("爬取数据完成!");
}
}

@ -4,20 +4,18 @@ import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.dsideal.QingLong.Start;
import com.jfinal.kit.Kv;
import com.jfinal.kit.PropKit;
import com.jfinal.plugin.activerecord.ActiveRecordPlugin;
import com.jfinal.plugin.activerecord.CaseInsensitiveContainerFactory;
import com.jfinal.plugin.activerecord.Db;
import com.jfinal.plugin.activerecord.*;
import com.jfinal.plugin.activerecord.Record;
import com.jfinal.plugin.activerecord.dialect.PostgreSqlDialect;
import com.jfinal.plugin.hikaricp.HikariCpPlugin;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
public class YunXiao {
// 使用线程安全的集合
@ -37,6 +35,13 @@ public class YunXiao {
//每次写入的数量
private static final int batchSize = 300;
//标准名称
public static Map<String, Kv> bzSchoolNameList = new HashMap<>();
//手工映射过名称
public static Map<String, Kv> handMatchSchoolList = new HashMap<>();
//哪些课程已经匹配过
public static Set<String> lessonAlreadyMatch = new HashSet<>();
public static void traverseTree(JSONArray treeArray, JSONArray tempTree) {
for (Object item : treeArray) {
JSONObject node = (JSONObject) item;
@ -344,6 +349,64 @@ public class YunXiao {
}
}
/**
 * Loads the standard school list through the {@code YunXiao.getBzSchoolNameList}
 * SQL template and indexes it by school name.
 *
 * <p>Each value is a {@link Kv} carrying {@code organization_name},
 * {@code organization_no} and {@code gather_regionc} of the row, i.e. the same
 * keys that {@link #getHandMatchSchoolList()} provides, so both maps can be
 * consumed the same way.
 *
 * @return a mutable map from {@code organization_name} to the row's details;
 *         when several rows share a name, the last one returned by the query wins
 */
public static Map<String, Kv> getBzSchoolNameList() {
    SqlPara sqlPara = Db.getSqlPara("YunXiao.getBzSchoolNameList");
    List<Record> list = Db.find(sqlPara);
    Map<String, Kv> map = new HashMap<>();
    for (Record record : list) {
        String bz_school_name = record.getStr("organization_name");
        Kv kv = Kv.create();
        // Keep the name inside the Kv as well: consumers of this map read
        // kv.getStr("organization_name"), which used to yield null because
        // only the number and region were stored.
        kv.set("organization_name", bz_school_name);
        kv.set("organization_no", record.getStr("organization_no"));
        kv.set("gather_regionc", record.getStr("gather_regionc"));
        map.put(bz_school_name, kv);
    }
    return map;
}
/**
 * Reads every row of {@code t_crawler_lesson_school} whose {@code match_type}
 * is 2 and indexes the rows by their {@code original_school_name}.
 *
 * <p>Each value is a {@link Kv} holding the row's {@code organization_name},
 * {@code organization_no} and {@code gather_regionc}.
 *
 * @return a mutable map from original school name to the mapped organization
 *         details; when several rows share an original name, the last one
 *         returned by the query wins
 */
public static Map<String, Kv> getHandMatchSchoolList() {
    Map<String, Kv> byOriginalName = new HashMap<>();
    for (Record row : Db.find("select * from t_crawler_lesson_school where match_type=2")) {
        Kv mapping = Kv.create()
                .set("organization_name", row.getStr("organization_name"))
                .set("organization_no", row.getStr("organization_no"))
                .set("gather_regionc", row.getStr("gather_regionc"));
        byOriginalName.put(row.getStr("original_school_name"), mapping);
    }
    return byOriginalName;
}
/**
 * Collects the {@code lesson_id} of every row currently stored in
 * {@code t_crawler_lesson_school}.
 *
 * @return a mutable set of the lesson ids found in the table; empty when the
 *         table has no rows
 */
public static Set<String> getLessonAlreadyMatch() {
    return Db.find("select * from t_crawler_lesson_school")
            .stream()
            .map(row -> row.getStr("lesson_id"))
            .collect(Collectors.toCollection(HashSet::new));
}
public static void main(String[] args) {
// 初始化数据库连接
PropKit.use("application.properties");
@ -370,7 +433,7 @@ public class YunXiao {
// 清空表
String[] tables = {"t_crawler_subject", "t_crawler_scheme", "t_crawler_book",
"t_crawler_structure", "t_crawler_resource"};
"t_crawler_structure", "t_crawler_lesson"};
for (String table : tables) {
Db.update("truncate table " + table);
}
@ -381,6 +444,11 @@ public class YunXiao {
map.put("3", "初中");
map.put("4", "高中");
//初始化三个全局量
bzSchoolNameList = getBzSchoolNameList();
handMatchSchoolList = getHandMatchSchoolList();
lessonAlreadyMatch = getLessonAlreadyMatch();
print("开始爬取数据!");
//记录开始时间
@ -438,8 +506,52 @@ public class YunXiao {
Db.batchSave("t_crawler_scheme", schemeList, batchSize);
Db.batchSave("t_crawler_book", bookList, batchSize);
Db.batchSave("t_crawler_structure", structureList, batchSize);
Db.batchSave("t_crawler_resource", lessonList, batchSize);
Db.batchSave("t_crawler_lesson", lessonList, batchSize);
print("开始记录课程与学校的关系...");
//保存课程与学校的关联表
List<Record> writeList = new ArrayList<>();
for (Record record : lessonList) {
String original_school_name = record.getStr("teacher_school_name");//原始学校名称
String organization_name = "";
String organization_no = "";
String gather_regionc = "";
int match_type = 1;
String lesson_id = record.getStr("lesson_id");
if (lessonAlreadyMatch.contains(lesson_id)) continue;//如果记录过此课程的学校关系,本次就不再记录了
if (bzSchoolNameList.containsKey(record.getStr("school_name"))) {
//100%命中的名称
Kv kv = bzSchoolNameList.get(record.getStr("school_name"));
organization_name = kv.getStr("organization_name");
organization_no = kv.getStr("organization_no");
gather_regionc = kv.getStr("gather_regionc");
match_type = 1;
} else if (handMatchSchoolList.containsKey(record.getStr("school_name"))) {//手动映射过的名称
Kv kv = handMatchSchoolList.get(record.getStr("school_name"));
organization_name = kv.getStr("organization_name");
organization_no = kv.getStr("organization_no");
gather_regionc = kv.getStr("gather_regionc");
match_type = 2;
} else {
match_type = 0;//待匹配
}
Record rWrite = new Record();
rWrite.set("lesson_id", lesson_id);
rWrite.set("original_school_name", original_school_name);
rWrite.set("organization_name", organization_name);
rWrite.set("organization_no", organization_no);
rWrite.set("gather_regionc", gather_regionc);
rWrite.set("match_type", match_type);
writeList.add(rWrite);
}
//对writeList根据lesson_id去重
writeList = writeList.stream().collect(
Collectors.collectingAndThen(Collectors.toCollection(() -> new TreeSet<>(
Comparator.comparing(r -> r.getStr("lesson_id"))
)
), ArrayList::new));
Db.batchSave("t_crawler_lesson_school", writeList, batchSize);
print("爬取数据完成!");
print("总共收集资源:" + lessonList.size() + "个");
print("总共收集节点:" + structureList.size() + "个");
@ -447,4 +559,4 @@ public class YunXiao {
long endTime = System.currentTimeMillis();
print("爬取数据耗时:" + (endTime - startTime) / 1000 + "秒");
}
}
}

@ -0,0 +1,6 @@
#namespace("YunXiao")
-- Fetch the list of standard school names
#sql("getBzSchoolNameList")
select organization_no,organization_name,gather_regionc from 1001 where a=01 and length(organization_no)=10
#end
#end
Loading…
Cancel
Save