|
|
|
@ -1,6 +1,7 @@
|
|
|
|
|
package com.dsideal.base.Test;
|
|
|
|
|
|
|
|
|
|
import cn.hutool.core.io.FileUtil;
|
|
|
|
|
import com.dsideal.base.DataEase.Model.DataEaseModel;
|
|
|
|
|
import com.jfinal.kit.StrKit;
|
|
|
|
|
import org.apache.commons.io.FileUtils;
|
|
|
|
|
import org.dom4j.Document;
|
|
|
|
@ -13,7 +14,9 @@ import java.util.Arrays;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.zip.ZipEntry;
|
|
|
|
|
import java.util.zip.ZipInputStream;
|
|
|
|
|
|
|
|
|
|
import com.dsideal.base.Tools.Util.LocalMysqlConnectUtil;
|
|
|
|
|
|
|
|
|
|
import static com.dsideal.base.Tools.FillData.DataEaseKit.DsKit.DocxUnzipDirectory;
|
|
|
|
|
|
|
|
|
|
public class ReadWordTOC {
|
|
|
|
@ -21,6 +24,9 @@ public class ReadWordTOC {
|
|
|
|
|
// Chinese numerals one through ten. Used (through printDxList) to test whether the
// first character of a piece of extracted text is one of these numerals.
static String[] printDx = {"一", "二", "三", "四", "五", "六", "七", "八", "九", "十"};
|
|
|
|
|
// List view of printDx, so that contains() can be used for the numeral lookup.
|
|
|
|
|
static List<String> printDxList = Arrays.asList(printDx);
|
|
|
|
|
// File-name fragments for documents that cannot be processed and are therefore skipped:
// main ignores any file whose name contains one of these substrings.
// NOTE(review): "~$" is presumably the prefix of Word's temporary owner files — confirm.
|
|
|
|
|
static String[] excludeCityList = {"~$", "磨憨-磨丁", "经开区", "阳宗海"};
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 提取Word文档中的目录信息
|
|
|
|
|
*
|
|
|
|
@ -69,7 +75,7 @@ public class ReadWordTOC {
|
|
|
|
|
Element root = document.getRootElement();// 获取根元素
|
|
|
|
|
List<Element> children = root.element("body").elements("p");//工作区
|
|
|
|
|
boolean out = false;
|
|
|
|
|
boolean parent = false;
|
|
|
|
|
int parent = 0;
|
|
|
|
|
for (Element child : children) {
|
|
|
|
|
if (child.getName().equals("p")) {
|
|
|
|
|
List<Element> pChildren = child.elements();
|
|
|
|
@ -94,19 +100,25 @@ public class ReadWordTOC {
|
|
|
|
|
//太长的不要
|
|
|
|
|
if (content.length() > 40) continue;
|
|
|
|
|
if (printDxList.contains(content.substring(0, 1))) {
|
|
|
|
|
if ((content.charAt(0) == '四' || content.charAt(0) == '五') && content.charAt(1) == '、') {
|
|
|
|
|
parent = true;
|
|
|
|
|
} else {
|
|
|
|
|
parent = false;
|
|
|
|
|
if (content.charAt(0) == '四' && content.charAt(1) == '、') {
|
|
|
|
|
parent = 4;
|
|
|
|
|
}
|
|
|
|
|
if (content.charAt(0) == '五' && content.charAt(1) == '、') {
|
|
|
|
|
parent = 5;
|
|
|
|
|
}
|
|
|
|
|
out = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (StrKit.isBlank(content.trim())) continue;
|
|
|
|
|
if (out && parent) {
|
|
|
|
|
if (content.startsWith("(")) System.out.print("\t");
|
|
|
|
|
if (content.matches("^[0-9]+\\..*")) System.out.print("\t\t");
|
|
|
|
|
System.out.println(content.split("。")[0]);
|
|
|
|
|
if (out && parent > 0) {
|
|
|
|
|
if (!content.startsWith("(")) continue;
|
|
|
|
|
if (parent == 4) {
|
|
|
|
|
System.out.println("==================四===============");
|
|
|
|
|
System.out.println(content.split("。")[0]);
|
|
|
|
|
}
|
|
|
|
|
if (parent == 5) {
|
|
|
|
|
System.out.println("==================五===============");
|
|
|
|
|
System.out.println(content.split("。")[0]);
|
|
|
|
|
}
|
|
|
|
|
out = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -115,8 +127,10 @@ public class ReadWordTOC {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Shared DataEaseModel instance; main calls getAreaName and getCityNameByAreaName on it
// to derive the area and city names from each document's file name.
static DataEaseModel dm = new DataEaseModel();
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) throws IOException, DocumentException {
|
|
|
|
|
//初始化数据库连接
|
|
|
|
|
//初始化数据库连接
|
|
|
|
|
LocalMysqlConnectUtil.Init();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -130,6 +144,25 @@ public class ReadWordTOC {
|
|
|
|
|
String fileName = file.getName();
|
|
|
|
|
//判断是否为docx文件
|
|
|
|
|
if (fileName.endsWith(".docx") && !fileName.startsWith("~")) {
|
|
|
|
|
boolean flag = false;
|
|
|
|
|
for (String s : excludeCityList) {
|
|
|
|
|
if (file.getName().contains(s)) {
|
|
|
|
|
flag = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (flag) continue;
|
|
|
|
|
//县区名称
|
|
|
|
|
String areaName = dm.getAreaName(file.getName());
|
|
|
|
|
//市州名称
|
|
|
|
|
String cityName = dm.getCityNameByAreaName(areaName);
|
|
|
|
|
|
|
|
|
|
if (StrKit.isBlank(cityName) || StrKit.isBlank(areaName)) {
|
|
|
|
|
System.out.println("发现异常数据,请人工处理:" + file.getName());
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
//县区名称
|
|
|
|
|
System.out.println("正在进行" + cityName + "-" + areaName + "的数据填充~");
|
|
|
|
|
getToc(file.getAbsolutePath());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|