|
|
|
@ -13,12 +13,22 @@ import java.util.Arrays;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.zip.ZipEntry;
|
|
|
|
|
import java.util.zip.ZipInputStream;
|
|
|
|
|
|
|
|
|
|
import com.dsideal.base.Tools.Util.LocalMysqlConnectUtil;
|
|
|
|
|
import static com.dsideal.base.Tools.FillData.DataEaseKit.DsKit.DocxUnzipDirectory;
|
|
|
|
|
|
|
|
|
|
public class ReadWordTOC {
|
|
|
|
|
public static void main(String[] args) throws IOException, DocumentException {
|
|
|
|
|
String wordPath = "c:/4.docx";
|
|
|
|
|
// Chinese numerals one through ten (一–十), used to recognise numbered headings
|
|
|
|
|
static String[] printDx = {"一", "二", "三", "四", "五", "六", "七", "八", "九", "十"};
|
|
|
|
|
// The same numerals as a List<String>, so contains() can be used for lookups
|
|
|
|
|
static List<String> printDxList = Arrays.asList(printDx);
|
|
|
|
|
/**
|
|
|
|
|
* 提取Word文档中的目录信息
|
|
|
|
|
*
|
|
|
|
|
* @param wordPath
|
|
|
|
|
* @throws DocumentException
|
|
|
|
|
* @throws IOException
|
|
|
|
|
*/
|
|
|
|
|
public static void getToc(String wordPath) throws DocumentException, IOException {
|
|
|
|
|
//解压缩
|
|
|
|
|
if (new File(DocxUnzipDirectory).exists()) {
|
|
|
|
|
FileUtils.deleteDirectory(new File(DocxUnzipDirectory));
|
|
|
|
@ -77,20 +87,14 @@ public class ReadWordTOC {
|
|
|
|
|
if (!StrKit.isBlank(content)) {
|
|
|
|
|
//如果content是 "图"+数字形式的,不输出
|
|
|
|
|
if (!content.contains("(图") && !content.contains("(图")) {
|
|
|
|
|
//只输出四和五
|
|
|
|
|
String[] printDx = {"一", "二", "三", "四", "五", "六", "七", "八", "九", "十"};
|
|
|
|
|
//转为 List<String>
|
|
|
|
|
List<String> printDxList = Arrays.asList(printDx);
|
|
|
|
|
//如果文字不是以上面printDx中的某一个开头,而且不是以数字+.开头,不输出
|
|
|
|
|
if (content.startsWith("(")) {
|
|
|
|
|
out = true;
|
|
|
|
|
}
|
|
|
|
|
//如果content第一位是数字,第二位是小数点
|
|
|
|
|
if (content.length() > 1 && content.charAt(1) == '.' && (content.charAt(0) >= '0' && content.charAt(0) <= '9')) {
|
|
|
|
|
if (content.startsWith("(") && printDxList.contains(String.valueOf(content.charAt(1)))) {
|
|
|
|
|
out = true;
|
|
|
|
|
}
|
|
|
|
|
//太长的不要
|
|
|
|
|
if (content.length() > 40) continue;
|
|
|
|
|
if (printDxList.contains(content.substring(0, 1))) {
|
|
|
|
|
if ((content.substring(0, 1).equals("四") || content.substring(0, 1).equals("五"))) {
|
|
|
|
|
if ((content.charAt(0) == '四' || content.charAt(0) == '五') && content.charAt(1) == '、') {
|
|
|
|
|
parent = true;
|
|
|
|
|
} else {
|
|
|
|
|
parent = false;
|
|
|
|
@ -102,7 +106,7 @@ public class ReadWordTOC {
|
|
|
|
|
if (out && parent) {
|
|
|
|
|
if (content.startsWith("(")) System.out.print("\t");
|
|
|
|
|
if (content.matches("^[0-9]+\\..*")) System.out.print("\t\t");
|
|
|
|
|
System.out.println(content);
|
|
|
|
|
System.out.println(content.split("。")[0]);
|
|
|
|
|
out = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -110,4 +114,25 @@ public class ReadWordTOC {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) throws IOException, DocumentException {
|
|
|
|
|
//初始化数据库连接
|
|
|
|
|
LocalMysqlConnectUtil.Init();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String parentPath = "D:\\dsidealDoc\\全省及州市县区人口与教育报告集20241023\\133个县区报告2022\\县区研究报告";
|
|
|
|
|
List<File> files = FileUtil.loopFiles(parentPath, file -> true);
|
|
|
|
|
//处理这个目录
|
|
|
|
|
if (files != null) {
|
|
|
|
|
for (File file : files) {
|
|
|
|
|
//判断file是不是目录,是目录的需要跳过
|
|
|
|
|
if (file.isDirectory()) continue;
|
|
|
|
|
String fileName = file.getName();
|
|
|
|
|
//判断是否为docx文件
|
|
|
|
|
if (fileName.endsWith(".docx") && !fileName.startsWith("~")) {
|
|
|
|
|
getToc(file.getAbsolutePath());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|