You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

113 lines
4.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package com.dsideal.base.Test;
import cn.hutool.core.io.FileUtil;
import com.jfinal.kit.StrKit;
import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import java.io.*;
import java.util.Arrays;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import static com.dsideal.base.Tools.FillData.DataEaseKit.DsKit.DocxUnzipDirectory;
/**
 * Prints an indented outline of selected sections of a Word {@code .docx} file.
 *
 * <p>The document is unzipped into {@code DocxUnzipDirectory}, {@code word/document.xml} is parsed
 * with dom4j, and the text of each body paragraph is inspected. Paragraphs are printed only while
 * the most recent heading that started with a Chinese numeral was section "四" or "五".
 */
public class ReadWordTOC {

    /** Chinese numerals that mark the start of a top-level heading. */
    private static final List<String> HEADING_NUMERALS =
            Arrays.asList("一", "二", "三", "四", "五", "六", "七", "八", "九", "十");

    /**
     * Unzips the hard-coded document, parses its body and prints the outline to standard output.
     *
     * @param args ignored
     * @throws IOException       if the document cannot be read or unzipped
     * @throws DocumentException if {@code document.xml} cannot be parsed or has no body element
     */
    public static void main(String[] args) throws IOException, DocumentException {
        String wordPath = "c:/4.docx";

        // Unzip into a clean directory so stale files from a previous run cannot be picked up.
        File unzipDirectory = new File(DocxUnzipDirectory);
        if (unzipDirectory.exists()) {
            FileUtils.deleteDirectory(unzipDirectory);
        }
        unzip(new File(wordPath));

        // Read the XML. A forward slash is accepted by java.io.File on every platform,
        // whereas a backslash is only a separator on Windows.
        String xmlPath = DocxUnzipDirectory + "word/document.xml";
        SAXReader reader = new SAXReader();
        Document document = reader.read(new File(xmlPath));
        Element root = document.getRootElement();
        Element body = root.element("body");
        if (body == null) {
            throw new DocumentException("No body element found in " + xmlPath);
        }
        printOutline(body.elements("p"));
    }

    /**
     * Extracts every entry of the given zip archive below {@code DocxUnzipDirectory}.
     *
     * <p>NOTE(review): entry names are appended to the target directory unchecked, exactly as
     * before. If this is ever run on archives from an untrusted source, validate that the
     * resolved path stays inside the target directory.
     */
    private static void unzip(File archive) throws IOException {
        byte[] buffer = new byte[8192];
        // try-with-resources closes the streams even when reading or writing fails midway.
        try (FileInputStream fileInput = new FileInputStream(archive);
             ZipInputStream zipInput = new ZipInputStream(fileInput)) {
            ZipEntry entry;
            while ((entry = zipInput.getNextEntry()) != null) {
                File target = new File(DocxUnzipDirectory + entry.getName());
                if (entry.isDirectory()) {
                    target.mkdirs();
                    continue;
                }
                File targetDirectory = target.getParentFile();
                if (targetDirectory != null) {
                    targetDirectory.mkdirs();
                }
                try (FileOutputStream output = new FileOutputStream(target)) {
                    int read;
                    while ((read = zipInput.read(buffer)) != -1) {
                        output.write(buffer, 0, read);
                    }
                }
            }
        }
    }

    /** Concatenates the text of all {@code t} elements found in the {@code r} children of a paragraph. */
    private static String paragraphText(Element paragraph) {
        StringBuilder text = new StringBuilder();
        for (Element run : paragraph.elements("r")) {
            for (Element textNode : run.elements("t")) {
                text.append(textNode.getText());
            }
        }
        return text.toString();
    }

    /** Walks the paragraphs in order and prints those that belong to section "四" or "五". */
    private static void printOutline(List<Element> paragraphs) {
        boolean out = false;
        boolean parent = false;
        for (Element paragraph : paragraphs) {
            String content = paragraphText(paragraph);
            if (StrKit.isBlank(content)) {
                continue;
            }
            // Skip paragraphs that contain a figure reference such as "(图1)".
            // NOTE(review): the original repeated the identical ASCII check twice. The second one
            // is assumed to have been the full-width '(' (U+FF08) variant before the characters
            // were normalized, so it is written as an explicit escape here - confirm.
            if (content.contains("(图") || content.contains("\uFF08图")) {
                continue;
            }

            // NOTE(review): startsWith("") is always true, so 'out' is set for every paragraph and
            // the single tab below is always printed. The literal probably held a character that
            // was lost (the file is flagged for ambiguous Unicode); behavior is kept as-is until
            // the intended prefix is confirmed.
            if (content.startsWith("")) {
                out = true;
            }
            // First character is a digit and the second is a dot, e.g. "1.".
            if (content.length() > 1 && content.charAt(1) == '.'
                    && content.charAt(0) >= '0' && content.charAt(0) <= '9') {
                out = true;
            }
            String firstChar = content.substring(0, 1);
            if (HEADING_NUMERALS.contains(firstChar)) {
                // Only sections four and five are printed; any other numeral heading turns output off.
                parent = firstChar.equals("四") || firstChar.equals("五");
                out = true;
            }
            if (StrKit.isBlank(content.trim())) {
                continue;
            }
            if (out && parent) {
                if (content.startsWith("")) {
                    System.out.print("\t");
                }
                if (content.matches("^[0-9]+\\..*")) {
                    System.out.print("\t\t");
                }
                System.out.println(content);
                out = false;
            }
        }
    }
}