You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

113 lines
4.9 KiB

8 months ago
package com.dsideal.base.Test;
import cn.hutool.core.io.FileUtil;
8 months ago
import com.jfinal.kit.StrKit;
8 months ago
import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import java.io.*;
8 months ago
import java.util.Arrays;
8 months ago
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import static com.dsideal.base.Tools.FillData.DataEaseKit.DsKit.DocxUnzipDirectory;
public class ReadWordTOC {
public static void main(String[] args) throws IOException, DocumentException {
8 months ago
String wordPath = "c:/4.docx";
8 months ago
//解压缩
if (new File(DocxUnzipDirectory).exists()) {
FileUtils.deleteDirectory(new File(DocxUnzipDirectory));
}
File file = new File(wordPath);//取得word文件
FileInputStream inputStream = new FileInputStream(file);
ZipInputStream zipInputStream = new ZipInputStream(inputStream);
ZipEntry entry;
byte[] ch = new byte[256];
while ((entry = zipInputStream.getNextEntry()) != null) {
File zFile = new File(DocxUnzipDirectory + entry.getName());
if (entry.isDirectory()) {
if (!zFile.exists()) {
zFile.mkdirs();
}
zipInputStream.closeEntry();
} else {
File fpath = new File(zFile.getParent());
if (!fpath.exists()) {
fpath.mkdirs();
}
FileOutputStream outputStream = new FileOutputStream(zFile);
int i;
while ((i = zipInputStream.read(ch)) != -1) {
outputStream.write(ch, 0, i);
}
zipInputStream.closeEntry();
outputStream.close();
}
}
inputStream.close();
//读入XML
String xmlPath = DocxUnzipDirectory + "word\\document.xml";
SAXReader reader = new SAXReader(); // 创建 SAXReader 对象,读取 XML 文件
Document document = reader.read(new File(xmlPath));
Element root = document.getRootElement();// 获取根元素
List<Element> children = root.element("body").elements("p");//工作区
8 months ago
boolean out = false;
8 months ago
boolean parent = false;
8 months ago
for (Element child : children) {
if (child.getName().equals("p")) {
List<Element> pChildren = child.elements();
String content = "";
for (Element pChild : pChildren) {
8 months ago
if (!pChild.getName().equals("pPr")) {
8 months ago
if (pChild.getName().equals("r")) {
for (Element t : pChild.elements("t")) {
content = content + t.getText();
}
}
}
}
8 months ago
if (!StrKit.isBlank(content)) {
//如果content是 "图"+数字形式的,不输出
if (!content.contains("(图") && !content.contains("(图")) {
//只输出四和五
8 months ago
String[] printDx = {"一", "二", "三", "四", "五", "六", "七", "八", "九", "十"};
8 months ago
//转为 List<String>
List<String> printDxList = Arrays.asList(printDx);
8 months ago
//如果文字不是以上面printDx中的某一个开头而且不是以数字+.开头,不输出
if (content.startsWith("")) {
8 months ago
out = true;
}
8 months ago
//如果content第一位是数字第二位是小数点
if (content.length() > 1 && content.charAt(1) == '.' && (content.charAt(0) >= '0' && content.charAt(0) <= '9')) {
out = true;
}
if (printDxList.contains(content.substring(0, 1))) {
8 months ago
if ((content.substring(0, 1).equals("四") || content.substring(0, 1).equals("五"))) {
parent = true;
} else {
parent = false;
}
8 months ago
out = true;
8 months ago
}
8 months ago
if (StrKit.isBlank(content.trim())) continue;
8 months ago
if (out && parent) {
8 months ago
if (content.startsWith("")) System.out.print("\t");
if (content.matches("^[0-9]+\\..*")) System.out.print("\t\t");
8 months ago
System.out.println(content);
8 months ago
out = false;
8 months ago
}
}
}
8 months ago
}
}
}
}