package com.dsideal.base.Test;

import cn.hutool.core.io.FileUtil;
import com.jfinal.kit.StrKit;
import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import java.io.*;
import java.util.Arrays;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import static com.dsideal.base.Tools.FillData.DataEaseKit.DsKit.DocxUnzipDirectory;

/**
 * Unzips a .docx file, parses {@code word/document.xml} and prints an indented outline of the
 * heading-like paragraphs that belong to the sections whose top-level heading starts with
 * "四" or "五".
 */
public class ReadWordTOC {

    /** Characters that mark a top-level heading when they are the first character of a paragraph. */
    private static final List<String> CHINESE_NUMERALS =
            Arrays.asList("一", "二", "三", "四", "五", "六", "七", "八", "九", "十");

    /** Paragraph that starts with one or more ASCII digits followed by a dot, e.g. "12.xxx". */
    private static final java.util.regex.Pattern NUMBERED_ITEM =
            java.util.regex.Pattern.compile("^[0-9]+\\..*");

    /**
     * Entry point: extracts {@code c:/2.docx} into {@code DocxUnzipDirectory}, reads the main
     * document part and prints the filtered outline to standard output.
     *
     * @param args unused
     * @throws IOException       if the archive cannot be read or extracted
     * @throws DocumentException if {@code document.xml} cannot be parsed or has no body element
     */
    public static void main(String[] args) throws IOException, DocumentException {
        String wordPath = "c:/2.docx";

        // Start from a clean extraction directory so stale parts of a previous run are not read.
        File unzipDir = new File(DocxUnzipDirectory);
        if (unzipDir.exists()) {
            FileUtils.deleteDirectory(unzipDir);
        }
        unzip(new File(wordPath), unzipDir);

        // Build the path with File(parent, child) instead of a hard-coded '\\' separator.
        File xmlFile = new File(new File(unzipDir, "word"), "document.xml");
        SAXReader reader = new SAXReader();
        Document document = reader.read(xmlFile);
        Element root = document.getRootElement();
        Element body = root.element("body");
        if (body == null) {
            throw new DocumentException("No body element found in " + xmlFile);
        }
        List<Element> paragraphs = body.elements("p");
        printOutline(paragraphs);
    }

    /** Extracts every entry of the zip archive {@code archive} below {@code targetDir}. */
    private static void unzip(File archive, File targetDir) throws IOException {
        File canonicalRoot = targetDir.getCanonicalFile();
        byte[] buffer = new byte[8192];
        try (FileInputStream fileIn = new FileInputStream(archive);
             ZipInputStream zipIn = new ZipInputStream(fileIn)) {
            ZipEntry entry;
            while ((entry = zipIn.getNextEntry()) != null) {
                File target = new File(targetDir, entry.getName());
                // Reject entries such as "../../x" that would be written outside targetDir.
                if (!target.getCanonicalFile().toPath().startsWith(canonicalRoot.toPath())) {
                    throw new IOException("Zip entry escapes target directory: " + entry.getName());
                }
                if (entry.isDirectory()) {
                    ensureDirectory(target);
                } else {
                    ensureDirectory(target.getParentFile());
                    try (FileOutputStream fileOut = new FileOutputStream(target)) {
                        int read;
                        while ((read = zipIn.read(buffer)) != -1) {
                            fileOut.write(buffer, 0, read);
                        }
                    }
                }
                zipIn.closeEntry();
            }
        }
    }

    /** Creates {@code dir} (and missing parents) or fails loudly when that is not possible. */
    private static void ensureDirectory(File dir) throws IOException {
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("Could not create directory: " + dir);
        }
    }

    /** Concatenates the text of all {@code t} children of the {@code r} children of a paragraph. */
    private static String paragraphText(Element paragraph) {
        StringBuilder text = new StringBuilder();
        List<Element> parts = paragraph.elements();
        for (Element part : parts) {
            if (!"r".equals(part.getName())) {
                continue;
            }
            List<Element> textNodes = part.elements("t");
            for (Element textNode : textNodes) {
                text.append(textNode.getText());
            }
        }
        return text.toString();
    }

    /**
     * Prints the heading-like paragraphs of the sections "四" and "五".
     *
     * <p>A paragraph is a candidate when it starts with "(", with a single digit followed by a
     * dot, or with one of {@link #CHINESE_NUMERALS}. A numeral heading switches printing on
     * (for "四"/"五") or off (for any other numeral) for itself and the paragraphs after it.
     */
    private static void printOutline(List<Element> paragraphs) {
        boolean out = false;    // current (or an earlier, not yet consumed) paragraph is a candidate
        boolean parent = false; // the most recent numeral heading was "四" or "五"
        for (Element paragraph : paragraphs) {
            String content = paragraphText(paragraph);
            if (StrKit.isBlank(content)) {
                continue;
            }
            // Skip figure references.
            // NOTE(review): the original tested this same literal twice; possibly one of the two
            // tests was meant to use a different parenthesis width — confirm against real input.
            if (content.contains("(图")) {
                continue;
            }

            String first = content.substring(0, 1);
            if (content.startsWith("(")) {
                out = true;
            }
            // First character is an ASCII digit and the second one is a dot.
            if (content.length() > 1
                    && content.charAt(1) == '.'
                    && content.charAt(0) >= '0' && content.charAt(0) <= '9') {
                out = true;
            }
            if (CHINESE_NUMERALS.contains(first)) {
                parent = "四".equals(first) || "五".equals(first);
                out = true;
            }
            if (StrKit.isBlank(content.trim())) {
                continue;
            }

            if (out && parent) {
                if (content.startsWith("(")) {
                    System.out.print("\t");
                }
                if (NUMBERED_ITEM.matcher(content).matches()) {
                    System.out.print("\t\t");
                }
                System.out.println(content);
                out = false;
            }
        }
    }
}