package com.dsideal.base.Test; import cn.hutool.core.io.FileUtil; import com.jfinal.kit.StrKit; import org.apache.commons.io.FileUtils; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; import java.io.*; import java.util.Arrays; import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import static com.dsideal.base.Tools.FillData.DataEaseKit.DsKit.DocxUnzipDirectory; public class ReadWordTOC { public static void main(String[] args) throws IOException, DocumentException { String wordPath = "c:/双柏县人口变化及其对教育的影响.docx"; //解压缩 if (new File(DocxUnzipDirectory).exists()) { FileUtils.deleteDirectory(new File(DocxUnzipDirectory)); } File file = new File(wordPath);//取得word文件 FileInputStream inputStream = new FileInputStream(file); ZipInputStream zipInputStream = new ZipInputStream(inputStream); ZipEntry entry; byte[] ch = new byte[256]; while ((entry = zipInputStream.getNextEntry()) != null) { File zFile = new File(DocxUnzipDirectory + entry.getName()); if (entry.isDirectory()) { if (!zFile.exists()) { zFile.mkdirs(); } zipInputStream.closeEntry(); } else { File fpath = new File(zFile.getParent()); if (!fpath.exists()) { fpath.mkdirs(); } FileOutputStream outputStream = new FileOutputStream(zFile); int i; while ((i = zipInputStream.read(ch)) != -1) { outputStream.write(ch, 0, i); } zipInputStream.closeEntry(); outputStream.close(); } } inputStream.close(); //读入XML String xmlPath = DocxUnzipDirectory + "word\\document.xml"; ///w:document/w:body/w:p/w:r/w:t //System.out.println(FileUtil.readUtf8String(xmlPath).contains("加强")); SAXReader reader = new SAXReader(); // 创建 SAXReader 对象,读取 XML 文件 Document document = reader.read(new File(xmlPath)); Element root = document.getRootElement();// 获取根元素 List children = root.element("body").elements("p");//工作区 boolean out = false; for (Element child : children) { if (child.getName().equals("p")) { // if (child.element("pPr") == null) continue; // if (child.element("pPr").element("rPr") == null) continue; // if (child.element("pPr").element("rPr").element("rFonts") == null) continue; // // String font = child.element("pPr").element("rPr").element("rFonts").attribute("ascii").getValue(); // //子元素需要继续处理 // if (font.equals("方正仿宋简体")) continue; // if (font.equals("华文楷体")) continue; // // if (child.element("pPr").element("rPr").element("sz") != null) { // int fontSize = Integer.parseInt(child.element("pPr").element("rPr").element("sz").attribute("val").getValue()); // if (fontSize != 32) continue; // } List pChildren = child.elements(); String content = ""; for (Element pChild : pChildren) { if (!pChild.getName().equals("pPr")) { if (pChild.getName().equals("r")) { for (Element t : pChild.elements("t")) { //此元素的字体与字号,同受它同级上面属性ascii控制 String font = pChild.element("rPr").element("rFonts").attribute("ascii").getValue(); if (font.equals("华文楷体")) continue; int fontSize = Integer.parseInt(pChild.element("rPr").element("sz").attribute("val").getValue()); if (fontSize != 32) continue; if (font.equals("方正仿宋简体")) { //加粗,有/b标签的保留,其它的不保留continue掉 if (pChild.element("rPr").element("b") == null) continue; } content = content + t.getText(); } } } } if (!StrKit.isBlank(content)) { //如果content是 "图"+数字形式的,不输出 if (!content.contains("(图") && !content.contains("(图")) { //输出全部内容 // if(content.startsWith("(")) System.out.print("\t"); // //如果content是以 数字+.开头的,那么多输出两个tab // if (content.matches("^[0-9]+\\..*")) System.out.print("\t\t"); // System.out.println(content); //只输出四和五 String[] printDx = {"一", "二", "三", "六", "七", "八", "九", "十"}; //转为 List List printDxList = Arrays.asList(printDx); if (content.startsWith("四") || content.startsWith("五")) { out = true; } for (String s : printDxList) { if (content.startsWith(s)) out = false; } if (out) { if(content.startsWith("(")) System.out.print("\t"); System.out.println(content); } } } } } } }