|
|
|
@ -18,7 +18,7 @@ import static com.dsideal.base.Tools.FillData.DataEaseKit.DsKit.DocxUnzipDirecto
|
|
|
|
|
|
|
|
|
|
public class ReadWordTOC {
|
|
|
|
|
public static void main(String[] args) throws IOException, DocumentException {
|
|
|
|
|
String wordPath = "c:/双柏县人口变化及其对教育的影响.docx";
|
|
|
|
|
String wordPath = "c:/2.docx";
|
|
|
|
|
//解压缩
|
|
|
|
|
if (new File(DocxUnzipDirectory).exists()) {
|
|
|
|
|
FileUtils.deleteDirectory(new File(DocxUnzipDirectory));
|
|
|
|
@ -53,8 +53,6 @@ public class ReadWordTOC {
|
|
|
|
|
|
|
|
|
|
//读入XML
|
|
|
|
|
String xmlPath = DocxUnzipDirectory + "word\\document.xml";
|
|
|
|
|
///w:document/w:body/w:p/w:r/w:t
|
|
|
|
|
//System.out.println(FileUtil.readUtf8String(xmlPath).contains("加强"));
|
|
|
|
|
|
|
|
|
|
SAXReader reader = new SAXReader(); // 创建 SAXReader 对象,读取 XML 文件
|
|
|
|
|
Document document = reader.read(new File(xmlPath));
|
|
|
|
@ -63,36 +61,12 @@ public class ReadWordTOC {
|
|
|
|
|
boolean out = false;
|
|
|
|
|
for (Element child : children) {
|
|
|
|
|
if (child.getName().equals("p")) {
|
|
|
|
|
// if (child.element("pPr") == null) continue;
|
|
|
|
|
// if (child.element("pPr").element("rPr") == null) continue;
|
|
|
|
|
// if (child.element("pPr").element("rPr").element("rFonts") == null) continue;
|
|
|
|
|
//
|
|
|
|
|
// String font = child.element("pPr").element("rPr").element("rFonts").attribute("ascii").getValue();
|
|
|
|
|
// //子元素需要继续处理
|
|
|
|
|
// if (font.equals("方正仿宋简体")) continue;
|
|
|
|
|
// if (font.equals("华文楷体")) continue;
|
|
|
|
|
//
|
|
|
|
|
// if (child.element("pPr").element("rPr").element("sz") != null) {
|
|
|
|
|
// int fontSize = Integer.parseInt(child.element("pPr").element("rPr").element("sz").attribute("val").getValue());
|
|
|
|
|
// if (fontSize != 32) continue;
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
List<Element> pChildren = child.elements();
|
|
|
|
|
String content = "";
|
|
|
|
|
for (Element pChild : pChildren) {
|
|
|
|
|
if (!pChild.getName().equals("pPr")) {
|
|
|
|
|
if (pChild.getName().equals("r")) {
|
|
|
|
|
for (Element t : pChild.elements("t")) {
|
|
|
|
|
//此元素的字体与字号,同受它同级上面<rPr>的<rFonts>属性ascii控制
|
|
|
|
|
String font = pChild.element("rPr").element("rFonts").attribute("ascii").getValue();
|
|
|
|
|
if (font.equals("华文楷体")) continue;
|
|
|
|
|
int fontSize = Integer.parseInt(pChild.element("rPr").element("sz").attribute("val").getValue());
|
|
|
|
|
if (fontSize != 32) continue;
|
|
|
|
|
if (font.equals("方正仿宋简体")) {
|
|
|
|
|
//加粗,有/b标签的保留,其它的不保留continue掉
|
|
|
|
|
if (pChild.element("rPr").element("b") == null) continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
content = content + t.getText();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -102,26 +76,28 @@ public class ReadWordTOC {
|
|
|
|
|
if (!StrKit.isBlank(content)) {
|
|
|
|
|
//如果content是 "图"+数字形式的,不输出
|
|
|
|
|
if (!content.contains("(图") && !content.contains("(图")) {
|
|
|
|
|
//输出全部内容
|
|
|
|
|
// if(content.startsWith("(")) System.out.print("\t");
|
|
|
|
|
// //如果content是以 数字+.开头的,那么多输出两个tab
|
|
|
|
|
// if (content.matches("^[0-9]+\\..*")) System.out.print("\t\t");
|
|
|
|
|
// System.out.println(content);
|
|
|
|
|
//只输出四和五
|
|
|
|
|
String[] printDx = {"一", "二", "三", "六", "七", "八", "九", "十"};
|
|
|
|
|
String[] printDx = {"一", "二", "三", "四", "五", "六", "七", "八", "九", "十"};
|
|
|
|
|
//转为 List<String>
|
|
|
|
|
List<String> printDxList = Arrays.asList(printDx);
|
|
|
|
|
|
|
|
|
|
if (content.startsWith("四") || content.startsWith("五")) {
|
|
|
|
|
//如果文字不是以上面printDx中的某一个开头,而且不是以数字+.开头,不输出
|
|
|
|
|
if (content.startsWith("(")) {
|
|
|
|
|
out = true;
|
|
|
|
|
}
|
|
|
|
|
for (String s : printDxList) {
|
|
|
|
|
if (content.startsWith(s)) out = false;
|
|
|
|
|
//如果content第一位是数字,第二位是小数点
|
|
|
|
|
if (content.length() > 1 && content.charAt(1) == '.' && (content.charAt(0) >= '0' && content.charAt(0) <= '9')) {
|
|
|
|
|
out = true;
|
|
|
|
|
}
|
|
|
|
|
if (printDxList.contains(content.substring(0, 1))) {
|
|
|
|
|
out = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (StrKit.isBlank(content.trim())) continue;
|
|
|
|
|
if (out) {
|
|
|
|
|
if(content.startsWith("(")) System.out.print("\t");
|
|
|
|
|
if (content.startsWith("(")) System.out.print("\t");
|
|
|
|
|
if (content.matches("^[0-9]+\\..*")) System.out.print("\t\t");
|
|
|
|
|
System.out.println(content);
|
|
|
|
|
out = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|