poi方式读取word目录大纲_poi读取word大纲级别_勒布朗.马克思的博客-CSDN博客

如下图的word目录大纲该怎么获取呢?
在这里插入图片描述
试过用 jacob 方式,确实可以直接读取到大纲的编号,但是 jacob 不支持在 Linux 上调用(不确定,反正网上都说不支持,我自己目前也没有找到在 Linux 上调用的办法),没办法,只好用 poi 硬编码实现了一把。目前只支持读取 4 个层级以内的大纲(大纲的编号,也就是 1.、1.1.、……这些编号,必须是 Word 自动生成的,不能是手动写上去的,这是规范)。基本原理是:读取每个大纲标题的级别,再根据标题级别来构造层级关系。
话不多说上才艺:

maven依赖(示例代码中的 StringUtils 来自 commons-lang3,还需要另外引入 org.apache.commons:commons-lang3 依赖)

<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>4.1.2</version>
        </dependency>

ReadWordTest.java

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

import java.io.File;
import java.io.FileInputStream;
import java.util.*;

/**
 * Reads the heading outline of a .docx file with Apache POI and prints every heading
 * together with a generated multi-level number ("1.", "1.1.", "1.1.1.", ...).
 *
 * <p>The numbers are rebuilt purely from the outline level of each heading paragraph,
 * so the result only matches the document when its headings are numbered consistently.
 */
public class ReadWordTest {

    /**
     * Number of heading levels that are numbered (outline levels 0-8).
     * Outline level 9 is treated as "body text" and is not numbered.
     */
    private static final int MAX_OUTLINE_LEVELS = 9;

    /** Document that is read when no path is passed on the command line. */
    private static final String DEFAULT_DOCX_PATH = "C:\\Users\\lenovo\\Desktop\\test5.docx";

    /**
     * Looks up the outline level of a paragraph.
     *
     * <p>The outline level can be read through {@code getPPr().getOutlineLvl()}, but Word
     * may define it in three different places, which are checked in this order:
     * <ol>
     *   <li>directly on the paragraph;</li>
     *   <li>on the paragraph's style;</li>
     *   <li>on the style that the paragraph's style is based on.</li>
     * </ol>
     * If none of them defines a level, the paragraph's style id is returned, but only when
     * it consists solely of digits; a non-numeric id (for example "a3") yields an empty
     * string so that callers never receive a value they cannot parse.
     * NOTE(review): the numeric style-id fallback is kept from the original code; whether a
     * style id of "1" really corresponds to outline level 1 should be confirmed.
     *
     * @param doc  document that owns the paragraph; used to resolve styles
     * @param para paragraph to inspect
     * @return the level as a decimal string, or an empty string when none was found
     */
    private static String getTitleLvl(XWPFDocument doc, XWPFParagraph para) {
        try {
            // 1) Level set directly on the paragraph.
            if (para.getCTP().getPPr().getOutlineLvl() != null) {
                return String.valueOf(para.getCTP().getPPr().getOutlineLvl().getVal());
            }
        } catch (RuntimeException ignored) {
            // Best effort: a missing element in the chain surfaces as an exception; try the next source.
        }
        try {
            // 2) Level set on the paragraph's style.
            if (doc.getStyles().getStyle(para.getStyle()).getCTStyle().getPPr().getOutlineLvl() != null) {
                return String.valueOf(
                        doc.getStyles().getStyle(para.getStyle()).getCTStyle().getPPr().getOutlineLvl().getVal());
            }
        } catch (RuntimeException ignored) {
            // Best effort: no style, or the style has no paragraph properties; try the next source.
        }
        try {
            // 3) Level set on the style that the paragraph's style is based on.
            String baseStyleId = doc.getStyles().getStyle(para.getStyle()).getCTStyle().getBasedOn().getVal();
            if (doc.getStyles().getStyle(baseStyleId).getCTStyle().getPPr().getOutlineLvl() != null) {
                return String.valueOf(
                        doc.getStyles().getStyle(baseStyleId).getCTStyle().getPPr().getOutlineLvl().getVal());
            }
        } catch (RuntimeException ignored) {
            // Best effort: no base style, or it has no paragraph properties; try the fallback.
        }
        try {
            // 4) Fallback: the style id itself, accepted only when it is a plain number.
            String styleId = para.getStyleID();
            if (isDigitsOnly(styleId)) {
                return styleId;
            }
        } catch (RuntimeException ignored) {
            // Best effort: treat the paragraph as having no level.
        }
        return "";
    }

    /** Returns {@code true} when {@code value} is non-empty and contains only the characters 0-9. */
    private static boolean isDigitsOnly(String value) {
        if (value == null || value.isEmpty()) {
            return false;
        }
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }

    /** Parses a level string; returns -1 when it is empty or not a valid integer. */
    private static int parseLevel(String titleLvl) {
        if (StringUtils.isEmpty(titleLvl)) {
            return -1;
        }
        try {
            return Integer.parseInt(titleLvl.trim());
        } catch (NumberFormatException ignored) {
            // Not a number (or out of int range): the paragraph is not treated as a heading.
            return -1;
        }
    }

    /**
     * Assigns a multi-level number prefix to every heading, in document order.
     *
     * <p>One counter is kept per level. Visiting a heading increments the counter of its
     * level and resets all deeper counters, and the prefix is the counters from level 0 down
     * to the heading's level, each followed by a dot. When a level is skipped (for example
     * a level-2 heading directly under a level-0 heading) the skipped counter stays at 0,
     * giving a prefix such as "1.0.1.".
     *
     * @param headings headings whose level is in the range 0 to {@code MAX_OUTLINE_LEVELS - 1}
     * @throws IllegalArgumentException if a heading has a level outside that range
     */
    private static void assignPrefixes(List<ReadDto> headings) {
        int[] counters = new int[MAX_OUTLINE_LEVELS];
        for (ReadDto heading : headings) {
            int level = heading.getTitleLevel();
            if (level < 0 || level >= MAX_OUTLINE_LEVELS) {
                throw new IllegalArgumentException("Unsupported outline level: " + level);
            }
            counters[level]++;
            // A new heading at this level restarts the numbering of everything below it.
            Arrays.fill(counters, level + 1, counters.length, 0);

            StringBuilder prefix = new StringBuilder();
            for (int i = 0; i <= level; i++) {
                prefix.append(counters[i]).append('.');
            }
            heading.setPrefix(prefix.toString());
        }
    }

    /**
     * Reads a .docx file, collects its heading paragraphs, numbers them and prints one
     * line per heading in the form {@code text:<prefix><text>,level:<level>}.
     *
     * @param args optional; {@code args[0]} is the path of the .docx file to read. When it
     *             is absent the built-in default path is used.
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_DOCX_PATH;
        List<ReadDto> readDtos = new ArrayList<>();

        // try-with-resources closes both the document and the underlying stream.
        try (FileInputStream fis = new FileInputStream(new File(path));
             XWPFDocument xdoc = new XWPFDocument(fis)) {
            for (XWPFParagraph paragraph : xdoc.getParagraphs()) {
                int level = parseLevel(getTitleLvl(xdoc, paragraph));
                if (level < 0 || level >= MAX_OUTLINE_LEVELS) {
                    // No usable level, or level 9 ("body text"): not a heading.
                    continue;
                }
                ReadDto readDto = new ReadDto();
                readDto.setText(paragraph.getText());
                readDto.setTitleLevel(level);
                readDtos.add(readDto);
            }
        }

        assignPrefixes(readDtos);

        for (ReadDto dto : readDtos) {
            System.out.println("text:" + dto.getPrefix() + dto.getText() + ",level:" + dto.getTitleLevel());
        }
    }
}

ReadDto.java

/**
 * Mutable holder for one heading of a document outline: its generated number prefix,
 * its text and its outline level.
 */
public class ReadDto {

    /** Generated number prefix such as "1.2."; {@code null} until it is set. */
    private String prefix;

    /** @return the number prefix, or {@code null} if none has been set */
    public String getPrefix() {
        return this.prefix;
    }

    /** @param prefix the number prefix to store */
    public void setPrefix(String prefix) {
        this.prefix = prefix;
    }

    /** Text of the heading paragraph; {@code null} until it is set. */
    private String text;

    /** @return the heading text, or {@code null} if none has been set */
    public String getText() {
        return this.text;
    }

    /** @param text the heading text to store */
    public void setText(String text) {
        this.text = text;
    }

    /** Outline level of the heading; 0 until it is set. */
    private int titleLevel;

    /** @return the outline level */
    public int getTitleLevel() {
        return this.titleLevel;
    }

    /** @param titleLevel the outline level to store */
    public void setTitleLevel(int titleLevel) {
        this.titleLevel = titleLevel;
    }
}

输出如下:

text:1.第一章,level:0
text:1.1.模块一,level:1
text:1.1.1.顶顶顶顶,level:2
text:1.2.模块二,level:1
text:1.2.1.功能一,level:2
text:1.2.2.功能二,level:2
text:1.3.模块三,level:1
text:1.3.1.功能一,level:2
text:2.第二章,level:0
text:2.1.模块一一,level:1
text:2.2.模块二二,level:1

目前的实现写得比较丑陋,后续有空看能不能实现一个优雅点且不限制层级的吧>_<


原网址: 访问
创建于: 2023-05-08 16:40:49
目录: default
标签: 无

请先登录后发表评论
  • 最新评论
  • 总共0条评论