使用Java的POI进行Word文档的解析并生成XML格式文档_poi解析xml_runnersun的博客-CSDN博客

    如下代码使用Java的POI解析Word文档并生成XML格式文档。此代码可以编译通过,但运行时存在问题,读者可以亲自试一试,看能否修复其中的bug:

import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.io.OutputStreamWriter;import java.io.Writer;import java.nio.charset.Charset; import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.model.StyleDescription;import org.apache.poi.hwpf.model.StyleSheet;import org.apache.poi.hwpf.usermodel.CharacterRun;import org.apache.poi.hwpf.usermodel.Paragraph;import org.apache.poi.hwpf.usermodel.Range; public final class Word2Forrest {    Writer _out;    HWPFDocument _doc;     @SuppressWarnings("unused")    public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException {        OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8"));        _out = out;        _doc = doc;         init();        openDocument();        openBody();         Range r = doc.getRange();        StyleSheet styleSheet = doc.getStyleSheet();         int sectionLevel = 0;        int lenParagraph = r.numParagraphs();        boolean inCode = false;        for (int x = 0; x < lenParagraph; x++) {            Paragraph p = r.getParagraph(x);             String text = p.text();            if (text.trim().length() == 0) {                continue;            }            StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex());            String styleName = paragraphStyle.getName();            if (styleName.startsWith("Heading")) {                if (inCode) {                    closeSource();                    inCode = false;                }                 int headerLevel = Integer.parseInt(styleName.substring(8));                if (headerLevel > sectionLevel) {                    openSection();                } else {                    for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) {                        closeSection();                    }                    openSection();                }                
sectionLevel = headerLevel;                openTitle();                System.out.println("++++++" + p.text());                writePlainText(text);                closeTitle();            } else {                int cruns = p.numCharacterRuns();                CharacterRun run = p.getCharacterRun(0);                String fontName = run.getFontName();                if (fontName.startsWith("Courier")) {                    if (!inCode) {                        openSource();                        inCode = true;                    }                    System.out.println("------" + p.text());                    writePlainText(p.text());                } else {                    if (inCode) {                        inCode = false;                        closeSource();                    }                    openParagraph();                    System.out.println("******" + p.text());                    writePlainText(p.text());                    closeParagraph();                }            }        }        for (int x = 0; x < sectionLevel; x++) {            closeSection();        }        closeBody();        closeDocument();        _out.flush();     }     public void init() throws IOException {        _out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n");        _out.write(                "<!DOCTYPE document PUBLIC \"-//APACHE//DTD Documentation V1.1//EN\" \"./dtd/document-v11.dtd\">\r\n");    }     public void openDocument() throws IOException {        _out.write("<document>\r\n");    }     public void closeDocument() throws IOException {        _out.write("</document>\r\n");    }     public void openBody() throws IOException {        _out.write("<body>\r\n");    }     public void closeBody() throws IOException {        _out.write("</body>\r\n");    }     public void openSection() throws IOException {        _out.write("<section>");     }     public void closeSection() throws IOException {        _out.write("</section>");     }     public void openTitle() 
throws IOException {        _out.write("<title>");    }     public void closeTitle() throws IOException {        _out.write("</title>");    }     public void writePlainText(String text) throws IOException {        _out.write(text);    }     public void openParagraph() throws IOException {        _out.write("<p>");    }     public void closeParagraph() throws IOException {        _out.write("</p>");    }     public void openSource() throws IOException {        _out.write("<source><![CDATA[");    }     public void closeSource() throws IOException {        _out.write("]]></source>");    }     public static void main(String[] args) throws IOException {        InputStream is = new FileInputStream("D:/QMDownload/hwpftest.doc");        OutputStream out = new FileOutputStream("D:/QMDownload/test.xml");        try {            new Word2Forrest(new HWPFDocument(is), out);        } finally {            out.close();            is.close();        }    }}

原网址: 访问
创建于: 2023-04-24 14:20:11
目录: default
标签: 无

请先登录后发表评论
  • 最新评论
  • 总共0条评论