如下代码使用 Java 的 POI 库解析 Word 文档并生成 XML 格式的文档。此代码可以编译通过,但运行时有问题,读者可以亲自试试,看能否修复其中的 bug:
import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.io.OutputStreamWriter;import java.io.Writer;import java.nio.charset.Charset; import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.model.StyleDescription;import org.apache.poi.hwpf.model.StyleSheet;import org.apache.poi.hwpf.usermodel.CharacterRun;import org.apache.poi.hwpf.usermodel.Paragraph;import org.apache.poi.hwpf.usermodel.Range; public final class Word2Forrest { Writer _out; HWPFDocument _doc; @SuppressWarnings("unused") public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException { OutputStreamWriter out = new OutputStreamWriter(stream, Charset.forName("UTF-8")); _out = out; _doc = doc; init(); openDocument(); openBody(); Range r = doc.getRange(); StyleSheet styleSheet = doc.getStyleSheet(); int sectionLevel = 0; int lenParagraph = r.numParagraphs(); boolean inCode = false; for (int x = 0; x < lenParagraph; x++) { Paragraph p = r.getParagraph(x); String text = p.text(); if (text.trim().length() == 0) { continue; } StyleDescription paragraphStyle = styleSheet.getStyleDescription(p.getStyleIndex()); String styleName = paragraphStyle.getName(); if (styleName.startsWith("Heading")) { if (inCode) { closeSource(); inCode = false; } int headerLevel = Integer.parseInt(styleName.substring(8)); if (headerLevel > sectionLevel) { openSection(); } else { for (int y = 0; y < (sectionLevel - headerLevel) + 1; y++) { closeSection(); } openSection(); } sectionLevel = headerLevel; openTitle(); System.out.println("++++++" + p.text()); writePlainText(text); closeTitle(); } else { int cruns = p.numCharacterRuns(); CharacterRun run = p.getCharacterRun(0); String fontName = run.getFontName(); if (fontName.startsWith("Courier")) { if (!inCode) { openSource(); inCode = true; } System.out.println("------" + p.text()); writePlainText(p.text()); } else { if (inCode) { inCode = false; closeSource(); } 
openParagraph(); System.out.println("******" + p.text()); writePlainText(p.text()); closeParagraph(); } } } for (int x = 0; x < sectionLevel; x++) { closeSection(); } closeBody(); closeDocument(); _out.flush(); } public void init() throws IOException { _out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"); _out.write( "<!DOCTYPE document PUBLIC \"-//APACHE//DTD Documentation V1.1//EN\" \"./dtd/document-v11.dtd\">\r\n"); } public void openDocument() throws IOException { _out.write("<document>\r\n"); } public void closeDocument() throws IOException { _out.write("</document>\r\n"); } public void openBody() throws IOException { _out.write("<body>\r\n"); } public void closeBody() throws IOException { _out.write("</body>\r\n"); } public void openSection() throws IOException { _out.write("<section>"); } public void closeSection() throws IOException { _out.write("</section>"); } public void openTitle() throws IOException { _out.write("<title>"); } public void closeTitle() throws IOException { _out.write("</title>"); } public void writePlainText(String text) throws IOException { _out.write(text); } public void openParagraph() throws IOException { _out.write("<p>"); } public void closeParagraph() throws IOException { _out.write("</p>"); } public void openSource() throws IOException { _out.write("<source><![CDATA["); } public void closeSource() throws IOException { _out.write("]]></source>"); } public static void main(String[] args) throws IOException { InputStream is = new FileInputStream("D:/QMDownload/hwpftest.doc"); OutputStream out = new FileOutputStream("D:/QMDownload/test.xml"); try { new Word2Forrest(new HWPFDocument(is), out); } finally { out.close(); is.close(); } }}
原网址: 访问
创建于: 2023-04-24 14:20:11
目录: default
标签: 无
未标明原创文章均为采集,版权归作者所有,转载无需和我联系,请注明原出处,南摩阿彌陀佛,知识,不只知道,要得到
最新评论