apache poi解析word(doc)文档成xml及导出成html_sinat_30110061的博客-CSDN博客

//遇到解析word文档的需求,文档格式不定,在网上看了好多文章,大多是简单应用api解析内容,要不就是需要windows环境,还有个在线编辑的第三方的,不太符合本人需求,目前实现了doc的,docx的后续再说吧。

//本来是想把word解析成xml形式的字符串,于是参考了apache poi中WordToHtmlConverter的部分源码。因为它做了很多样式的解析,本人用不到这些,解析过多反而累赘,所以参考源码并结合dom4j实现了此功能,后来又顺便实现了导出html(只支持一两个样式)。

//注意修改代码中word文档的地址和导出图片的存放地址(getImgUrl)

privatestatic Document _document_ = null;

    static{

            _document_ = DocumentHelper.createDocument();

    }  

publicstaticvoid main(String[] args) {

        File file = new File("C:/Users/css/Desktop/1.doc"); //注意文档地址

        if(file.exists()){

            HWPFDocument doc;

            try {

                doc = new HWPFDocument(new FileInputStream(file));

                Range range = doc.getRange();

                wordToHtml(doc);

//                printRange(doc);

            } catch (FileNotFoundException e) {

                e.printStackTrace();

            } catch (IOException e) {

                e.printStackTrace();

            }

        }

    }

/**

     * word解析并导出html文件

     * @param doc

     */  

privatestaticvoid wordToHtml(HWPFDocument doc){

        Element htmlElement = DocumentHelper.createElement("html");

        _document_.setRootElement(htmlElement);

        Element headElement = DocumentHelper.createElement("head");

        Element charSetElement = DocumentHelper.createElement("meta");

        charSetElement.addAttribute("http-equiv", "Content-Type");

        charSetElement.addAttribute("content", "text/html; charset=UTF-8");

        headElement.add(charSetElement);

        htmlElement.add(headElement);

        Element bodyElement = DocumentHelper.createElement("body");

        Element contentElement = formatRange(doc); //解析word

        bodyElement.add(contentElement);

        htmlElement.add(bodyElement);

        String docString = _document_.asXML();

        writeFile(docString, "C:/Users/css/Desktop/12345.html");

        System._out_.println(docString);

    } 

/**

     * 导出文件

     * @param content

     * @param path

     */

    private static void writeFile(String content, String path) {  

        FileOutputStream fos = null;  

        BufferedWriter bw = null;  

        try {  

            File file = new File(path);  

            fos = new FileOutputStream(file);  

            bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));  

            bw.write(content);  

        } catch (FileNotFoundException fnfe) {  

            fnfe.printStackTrace();  

        } catch (IOException ioe) {  

            ioe.printStackTrace();  

        } finally {  

            try {  

                if (bw != null)  

                    bw.close();  

                if (fos != null)  

                    fos.close();  

            } catch (IOException ie) {  

            }  

        }  

    }  

    /**

     * 解析word文档(包括table表格、图片、text文本内容)

     * @param doc

     * @return

     */

    private static Element formatRange(HWPFDocument doc){

        Element contentEle = DocumentHelper.createElement("div");

        Range range = doc.getRange();

        PicturesTable pt = doc.getPicturesTable();

        int pnum = range.numParagraphs();

        Paragraph paragraph = null;

        String text = null;

        Element divElement = null;

        CharacterRun run = null;

        int numCharacterRuns = 0;

        StringBuilder styleStr = null;

        for(int i=0;i<pnum;i++){

            paragraph = range.getParagraph(i);//段落

            styleStr = new StringBuilder();

            if(paragraph.isInTable()){//是否为table

                Table table = range.getTable(paragraph);

                divElement = formatTable(table);//解析table内容

                i += table.numParagraphs();//跳过table

                i--;

            }else if(paragraph.isInList()){ //目前解析的word模板没用到也就没做

                System._out_.println("handle paragrah list----------------");

            }else{

                text = paragraph.text();

                if(text != null && !"".equals(text)){

                    run = paragraph.getCharacterRun(0);

                    numCharacterRuns = paragraph.numCharacterRuns();

                    if(run != null && numCharacterRuns>0){

                        if(run.text().charAt(0)==0x01 && pt.hasPicture(run)){//图片

                            divElement = formatImg(pt.extractPicture(run, true));//解析图片,创建img节点,并导出图片

                        }else{//文本内容

                            divElement = DocumentHelper.createElement("div");

                            divElement.addText(text);

                            if(run.isBold()){//加粗样式

                                styleStr.append("font-weight:bold;");

                            }

                            if(styleStr.length()>0){

                                divElement.addAttribute("style", styleStr.toString());

                            }

                        }

                    }

                }

            }

            if(divElement != null){//添加段落节点

                contentEle.add(divElement);

            }

        }

        return contentEle;

    }

    /**

     * 解析table表格

     * @param table

     * @return

     */

    private static Element formatTable(Table table){

        Element tableElement = DocumentHelper.createElement("table");

        Element theadElement = DocumentHelper.createElement("thead");

        Element tbodyElement = DocumentHelper.createElement("tbody");

        int[] tableCellEdges = buildTableCellEdgesArray(table); //单元格边界

        int rownum = table.numRows();

        int maxColumns = Integer._MIN_VALUE_;

        for ( int r = 0; r < rownum; r++ ){

            maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );

        }

        Element rowElement = null;

        TableRow row = null;

        int cellnum = 0;

        Element cellElement = null;

        TableCell cell = null;

        int rowSpan = 0;

        int colSpan = 0;

        tableElement.addAttribute("border", "1");

        tableElement.addAttribute("style", "border-spacing:0");

        for(int i=0;i<rownum;i++){

            row = table.getRow(i);

            if(row != null){

                rowElement = DocumentHelper.createElement("tr");

                cellnum = row.numCells();

                int currentEdgeIndex = 0;

                for(int j=0;j<cellnum;j++){

                    cell = row.getCell(j);

                    if(cell.isVerticallyMerged() && !cell.isFirstVerticallyMerged())

                    {

                        currentEdgeIndex += getNumberColumnsSpanned(tableCellEdges, currentEdgeIndex, cell );

                        continue;

                    }

                    if(cell != null){

                        if(row.isTableHeader()){

                            cellElement = DocumentHelper.createElement("th");

                        }else{

                            cellElement = DocumentHelper.createElement("td");

                        }

                        colSpan = getNumberColumnsSpanned( tableCellEdges, currentEdgeIndex, cell );//取得列合并数

                        currentEdgeIndex += colSpan;

                        if ( colSpan == 0 ){

                            continue;

                        }

                        if ( colSpan != 1 ){

                            cellElement.addAttribute( "colspan", String.valueOf( colSpan ) );

                        }

                        rowSpan = getNumberRowsSpanned( table, tableCellEdges, i, j, cell );//取得行合并数

                        if ( rowSpan > 1 ){

                            cellElement.addAttribute( "rowspan", String.valueOf( rowSpan ) );

                        }

                        cellElement.addText(cell.text());

                    }

                    if(cellElement != null){

                        rowElement.add(cellElement);

                    }

                }

            }

            if(row.isTableHeader()){

                theadElement.add(rowElement);

            }else{

                tbodyElement.add(rowElement);

            }

        }

        if(theadElement.hasContent()){

            tableElement.add(theadElement);

        }

        if(tbodyElement.hasContent()){

            tableElement.add(tbodyElement);

        }

        return tableElement;
http://sina.lt/cb6e
http://sina.lt/cb6f
http://sina.lt/cb6g
http://sina.lt/cb6h
http://sina.lt/cb6j
http://sina.lt/cb6k
http://sina.lt/cb6m
http://sina.lt/cb6p
http://sina.lt/cb6q
http://sina.lt/cb6r
http://sina.lt/cb6s
http://sina.lt/cb6t
http://sina.lt/cb6u
http://sina.lt/cb6v
http://sina.lt/cb6w
http://sina.lt/cb6x
http://sina.lt/cb6y
http://sina.lt/cb6z
http://sina.lt/cb62
http://sina.lt/cb63
http://sina.lt/cb64
http://sina.lt/cb65
http://sina.lt/cb66
http://sina.lt/cb67
http://sina.lt/cb68
http://sina.lt/cb69
http://sina.lt/cb6A
http://sina.lt/cb6B
http://sina.lt/cb6C

    }

/**

     * 解析table单元格边界

     * @param table

     * @return

     */

    private static int[] buildTableCellEdgesArray( Table table )

    {

        Set<Integer> edges = new TreeSet<Integer>();

        for ( int r = 0; r < table.numRows(); r++ )

        {

            TableRow tableRow = table.getRow( r );

            for ( int c = 0; c < tableRow.numCells(); c++ )

            {

                TableCell tableCell = tableRow.getCell( c );

                edges.add( Integer.valueOf( tableCell.getLeftEdge() ) );

                edges.add( Integer.valueOf( tableCell.getLeftEdge()

                        + tableCell.getWidth() ) );

            }

        }

        Integer[] sorted = edges.toArray( new Integer[edges.size()] );

        int[] result = new int[sorted.length];

        for ( int i = 0; i < sorted.length; i++ )

        {

            result[i] = sorted[i].intValue();

        }

        return result;

    }

/**

     * 解析table列合并数

     * @param tableCellEdges

     * @param currentEdgeIndex

     * @param tableCell

     * @return

     */

    private static int getNumberColumnsSpanned( int[] tableCellEdges,

            int currentEdgeIndex, TableCell tableCell )

    {

        int nextEdgeIndex = currentEdgeIndex;

        int colSpan = 0;

        int cellRightEdge = tableCell.getLeftEdge() + tableCell.getWidth();

        while ( tableCellEdges[nextEdgeIndex] < cellRightEdge )

        {

            colSpan++;

            nextEdgeIndex++;

        }

        return colSpan;

    }

/**

     * 解析table行合并数

     * @param table

     * @param tableCellEdges

     * @param currentRowIndex

     * @param currentColumnIndex

     * @param tableCell

     * @return

     */

    private static int getNumberRowsSpanned( Table table,

            final int[] tableCellEdges, int currentRowIndex,

            int currentColumnIndex, TableCell tableCell )

    {

        if ( !tableCell.isFirstVerticallyMerged() )

            return 1;

        final int numRows = table.numRows();

        int count = 1;

        for ( int r1 = currentRowIndex + 1; r1 < numRows; r1++ )

        {

            TableRow nextRow = table.getRow( r1 );

            if ( currentColumnIndex >= nextRow.numCells() )

                break;

            // we need to skip row if he don't have cells at all

            boolean hasCells = false;

            int currentEdgeIndex = 0;

            for ( int c = 0; c < nextRow.numCells(); c++ )

            {

                TableCell nextTableCell = nextRow.getCell( c );

                if ( !nextTableCell.isVerticallyMerged()

                        || nextTableCell.isFirstVerticallyMerged() )

                {

                    int colSpan = getNumberColumnsSpanned( tableCellEdges,

                            currentEdgeIndex, nextTableCell );

                    currentEdgeIndex += colSpan;

                    if ( colSpan != 0 )

                    {

                        hasCells = true;

                        break;

                    }

                }

                else

                {

                    currentEdgeIndex += getNumberColumnsSpanned(

                            tableCellEdges, currentEdgeIndex, nextTableCell );

                }

            }

            if ( !hasCells )

                continue;

            TableCell nextCell = nextRow.getCell( currentColumnIndex );

            if ( !nextCell.isVerticallyMerged()

                    || nextCell.isFirstVerticallyMerged() )

                break;

            count++;

        }

        return count;

    } 

/**

     * 解析图片,创建img节点,并导出图片

     * @param picture

     * @return

     */

    private static Element formatImg(Picture picture){

        Element imgElement = null;

        if(picture != null){

            String imgUrl = getImgUrl(picture.suggestFullFileName());

            exportImg(picture, imgUrl);

            imgElement = DocumentHelper.createElement("img");

            imgElement.addAttribute("src", imgUrl);

        }

        return imgElement;

    }

/**

     * 获取图片保存位置

     * @param suggestedName

     * @return

     */

    private static String getImgUrl(String suggestedName){

        return "C:/Users/css/Desktop/"+suggestedName; //注意图片导出地址
    }

/**

     * 导出图片

     * @param picture

     * @param expUrl

     */

    private static void exportImg(Picture picture,String expUrl){

        if(picture != null && expUrl != null && !"".equals(expUrl))

        try {

            picture.writeImageContent(new FileOutputStream(expUrl));

        } catch (IOException e) {

            e.printStackTrace();

        }

    }


原网址: 访问
创建于: 2023-05-08 16:42:40
目录: default
标签: 无

请先登录后发表评论
  • 最新评论
  • 总共0条评论