Java引用POI实现Word转Html方法 -

yanghongxia9

浏览: 116619 次
性别:
来自: 深圳

最近访客更多访客>>

御羽倾城

walsh

cc_liyb

alexlvsh

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

Java引用POI实现Word转Html方法

博客分类：

java
DOC

1.下载POI工具并引用
2.读取整个doc文档，获得该文档的所有字符串。
3.从该字符串中得到标题，把该标题构成一个HTML格式的字符串，如<html><head><title>测试文档</title></head><body>。
4.从该文档中判断是否有表格，如有，把每个表格的开始偏移量，结束偏移量记录下来，同时根据每个表格的行，列读取表格的内容，并构造出表格的HTML字符串。
5.从该字符串的第一个字符开始逐个字符循环，得到字符的字体，字号大小，直到下一个字符的字体，字号不一样时，把这些字符内容构造成一个HTML格式的字符串。
6.如果碰到字符为回车符，制表符，把回车符，制表符构造成HTML格式的字符串。
7.如果碰到字符为图片，读取图片，把图片放在指定路径，再把这一路径的信息构造成HTML字符串，如<img src='c://test//1.jpg'/>。
8.如读取字符串的位置等于表格的开始偏移量时，插入前面已构造出的表格HTML字符串，同时跳过表格的结束偏移量，继续往下循环读取字符。
9.由于以上读取是按字符逐个读取，并且根据字符的变化同时构造出HTML字符串，所以当字符串读取完毕后，即构造出一个完整的HTML字符串。
10.举例
Word文件
HTML文件
11.源代码
WordExcelToHtml.java

01.package com;
02.import java.io.BufferedWriter;
03.import java.io.File;
04.import java.io.FileInputStream;
05.import java.io.FileNotFoundException;
06.import java.io.FileOutputStream;
07.import java.io.IOException;
08.import java.io.OutputStream;
09.import java.io.OutputStreamWriter;
10.
11.import org.apache.poi.hwpf.HWPFDocument;
12.import org.apache.poi.hwpf.model.PicturesTable;
13.import org.apache.poi.hwpf.usermodel.CharacterRun;
14.import org.apache.poi.hwpf.usermodel.Picture;
15.import org.apache.poi.hwpf.usermodel.Range;
16.import org.apache.poi.hwpf.extractor.WordExtractor;
17.import org.apache.poi.hwpf.usermodel.Paragraph;
18.import org.apache.poi.hwpf.usermodel.Table;
19.import org.apache.poi.hwpf.usermodel.TableCell;
20.import org.apache.poi.hwpf.usermodel.TableIterator;
21.import org.apache.poi.hwpf.usermodel.TableRow;
22.
23.
24.public class WordExcelToHtml {
25.
26.    /**
27.     * ASCII code of the carriage-return character.
28.     */
29.    private static final short ENTER_ASCII = 13;
30.
31.    /**
32.     * ASCII code of the space character.
33.     */
34.    private static final short SPACE_ASCII = 32;
35.
36.    /**
37.     * ASCII code of the horizontal-tab character.
38.     */
39.    private static final short TABULATION_ASCII = 9;
40.
41.    public static String htmlText = ""; // HTML document being built; passed to writeFile at the end of getWordAndStyle
42.    public static String htmlTextTbl = ""; // HTML of the table currently being built by readTable
43.    public static int counter=0; // index of the table most recently recorded by readTable (reset to -1 when readTable starts)
44.    public static int beginPosi=0; // start offset of the table most recently read by readTable
45.    public static int endPosi=0; // end offset of the table most recently read by readTable
46.    public static int beginArray[]; // start offset of each table, indexed by table number; allocated in getWordAndStyle
47.    public static int endArray[]; // end offset of each table, indexed by table number; allocated in getWordAndStyle
48.    public static String htmlTextArray[]; // HTML of each table, indexed by table number; allocated in getWordAndStyle
49.    public static boolean tblExist=false; // set to true by readTable once at least one table has been found
50.
51.    public static final String inputFile="c://bb.doc"; // path of the Word document that main passes to getWordAndStyle
52.    public static void main(String argv[])
53.    {
54.        try {
55.            getWordAndStyle(inputFile);
56.        } catch (Exception e) {
57.            // TODO Auto-generated catch block
58.            e.printStackTrace();
59.        }
60.    }
61.
62.    /**
63.     * 读取每个文字样式
64.     *
65.     * @param fileName
66.     * @throws Exception
67.     */
68.
69.
70.    public static void getWordAndStyle(String fileName) throws Exception {
71.        FileInputStream in = new FileInputStream(new File(fileName));
72.        HWPFDocument doc = new HWPFDocument(in);
73.
74.         Range rangetbl = doc.getRange();//得到文档的读取范围
75.         TableIterator it = new TableIterator(rangetbl);
76.         int num=100;
77.
78.
79.         beginArray=new int[num];
80.         endArray=new int[num];
81.         htmlTextArray=new String[num];
82.
83.
84.
85.
86.
87.
88.        // 取得文档中字符的总数
89.        int length = doc.characterLength();
90.        // 创建图片容器
91.        PicturesTable pTable = doc.getPicturesTable();
92.
93.        htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";
94.        // 创建临时字符串,好加以判断一串字符是否存在相同格式
95.
96.         if(it.hasNext())
97.         {
98.             readTable(it,rangetbl);
99.         }
100.
101.         int cur=0;
102.
103.        String tempString = "";
104.        for (int i = 0; i < length - 1; i++) {
105.            // 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
106.            Range range = new Range(i, i + 1, doc);
107.
108.
109.
110.            CharacterRun cr = range.getCharacterRun(0);
111.            //beginArray=new int[num];
112.             //endArray=new int[num];
113.             //htmlTextArray=new String[num];
114.            if(tblExist)
115.            {
116.                if(i==beginArray[cur])
117.                {
118.                    htmlText+=tempString+htmlTextArray[cur];
119.                    tempString="";
120.                    i=endArray[cur]-1;
121.                    cur++;
122.                    continue;
123.                }
124.            }
125.            if (pTable.hasPicture(cr)) {
126.                htmlText += tempString ;
127.                // 读写图片
128.                readPicture(pTable, cr);
129.                tempString = "";
130.            }
131.            else {
132.
133.                Range range2 = new Range(i + 1, i + 2, doc);
134.                // 第二个字符
135.                CharacterRun cr2 = range2.getCharacterRun(0);
136.                char c = cr.text().charAt(0);
137.
138.                System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);
139.
140.                // 判断是否为回车符
141.                if (c == ENTER_ASCII)
142.                    {
143.                    tempString += "<br/>";
144.
145.                    }
146.                // 判断是否为空格符
147.                else if (c == SPACE_ASCII)
148.                    tempString += " ";
149.                // 判断是否为水平制表符
150.                else if (c == TABULATION_ASCII)
151.                    tempString += "    ";
152.                // 比较前后2个字符是否具有相同的格式
153.                boolean flag = compareCharStyle(cr, cr2);
154.                if (flag)
155.                    tempString += cr.text();
156.                else {
157.                    String fontStyle = "<span style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
158.
159.                    if (cr.isBold())
160.                        fontStyle += "font-weight:bold;";
161.                    if (cr.isItalic())
162.                        fontStyle += "font-style:italic;";
163.
164.                    htmlText += fontStyle + "" mce_style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
165.
166.                    if (cr.isBold())
167.                        fontStyle += "font-weight:bold;";
168.                    if (cr.isItalic())
169.                        fontStyle += "font-style:italic;";
170.
171.                    htmlText += fontStyle + "">" + tempString + cr.text() + "</span>";
172.                    tempString = "";
173.                }
174.            }
175.        }
176.
177.        htmlText += tempString+"</body></html>";
178.        writeFile(htmlText);
179.    }
180.
181.    /**
182.     * 读写文档中的表格
183.     *
184.     * @param pTable
185.     * @param cr
186.     * @throws Exception
187.     */
188.    public static void readTable(TableIterator it, Range rangetbl) throws Exception {
189.
190.        htmlTextTbl="";
191.        //迭代文档中的表格
192.
193.        counter=-1;
194.        while (it.hasNext())
195.        {
196.            tblExist=true;
197.             htmlTextTbl="";
198.             Table tb = (Table) it.next();
199.             beginPosi=tb.getStartOffset() ;
200.             endPosi=tb.getEndOffset();
201.
202.             System.out.println("............"+beginPosi+"...."+endPosi);
203.             counter=counter+1;
204.             //迭代行，默认从0开始
205.             beginArray[counter]=beginPosi;
206.             endArray[counter]=endPosi;
207.
208.             htmlTextTbl+="<table border>";
209.             for (int i = 0; i < tb.numRows(); i++) {
210.             TableRow tr = tb.getRow(i);
211.
212.             htmlTextTbl+="<tr>";
213.             //迭代列，默认从0开始
214.             for (int j = 0; j < tr.numCells(); j++) {
215.                 TableCell td = tr.getCell(j);//取得单元格
216.                 int cellWidth=td.getWidth();
217.
218.                 //取得单元格的内容
219.                 for(int k=0;k<td.numParagraphs();k++){
220.                         Paragraph para =td.getParagraph(k);
221.                         String s = para.text().toString().trim();
222.                         if(s=="")
223.                         {
224.                             s=" ";
225.                         }
226.                         System.out.println(s);
227.                         htmlTextTbl += "<td width="+cellWidth+ ">"+s+"</td>";
228.                         System.out.println(i+":"+j+":"+cellWidth+":"+s);
229.                    } //end for
230.                 }   //end for
231.              }   //end for
232.            htmlTextTbl+="</table>" ;
233.            htmlTextArray[counter]=htmlTextTbl;
234.
235.        } //end while
236.    }
237.
238.    /**
239.     * 读写文档中的图片
240.     *
241.     * @param pTable
242.     * @param cr
243.     * @throws Exception
244.     */
245.    public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
246.        // 提取图片
247.        Picture pic = pTable.extractPicture(cr, false);
248.        // 返回POI建议的图片文件名
249.        String afileName = pic.suggestFullFileName();
250.        OutputStream out = new FileOutputStream(new File("c://test" + File.separator + afileName));
251.        pic.writeImageContent(out);
252.        htmlText += "<img src="c://test//" + afileName + "" mce_src="c://test//" + afileName + ""/>";
253.    }
254.
255.    public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2)
256.    {
257.        boolean flag = false;
258.        if (cr1.isBold() == cr2.isBold() && cr1.isItalic() == cr2.isItalic() && cr1.getFontName().equals(cr2.getFontName()) && cr1.getFontSize() == cr2.getFontSize())
259.        {
260.            flag = true;
261.        }
262.        return flag;
263.    }
264.
265.
266.    /**
267.     * 写文件
268.     *
269.     * @param s
270.     */
271.    public static void writeFile(String s) {
272.        FileOutputStream fos = null;
273.        BufferedWriter bw = null;
274.        try {
275.            File file = new File("c://abc.html");
276.            fos = new FileOutputStream(file);
277.            bw = new BufferedWriter(new OutputStreamWriter(fos));
278.            bw.write(s);
279.        } catch (FileNotFoundException fnfe) {
280.            fnfe.printStackTrace();
281.        } catch (IOException ioe) {
282.            ioe.printStackTrace();
283.        } finally {
284.            try {
285.                if (bw != null)
286.                    bw.close();
287.                if (fos != null)
288.                    fos.close();
289.            } catch (IOException ie) {
290.            }
291.        }
292.    }
293.
294.

lib.zip (5.8 MB)
下载次数: 0

分享到：

字节数组处理 | 读取DOC的图片

2013-06-25 16:37
浏览 1163
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Java引用POI实现Word转Html方法

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Java引用POI实现Word转Html方法

评论

发表评论

相关推荐

java 通过反射获取泛型的类型

Java生产压缩包的方法

具有公有地静态final数组域

Java创建目录

字节数组处理

读取DOC的图片

读取DOC的表

iText产生PDF文件

itext itextAsian

生成DOC和PDF文件

计算跨度为90天的开始时间和结束时间

Random

JSONObject学习

Djunit

Java URLConnection 总结

JAVA面试题

HttpSession

JAVA上传文件

JAVA下载文件

Java编程中“为了性能”尽量要做到的一些地方

最近访客更多访客>>