pdf word xls parser 效率

全部 Hibernate Spring Struts iBATIS 企业应用 Lucene SOA Java综合 Tomcat 设计模式 OO JBoss

浏览 7088 次

锁定老帖子主题：pdf word xls parser 效率精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
作者	正文
roger51 等级: 初级会员性别: 文章: 20 积分: 40	发表时间：2007-07-24 相关推荐: java PDF/Word/Excel文件内容关键字检索 JAVA解析PDF、WORD、EXCEL文档 java poi读取pdf word excel文档，读取pdf文字图片 pdf在线解析html代码,使用tika解析word,xml,html,pdf生成lucene索引 solr直接对pdf、word等建索引更多相关推荐企业应用各位好：在javaeye好长时间了，一直在各大网站学习各位的经验很感谢各位，目前我遇到一个关于lucene索引的问题，在国内和国外的网站上找了很久也没找到一个比较满意的解决办法，所以在这里想问问大家，希望有过这方面的经验的朋友给些帮助，最好能有些比较好的代码或可行性建议，我的代码大概如下 import com.messagesolution.message.viewer.util.HtmlDocument; import com.messagesolution.util.logger.Logger; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.util.PDFTextStripper; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.textmining.text.extraction.WordExtractor; import java.io.*; public class DocumentConverter { public static boolean convertPDF(String fromfile, String tofile) { PDFParser parser = null; String s = null; FileInputStream in = null; FileOutputStream fos = null; //BufferedOutputStream bos = null; DataOutputStream dos = null; try { try { PDFTextStripper _stripper = new PDFTextStripper(); in = new FileInputStream(new File(fromfile)); parser = new PDFParser(in); parser.parse(); s = _stripper.getText(parser.getDocument()); if (StringToolKit.isEmpty(s)){ Logger.getInstance().error("read string of pdf is empty"); return false; //nothing to write } } catch (Exception e) { Logger.getInstance().error("read pdf or convert it error"); e.printStackTrace(); return false; } try { //now write this string to a file fos = new FileOutputStream(new File(tofile)); //bos = new BufferedOutputStream(fos); //bos.write(s.getBytes()); //what about other language? 
dos = new DataOutputStream(fos); dos.writeBytes(s); } catch (Exception e) { Logger.getInstance().error("write converted txt error"); e.printStackTrace(); return false; } } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertPDF for file: " + fromfile, t); System.err.println("Exception occurred in convertPDF, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } finally { try { if (parser != null) parser.getDocument().close(); if (in != null) in.close(); if (fos != null) fos.close(); //if (bos != null) // bos.close(); if (dos != null) dos.close(); } catch (Exception ex) { Logger.getInstance().error(ex.toString()); } } return true; } public static boolean convertDOC(String fromfile, String tofile) { FileInputStream fis = null; FileOutputStream fos = null; DataOutputStream dos = null; try { fis = new FileInputStream (new File(fromfile)); WordExtractor extractor = new WordExtractor(); String s = extractor.extractText(fis); //now write this string to a file fos = new FileOutputStream(new File(tofile)); //bos = new BufferedOutputStream(fos); //bos.write(s.getBytes()); //what about other language? 
dos = new DataOutputStream(fos); dos.writeBytes(s); } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertDOC for file: " + fromfile, t); System.err.println("Exception occurred in convertDOC, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } finally { try { if (fis != null) fis.close(); if (fos != null) fos.close(); if (dos != null) dos.close(); } catch (Exception e) {} } return true; } public static boolean convertHTML(String fromfile, String tofile) { try { String htmlCharset = HtmlDocument.convertHtml(fromfile, tofile); System.out.println("htmlCharset: " + htmlCharset); } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertHTML for file: " + fromfile, t); System.err.println("Exception occurred in convertHTML, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } return true; } public static boolean convertPPT(String fromfile, String tofile) { System.err.println("convertPPT not supported yet!"); Thread.dumpStack(); return false; // return false; } public static boolean convertXLS(String fromfile, String tofile) { StringBuffer sb = new StringBuffer(); FileInputStream fis = null; FileOutputStream fos = null; DataOutputStream dos = null; HSSFWorkbook wb = null; try { fis = new FileInputStream(new File(fromfile)); wb = new HSSFWorkbook(fis); int numSheets = wb.getNumberOfSheets(); for (int i=0;i<numSheets;++i) { HSSFSheet sheet = wb.getSheetAt(i); int numRows = sheet.getLastRowNum(); for (int j=0;j<numRows;++j) { HSSFRow row = sheet.getRow(j); if (row == null) continue; int numCells = row.getLastCellNum(); for (int k=0;k<numCells;++k) { HSSFCell cell = row.getCell((short)k); if(cell!=null) { int type = cell.getCellType(); if(type==HSSFCell.CELL_TYPE_STRING) { String str = cell.getStringCellValue(); str=str.trim(); str=replace(str,"\n",""); 
sb.append(str).append(" "); } } // We will ignore all other types - numbers, forumlas, etc. // as these don't hold alot of meaning outside of their tabular context. // else if(type==, CELL_TYPE_NUMERIC, CELL_TYPE_FORMULA, CELL_TYPE_BOOLEAN, CELL_TYPE_ERROR } // cells //sb.append("\n"); // break on each row } // rows sb.append("\n"); // break on each sheet } // sheets String s = sb.toString(); //now write this string to a file fos = new FileOutputStream(new File(tofile)); //bos = new BufferedOutputStream(fos); //bos.write(s.getBytes()); //what about other language? dos = new DataOutputStream(fos); dos.writeBytes(s); } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertXSL for file: " + fromfile, t); System.err.println("Exception occurred in convertXSL, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } finally { try { if (fis != null) fis.close(); if (fos != null) fos.close(); if (dos != null) dos.close(); } catch (Exception e) {} } return true; } // This should really be made 'static' and moved into a utility class, // included here to simplify things private final static String replace(String line, String oldString, String newString) { if (line == null) { return null; } int i = 0; if ((i = line.indexOf(oldString, i)) >= 0) { char[] line2 = line.toCharArray(); char[] newString2 = newString.toCharArray(); int oLength = oldString.length(); StringBuffer buf = new StringBuffer(line2.length); buf.append(line2, 0, i).append(newString2); i += oLength; int j = i; while ((i = line.indexOf(oldString, i)) > 0) { buf.append(line2, j, i - j).append(newString2); i += oLength; j = i; } buf.append(line2, j, line2.length - j); return buf.toString(); } return line; } public static void main(String[] args) { int index = 0; String action = args[index++]; String f1 = args[index++]; String f2 = args[index++]; long start = System.currentTimeMillis(); long end = 0; if 
(action.equals("pdf")) convertPDF(f1, f2); else if (action.equals("doc")) convertDOC(f1, f2); else if (action.equals("xls")) convertXLS(f1, f2); else if (action.equals("ppt")) convertPPT(f1, f2); else if (action.equals("ppt")) convertHTML(f1, f2); end = System.currentTimeMillis(); System.out.println(action + " convert " + f1 + " took " + ((end-start)/1000) + " seconds."); } } main方法主要是输入三个参数：第一个是转换文档的格式，第二个是文档存放的路径，第三个是要输出的文档存放的位置，然后对输出的文档进行索引, 平均每个文档在1M-5M之间，问题：在进行文档转换的时候pdf，word，xls 都非常慢，本来想写一个threadpool来进行文档的转换，可是测试数据表明多线程转换还不如单线程的快，而且也容易出现outofmemory, 后来我又想了一个办法，把大的pdf ,word xls 进行切分，可是写了一个java的切分成小文档的方法，只能对txt文档进行转换，word 和pdf 因为里面有很多格式和样式的东西都是二进制的，再合成一个大的文档就合并不回去了（c++ 或.net 倒是有办法切分），所以希望有过索引大量pdf ，word，xls 文档的朋友给些帮助，能快速处理，目前的数据量大概是1T(大概是100G)，服务器配置大概是4个cpu ,4G内存，虚拟机开到了1.2个G用的是jdk1.4再大也开不了了，谢谢帮助。 声明：ITeye文章版权属于作者，受法律保护。没有作者书面许可不得转载。推荐链接
返回顶楼

wl95421 等级: 性别: 文章: 352 积分: 693	发表时间：2007-07-24 你先用Profiler这种东东看一下大致的资源开销分布：是内存不足、频繁回收，或者是解析文件太慢，又或者是因为merge的次数太频繁。你还可以尝试一下将文件读入内存，使用RAMDirectory来进行索引，再将结果写入FSDirectory，性能肯定会高很多。
返回顶楼	回帖地址 0 0 请登录后投票

roger51 等级: 初级会员性别: 文章: 20 积分: 40	发表时间：2007-07-24 感谢你的帮助，使用RAMDirectory来进行索引，再将结果写入FSDirectory，性能肯定会高很多，我目前已经是在lucene indexwriter 的时候用了，应该不是这里的问题，merge的基数大概是2000左右，所以问题也不是这里，感觉是开源的jar在处理pdf word ,xls等文件的时候转换的太慢，Profiler我会试试看，不过其他服务程序还是可以运行的不错的，所以内存上感觉问题也不大。
返回顶楼	回帖地址 0 0 请登录后投票

wl95421 等级: 性别: 文章: 352 积分: 693	发表时间：2007-07-24 POI处理Excel的速度不快，你最好用其它的方法。如果你是在Windows下，而且速度要求比较高，可以考虑用Jacob将Excel先转成Html，然后做索引，PDF和Word也是这样处理。特别是PDF，如果用iText处理，非常的慢。我测试过iText输出PDF，400页的文件，约400M内存，而且和用WPS输出PDF的性能有2-3个数量级的差距。
返回顶楼	回帖地址 0 0 请登录后投票

jenkinv 等级: 初级会员性别: 文章: 2 积分: 30 来自: 武汉	发表时间：2007-07-30 我遇到和你一样的问题，，期待好的解决方案。
返回顶楼	回帖地址 0 0 请登录后投票

roger51 等级: 初级会员性别: 文章: 20 积分: 40	发表时间：2007-07-31 我的解决了，就是用我io 效率那个blog的办法，原来是一个10word 文档要16秒左右，现在只需要二秒就够了
返回顶楼	回帖地址 0 0 请登录后投票

nolan022 等级: 初级会员性别: 文章: 10 积分: 0 来自: 青岛	发表时间：2008-07-13 roger51 写道各位好：在javaeye好长时间了，一直在各大网站学习各位的经验很感谢各位，目前我遇到一个关于lucene索引的问题，在国内和国外的网站上找了很久也没找到一个比较满意的解决办法，所以在这里想问问大家，希望有过这方面的经验的朋友给些帮助，最好能有些比较好的代码或可行性建议，我的代码大概如下 import com.messagesolution.message.viewer.util.HtmlDocument; import com.messagesolution.util.logger.Logger; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.util.PDFTextStripper; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.textmining.text.extraction.WordExtractor; import java.io.*; public class DocumentConverter { public static boolean convertPDF(String fromfile, String tofile) { PDFParser parser = null; String s = null; FileInputStream in = null; FileOutputStream fos = null; //BufferedOutputStream bos = null; DataOutputStream dos = null; try { try { PDFTextStripper _stripper = new PDFTextStripper(); in = new FileInputStream(new File(fromfile)); parser = new PDFParser(in); parser.parse(); s = _stripper.getText(parser.getDocument()); if (StringToolKit.isEmpty(s)){ Logger.getInstance().error("read string of pdf is empty"); return false; //nothing to write } } catch (Exception e) { Logger.getInstance().error("read pdf or convert it error"); e.printStackTrace(); return false; } try { //now write this string to a file fos = new FileOutputStream(new File(tofile)); //bos = new BufferedOutputStream(fos); //bos.write(s.getBytes()); //what about other language? 
dos = new DataOutputStream(fos); dos.writeBytes(s); } catch (Exception e) { Logger.getInstance().error("write converted txt error"); e.printStackTrace(); return false; } } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertPDF for file: " + fromfile, t); System.err.println("Exception occurred in convertPDF, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } finally { try { if (parser != null) parser.getDocument().close(); if (in != null) in.close(); if (fos != null) fos.close(); //if (bos != null) // bos.close(); if (dos != null) dos.close(); } catch (Exception ex) { Logger.getInstance().error(ex.toString()); } } return true; } public static boolean convertDOC(String fromfile, String tofile) { FileInputStream fis = null; FileOutputStream fos = null; DataOutputStream dos = null; try { fis = new FileInputStream (new File(fromfile)); WordExtractor extractor = new WordExtractor(); String s = extractor.extractText(fis); //now write this string to a file fos = new FileOutputStream(new File(tofile)); //bos = new BufferedOutputStream(fos); //bos.write(s.getBytes()); //what about other language? 
dos = new DataOutputStream(fos); dos.writeBytes(s); } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertDOC for file: " + fromfile, t); System.err.println("Exception occurred in convertDOC, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } finally { try { if (fis != null) fis.close(); if (fos != null) fos.close(); if (dos != null) dos.close(); } catch (Exception e) {} } return true; } public static boolean convertHTML(String fromfile, String tofile) { try { String htmlCharset = HtmlDocument.convertHtml(fromfile, tofile); System.out.println("htmlCharset: " + htmlCharset); } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertHTML for file: " + fromfile, t); System.err.println("Exception occurred in convertHTML, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } return true; } public static boolean convertPPT(String fromfile, String tofile) { System.err.println("convertPPT not supported yet!"); Thread.dumpStack(); return false; // return false; } public static boolean convertXLS(String fromfile, String tofile) { StringBuffer sb = new StringBuffer(); FileInputStream fis = null; FileOutputStream fos = null; DataOutputStream dos = null; HSSFWorkbook wb = null; try { fis = new FileInputStream(new File(fromfile)); wb = new HSSFWorkbook(fis); int numSheets = wb.getNumberOfSheets(); for (int i=0;i<numSheets;++i) { HSSFSheet sheet = wb.getSheetAt(i); int numRows = sheet.getLastRowNum(); for (int j=0;j<numRows;++j) { HSSFRow row = sheet.getRow(j); if (row == null) continue; int numCells = row.getLastCellNum(); for (int k=0;k<numCells;++k) { HSSFCell cell = row.getCell((short)k); if(cell!=null) { int type = cell.getCellType(); if(type==HSSFCell.CELL_TYPE_STRING) { String str = cell.getStringCellValue(); str=str.trim(); str=replace(str,"\n",""); 
sb.append(str).append(" "); } } // We will ignore all other types - numbers, forumlas, etc. // as these don't hold alot of meaning outside of their tabular context. // else if(type==, CELL_TYPE_NUMERIC, CELL_TYPE_FORMULA, CELL_TYPE_BOOLEAN, CELL_TYPE_ERROR } // cells //sb.append("\n"); // break on each row } // rows sb.append("\n"); // break on each sheet } // sheets String s = sb.toString(); //now write this string to a file fos = new FileOutputStream(new File(tofile)); //bos = new BufferedOutputStream(fos); //bos.write(s.getBytes()); //what about other language? dos = new DataOutputStream(fos); dos.writeBytes(s); } catch (Throwable t) { if (t instanceof OutOfMemoryError) Logger.getInstance().fatal("OutOfMemoryError occurred in convertXSL for file: " + fromfile, t); System.err.println("Exception occurred in convertXSL, t: " + t); t.printStackTrace(); return false; //something wrong during the conversion } finally { try { if (fis != null) fis.close(); if (fos != null) fos.close(); if (dos != null) dos.close(); } catch (Exception e) {} } return true; } // This should really be made 'static' and moved into a utility class, // included here to simplify things private final static String replace(String line, String oldString, String newString) { if (line == null) { return null; } int i = 0; if ((i = line.indexOf(oldString, i)) >= 0) { char[] line2 = line.toCharArray(); char[] newString2 = newString.toCharArray(); int oLength = oldString.length(); StringBuffer buf = new StringBuffer(line2.length); buf.append(line2, 0, i).append(newString2); i += oLength; int j = i; while ((i = line.indexOf(oldString, i)) > 0) { buf.append(line2, j, i - j).append(newString2); i += oLength; j = i; } buf.append(line2, j, line2.length - j); return buf.toString(); } return line; } public static void main(String[] args) { int index = 0; String action = args[index++]; String f1 = args[index++]; String f2 = args[index++]; long start = System.currentTimeMillis(); long end = 0; if 
(action.equals("pdf")) convertPDF(f1, f2); else if (action.equals("doc")) convertDOC(f1, f2); else if (action.equals("xls")) convertXLS(f1, f2); else if (action.equals("ppt")) convertPPT(f1, f2); else if (action.equals("ppt")) convertHTML(f1, f2); end = System.currentTimeMillis(); System.out.println(action + " convert " + f1 + " took " + ((end-start)/1000) + " seconds."); } } main方法主要是输入三个参数：第一个是转换文档的格式，第二个是文档存放的路径，第三个是要输出的文档存放的位置，然后对输出的文档进行索引, 平均每个文档在1M-5M之间，问题：在进行文档转换的时候pdf，word，xls 都非常慢，本来想写一个threadpool来进行文档的转换，可是测试数据表明多线程转换还不如单线程的快，而且也容易出现outofmemory, 后来我又想了一个办法，把大的pdf ,word xls 进行切分，可是写了一个java的切分成小文档的方法，只能对txt文档进行转换，word 和pdf 因为里面有很多格式和样式的东西都是二进制的，再合成一个大的文档就合并不回去了（c++ 或.net 倒是有办法切分），所以希望有过索引大量pdf ，word，xls 文档的朋友给些帮助，能快速处理，目前的数据量大概是1T(大概是100G)，服务器配置大概是4个cpu ,4G内存，虚拟机开到了1.2个G用的是jdk1.4再大也开不了了，谢谢帮助
返回顶楼	回帖地址 2 0 请登录后投票

论坛首页 → Java企业应用版

跳转论坛: