论坛首页 Java企业应用论坛

pdf word xls parser 效率

浏览 7082 次
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
作者 正文
   发表时间:2007-07-24  

各位好:在javaeye好长时间了,一直在各大网站学习各位的经验很感谢各位,目前我遇到一个关于lucene索引的问题,在国内和国外的网站上找了很久也没找到一个比较满意的解决办法,所以在这里想问问大家,希望有过这方面的经验的朋友给些帮助,最好能有些比较好的代码或可行性建议,我的代码大概如下

import com.messagesolution.message.viewer.util.HtmlDocument;
import com.messagesolution.util.logger.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.textmining.text.extraction.WordExtractor;

import java.io.*;


public class DocumentConverter
{

 public static boolean convertPDF(String fromfile, String tofile)
    {
        PDFParser parser = null;
        String s = null;
        FileInputStream in = null;
        FileOutputStream fos = null;
        //BufferedOutputStream bos = null;
        DataOutputStream dos = null;
        try
        {
            try {
    PDFTextStripper _stripper = new PDFTextStripper();
    in = new FileInputStream(new File(fromfile));
    parser = new PDFParser(in);
    parser.parse();
    s = _stripper.getText(parser.getDocument());
    if (StringToolKit.isEmpty(s)){
     Logger.getInstance().error("read string of pdf is empty");
     
     return false;       //nothing to write
    }
       
   } catch (Exception e) {
    Logger.getInstance().error("read pdf or convert it error");
    e.printStackTrace();
    return false;
   }

            try {
    //now write this string to a file
    fos = new FileOutputStream(new File(tofile));
    //bos = new BufferedOutputStream(fos);
    //bos.write(s.getBytes());  //what about other language?
    dos = new DataOutputStream(fos);
    dos.writeBytes(s);
   } catch (Exception e) {
    Logger.getInstance().error("write converted txt error");
    e.printStackTrace();
    return false;
   }
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertPDF for file: " + fromfile, t);
            System.err.println("Exception occurred in convertPDF, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }
        finally {
            try
            {
                if (parser != null)
                    parser.getDocument().close();
                if (in != null)
                    in.close();
                if (fos != null)
                    fos.close();
                //if (bos != null)
                //    bos.close();
                if (dos != null)
                    dos.close();
            } catch (Exception ex) {
              Logger.getInstance().error(ex.toString());
            }
        }

  return true;
 }

 public static boolean convertDOC(String fromfile, String tofile)
    {
        FileInputStream fis = null;
        FileOutputStream fos = null;
        DataOutputStream dos = null;

        try
        {
            fis = new FileInputStream (new File(fromfile));
            WordExtractor extractor = new WordExtractor();
            String s = extractor.extractText(fis);

            //now write this string to a file
            fos = new FileOutputStream(new File(tofile));
            //bos = new BufferedOutputStream(fos);
            //bos.write(s.getBytes());  //what about other language?
            dos = new DataOutputStream(fos);
            dos.writeBytes(s);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertDOC for file: " + fromfile, t);
            System.err.println("Exception occurred in convertDOC, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }
        finally
        {
            try
            {
                if (fis != null)
                    fis.close();
                if (fos != null)
                    fos.close();
                if (dos != null)
                    dos.close();
            } catch (Exception e) {}
        }

  return true;
 }

 public static boolean convertHTML(String fromfile, String tofile)
    {
        try
        {
            String htmlCharset = HtmlDocument.convertHtml(fromfile, tofile);
            System.out.println("htmlCharset: " + htmlCharset);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertHTML for file: " + fromfile, t);
            System.err.println("Exception occurred in convertHTML, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }

  return true;
 }

 public static boolean convertPPT(String fromfile, String tofile)
    {
        System.err.println("convertPPT not supported yet!");
        Thread.dumpStack();
        return false;
 // return false;
 }

 public static boolean convertXLS(String fromfile, String tofile)
    {
        StringBuffer sb = new StringBuffer();
        FileInputStream fis = null;
        FileOutputStream fos = null;
        DataOutputStream dos = null;
        HSSFWorkbook wb = null;

        try
        {
            fis = new FileInputStream(new File(fromfile));
            wb = new HSSFWorkbook(fis);

            int numSheets = wb.getNumberOfSheets();
            for (int i=0;i<numSheets;++i)
            {
                HSSFSheet sheet = wb.getSheetAt(i);
                int numRows = sheet.getLastRowNum();
                for (int j=0;j<numRows;++j)
                {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null)
                        continue;
                   
                    int numCells = row.getLastCellNum();
                    for (int k=0;k<numCells;++k)
                    {
                        HSSFCell cell = row.getCell((short)k);
                        if(cell!=null)
                        {
                            int type = cell.getCellType();
                            if(type==HSSFCell.CELL_TYPE_STRING)
                            {
                                String str = cell.getStringCellValue();
                                str=str.trim();
                                str=replace(str,"\n","");
                                sb.append(str).append(" ");
                            }
                        }
                        // We will ignore all other types - numbers, forumlas, etc.
                        // as these don't hold alot of meaning outside of their tabular context.
                        // else if(type==, CELL_TYPE_NUMERIC, CELL_TYPE_FORMULA, CELL_TYPE_BOOLEAN, CELL_TYPE_ERROR
                    } // cells
                    //sb.append("\n"); // break on each row
                } // rows
                sb.append("\n"); // break on each sheet
            } // sheets

            String s = sb.toString();
            //now write this string to a file
            fos = new FileOutputStream(new File(tofile));
            //bos = new BufferedOutputStream(fos);
            //bos.write(s.getBytes());  //what about other language?
            dos = new DataOutputStream(fos);
            dos.writeBytes(s);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertXSL for file: " + fromfile, t);
            System.err.println("Exception occurred in convertXSL, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }
        finally
        {
            try
            {
                if (fis != null)
                    fis.close();
                if (fos != null)
                    fos.close();
                if (dos != null)
                    dos.close();
            } catch (Exception e) {}
        }

  return true;
 }


    // This should really be made 'static' and moved into a utility class,
 // included here to simplify things
    private final static String replace(String line, String oldString, String newString)
    {
        if (line == null) {
            return null;
        }
        int i = 0;
        if ((i = line.indexOf(oldString, i)) >= 0) {
            char[] line2 = line.toCharArray(); char[] newString2 = newString.toCharArray(); int oLength = oldString.length();
            StringBuffer buf = new StringBuffer(line2.length); buf.append(line2, 0, i).append(newString2); i += oLength;
            int j = i;
            while ((i = line.indexOf(oldString, i)) > 0) {
                buf.append(line2, j, i - j).append(newString2); i += oLength; j = i;
            }
            buf.append(line2, j, line2.length - j); return buf.toString();
        }
        return line;
    }

    public static void main(String[] args)
    {
        int index = 0;
        String action = args[index++];
        String f1 = args[index++];
        String f2 = args[index++];

        long start = System.currentTimeMillis();
        long end = 0;
        if (action.equals("pdf"))
            convertPDF(f1, f2);
        else if (action.equals("doc"))
            convertDOC(f1, f2);
        else if (action.equals("xls"))
            convertXLS(f1, f2);
        else if (action.equals("ppt"))
            convertPPT(f1, f2);
        else if (action.equals("ppt"))
            convertHTML(f1, f2);

        end = System.currentTimeMillis();
        System.out.println(action + " convert " + f1 + " took " + ((end-start)/1000) + " seconds.");
    }

}

main方法主要是输入三个参数 第一个是转换文档的格式,第二个是文档存放的路径,第三个是要输出的文档存放的位置,

然后对输出的文档进行索引, 平均每个文档在1M-5M之间,

问题: 在进行文档转换的时候pdf,word,xls 都非常慢,本来想写一个threadpool来进行文档的转换,可是测试数据表明多线程转换还不如单线程的快,而且也容易出现outofmemory, 后来我又想了一个办法,把大的pdf ,word xls 进行切分,可是写了一个java的切分成小文档的方法,只能对txt文档进行转换,word 和pdf 因为里面有很多格式和样式的东西都是二进制的,再合成一个大的文档就合并不回去了(c++ 或.net 倒是有办法切分),所以希望有过索引大量pdf ,word,xls 文档的朋友给些帮助,能快速处理, 目前的数据量大概是1T(大概是100G),服务器配置大概是4个cpu ,4G内存,虚拟机开到了1.2个G用的是jdk1.4再大也开不了了,谢谢帮助

   发表时间:2007-07-24  
你先用Profiler这种东东看一下大致的资源开销分布
是内存不足,频繁回收,或者是解析文件太慢
又或者是因为merge的次数太频繁

你还可以尝试一下将文件读入内存,使用RamDirectory来进行索引,再将结果写入FSDirectory,性能肯定会高很多
0 请登录后投票
   发表时间:2007-07-24  

感谢你的帮助,
使用RamDirectory来进行索引,再将结果写入FSDirectory,性能肯定会高很多,我目前已经是在lucene indexwriter 的时候用了,应该不是这里的问题,merge的基数大概是2000左右,所以问题也不是这里,感觉是开源的jar在处理pdf word ,xls等文件的时候转换的太慢,profiler我会试试看,不过其他服务程序还是可以运行的不错的,所以内存上感觉问题也不大,
0 请登录后投票
   发表时间:2007-07-24  
POI处理Excel的速度不快
你最好用其它的方法
如果你是在Windows下
而且速度要求比较高
可以考虑用Jacob将Excel先转成Html
然后做索引

PDF和Word也是这样处理
特别是PDF,如果用iText处理,非常的慢
我测试过iText输出PDF,400页的文件,约400M内存,而且和用WPS输出PDF的性能有2-3个数量级的差距
0 请登录后投票
   发表时间:2007-07-30  
我遇到和你一样的问题,,期待好的解决方案。
0 请登录后投票
   发表时间:2007-07-31  
我的解决了,就是用我io 效率那个blog的办法,原来是一个10word 文档要16秒左右,现在只需要二秒就够了
0 请登录后投票
   发表时间:2008-07-13  
roger51 写道

各位好:在javaeye好长时间了,一直在各大网站学习各位的经验很感谢各位,目前我遇到一个关于lucene索引的问题,在国内和国外的网站上找了很久也没找到一个比较满意的解决办法,所以在这里想问问大家,希望有过这方面的经验的朋友给些帮助,最好能有些比较好的代码或可行性建议,我的代码大概如下

import com.messagesolution.message.viewer.util.HtmlDocument;
import com.messagesolution.util.logger.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.textmining.text.extraction.WordExtractor;

import java.io.*;


public class DocumentConverter
{

 public static boolean convertPDF(String fromfile, String tofile)
    {
        PDFParser parser = null;
        String s = null;
        FileInputStream in = null;
        FileOutputStream fos = null;
        //BufferedOutputStream bos = null;
        DataOutputStream dos = null;
        try
        {
            try {
    PDFTextStripper _stripper = new PDFTextStripper();
    in = new FileInputStream(new File(fromfile));
    parser = new PDFParser(in);
    parser.parse();
    s = _stripper.getText(parser.getDocument());
    if (StringToolKit.isEmpty(s)){
     Logger.getInstance().error("read string of pdf is empty");
     
     return false;       //nothing to write
    }
       
   } catch (Exception e) {
    Logger.getInstance().error("read pdf or convert it error");
    e.printStackTrace();
    return false;
   }

            try {
    //now write this string to a file
    fos = new FileOutputStream(new File(tofile));
    //bos = new BufferedOutputStream(fos);
    //bos.write(s.getBytes());  //what about other language?
    dos = new DataOutputStream(fos);
    dos.writeBytes(s);
   } catch (Exception e) {
    Logger.getInstance().error("write converted txt error");
    e.printStackTrace();
    return false;
   }
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertPDF for file: " + fromfile, t);
            System.err.println("Exception occurred in convertPDF, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }
        finally {
            try
            {
                if (parser != null)
                    parser.getDocument().close();
                if (in != null)
                    in.close();
                if (fos != null)
                    fos.close();
                //if (bos != null)
                //    bos.close();
                if (dos != null)
                    dos.close();
            } catch (Exception ex) {
              Logger.getInstance().error(ex.toString());
            }
        }

  return true;
 }

 public static boolean convertDOC(String fromfile, String tofile)
    {
        FileInputStream fis = null;
        FileOutputStream fos = null;
        DataOutputStream dos = null;

        try
        {
            fis = new FileInputStream (new File(fromfile));
            WordExtractor extractor = new WordExtractor();
            String s = extractor.extractText(fis);

            //now write this string to a file
            fos = new FileOutputStream(new File(tofile));
            //bos = new BufferedOutputStream(fos);
            //bos.write(s.getBytes());  //what about other language?
            dos = new DataOutputStream(fos);
            dos.writeBytes(s);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertDOC for file: " + fromfile, t);
            System.err.println("Exception occurred in convertDOC, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }
        finally
        {
            try
            {
                if (fis != null)
                    fis.close();
                if (fos != null)
                    fos.close();
                if (dos != null)
                    dos.close();
            } catch (Exception e) {}
        }

  return true;
 }

 public static boolean convertHTML(String fromfile, String tofile)
    {
        try
        {
            String htmlCharset = HtmlDocument.convertHtml(fromfile, tofile);
            System.out.println("htmlCharset: " + htmlCharset);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertHTML for file: " + fromfile, t);
            System.err.println("Exception occurred in convertHTML, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }

  return true;
 }

 public static boolean convertPPT(String fromfile, String tofile)
    {
        System.err.println("convertPPT not supported yet!");
        Thread.dumpStack();
        return false;
 // return false;
 }

 public static boolean convertXLS(String fromfile, String tofile)
    {
        StringBuffer sb = new StringBuffer();
        FileInputStream fis = null;
        FileOutputStream fos = null;
        DataOutputStream dos = null;
        HSSFWorkbook wb = null;

        try
        {
            fis = new FileInputStream(new File(fromfile));
            wb = new HSSFWorkbook(fis);

            int numSheets = wb.getNumberOfSheets();
            for (int i=0;i<numSheets;++i)
            {
                HSSFSheet sheet = wb.getSheetAt(i);
                int numRows = sheet.getLastRowNum();
                for (int j=0;j<numRows;++j)
                {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null)
                        continue;
                   
                    int numCells = row.getLastCellNum();
                    for (int k=0;k<numCells;++k)
                    {
                        HSSFCell cell = row.getCell((short)k);
                        if(cell!=null)
                        {
                            int type = cell.getCellType();
                            if(type==HSSFCell.CELL_TYPE_STRING)
                            {
                                String str = cell.getStringCellValue();
                                str=str.trim();
                                str=replace(str,"\n","");
                                sb.append(str).append(" ");
                            }
                        }
                        // We will ignore all other types - numbers, forumlas, etc.
                        // as these don't hold alot of meaning outside of their tabular context.
                        // else if(type==, CELL_TYPE_NUMERIC, CELL_TYPE_FORMULA, CELL_TYPE_BOOLEAN, CELL_TYPE_ERROR
                    } // cells
                    //sb.append("\n"); // break on each row
                } // rows
                sb.append("\n"); // break on each sheet
            } // sheets

            String s = sb.toString();
            //now write this string to a file
            fos = new FileOutputStream(new File(tofile));
            //bos = new BufferedOutputStream(fos);
            //bos.write(s.getBytes());  //what about other language?
            dos = new DataOutputStream(fos);
            dos.writeBytes(s);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError)
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertXSL for file: " + fromfile, t);
            System.err.println("Exception occurred in convertXSL, t: " + t);
            t.printStackTrace();
            return false;   //something wrong during the conversion
        }
        finally
        {
            try
            {
                if (fis != null)
                    fis.close();
                if (fos != null)
                    fos.close();
                if (dos != null)
                    dos.close();
            } catch (Exception e) {}
        }

  return true;
 }


    // This should really be made 'static' and moved into a utility class,
 // included here to simplify things
    private final static String replace(String line, String oldString, String newString)
    {
        if (line == null) {
            return null;
        }
        int i = 0;
        if ((i = line.indexOf(oldString, i)) >= 0) {
            char[] line2 = line.toCharArray(); char[] newString2 = newString.toCharArray(); int oLength = oldString.length();
            StringBuffer buf = new StringBuffer(line2.length); buf.append(line2, 0, i).append(newString2); i += oLength;
            int j = i;
            while ((i = line.indexOf(oldString, i)) > 0) {
                buf.append(line2, j, i - j).append(newString2); i += oLength; j = i;
            }
            buf.append(line2, j, line2.length - j); return buf.toString();
        }
        return line;
    }

    public static void main(String[] args)
    {
        int index = 0;
        String action = args[index++];
        String f1 = args[index++];
        String f2 = args[index++];

        long start = System.currentTimeMillis();
        long end = 0;
        if (action.equals("pdf"))
            convertPDF(f1, f2);
        else if (action.equals("doc"))
            convertDOC(f1, f2);
        else if (action.equals("xls"))
            convertXLS(f1, f2);
        else if (action.equals("ppt"))
            convertPPT(f1, f2);
        else if (action.equals("ppt"))
            convertHTML(f1, f2);

        end = System.currentTimeMillis();
        System.out.println(action + " convert " + f1 + " took " + ((end-start)/1000) + " seconds.");
    }

}

main方法主要是输入三个参数 第一个是转换文档的格式,第二个是文档存放的路径,第三个是要输出的文档存放的位置,

然后对输出的文档进行索引, 平均每个文档在1M-5M之间,

问题: 在进行文档转换的时候pdf,word,xls 都非常慢,本来想写一个threadpool来进行文档的转换,可是测试数据表明多线程转换还不如单线程的快,而且也容易出现outofmemory, 后来我又想了一个办法,把大的pdf ,word xls 进行切分,可是写了一个java的切分成小文档的方法,只能对txt文档进行转换,word 和pdf 因为里面有很多格式和样式的东西都是二进制的,再合成一个大的文档就合并不回去了(c++ 或.net 倒是有办法切分),所以希望有过索引大量pdf ,word,xls 文档的朋友给些帮助,能快速处理, 目前的数据量大概是1T(大概是100G),服务器配置大概是4个cpu ,4G内存,虚拟机开到了1.2个G用的是jdk1.4再大也开不了了,谢谢帮助

 

2 请登录后投票
论坛首页 Java企业应用版

跳转论坛:
Global site tag (gtag.js) - Google Analytics