java pdf

最新推荐文章于 2024-09-30 10:36:58 发布

iteye_13715

最新推荐文章于 2024-09-30 10:36:58 发布

阅读量146

点赞数

分类专栏： java 文章标签： Java DOS Apache lucene J#

本文链接：https://blog.csdn.net/iteye_13715/article/details/81600696

版权

java 专栏收录该内容

31 篇文章 0 订阅

订阅专栏

引用 :http://www.iteye.com/post/599330
各位好：在javaeye好长时间了，一直在各大网站学习各位的经验很感谢各位，目前我遇到一个关于lucene索引的问题，在国内和国外的网站上找了很久也没找到一个比较满意的解决办法，所以在这里想问问大家，希望有过这方面的经验的朋友给些帮助，最好能有些比较好的代码或可行性建议，我的代码大概如下




import com.messagesolution.message.viewer.util.HtmlDocument;
import com.messagesolution.util.logger.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.textmining.text.extraction.WordExtractor;

import java.io.*;


/**
 * Converts PDF, Word, Excel and HTML documents into plain-text files.
 *
 * <p>Every <code>convertXXX</code> method takes the path of the source
 * document and the path of the text file to create, and returns
 * <code>true</code> only when the text file was written successfully.
 * Failures are logged and reported through the return value; they are never
 * propagated to the caller as exceptions.</p>
 *
 * <p>Text files produced by this class are encoded as UTF-8 so that
 * non-ASCII content survives the conversion.</p>
 */
public class DocumentConverter
{
    /** Character encoding of every text file written by this class. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /**
     * Extracts the text of a PDF document and writes it to a text file.
     *
     * @param fromfile path of the PDF document to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> if text was extracted and written;
     *         <code>false</code> if the extracted text was empty or any
     *         error occurred
     */
    public static boolean convertPDF(String fromfile, String tofile)
    {
        PDFParser parser = null;
        FileInputStream in = null;
        String s = null;
        try
        {
            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                in = new FileInputStream(new File(fromfile));
                parser = new PDFParser(in);
                parser.parse();
                s = stripper.getText(parser.getDocument());
                if (StringToolKit.isEmpty(s))
                {
                    Logger.getInstance().error("read string of pdf is empty");
                    return false;       //nothing to write
                }
            }
            catch (Exception e)
            {
                Logger.getInstance().error("read pdf or convert it error");
                e.printStackTrace();
                return false;
            }

            try
            {
                writeText(tofile, s);
            }
            catch (Exception e)
            {
                Logger.getInstance().error("write converted txt error");
                e.printStackTrace();
                return false;
            }
        }
        catch (Throwable t)
        {
            reportFailure("convertPDF", fromfile, t);
            return false;   //something wrong during the conversion
        }
        finally
        {
            // Close the parsed document in its own try block: a failure here
            // must not prevent the input stream from being closed below.
            if (parser != null)
            {
                try
                {
                    parser.getDocument().close();
                }
                catch (Exception ex)
                {
                    Logger.getInstance().error(ex.toString());
                }
            }
            closeQuietly(in);
        }

        return true;
    }

    /**
     * Extracts the text of a Word document and writes it to a text file.
     *
     * @param fromfile path of the Word document to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> on success, <code>false</code> if any error
     *         occurred
     */
    public static boolean convertDOC(String fromfile, String tofile)
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(new File(fromfile));
            WordExtractor extractor = new WordExtractor();
            String s = extractor.extractText(fis);

            writeText(tofile, s);
        }
        catch (Throwable t)
        {
            reportFailure("convertDOC", fromfile, t);
            return false;   //something wrong during the conversion
        }
        finally
        {
            closeQuietly(fis);
        }

        return true;
    }

    /**
     * Converts an HTML document to text by delegating to
     * <code>HtmlDocument.convertHtml</code>, and prints the charset it
     * returns to standard output.
     *
     * @param fromfile path of the HTML document to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> on success, <code>false</code> if any error
     *         occurred
     */
    public static boolean convertHTML(String fromfile, String tofile)
    {
        try
        {
            String htmlCharset = HtmlDocument.convertHtml(fromfile, tofile);
            System.out.println("htmlCharset: " + htmlCharset);
        }
        catch (Throwable t)
        {
            reportFailure("convertHTML", fromfile, t);
            return false;   //something wrong during the conversion
        }

        return true;
    }

    /**
     * Placeholder for PowerPoint conversion, which is not implemented.
     * Prints a notice and the current stack trace to standard error.
     *
     * @param fromfile path of the PowerPoint document (unused)
     * @param tofile   path of the text file to create (unused)
     * @return always <code>false</code>
     */
    public static boolean convertPPT(String fromfile, String tofile)
    {
        System.err.println("convertPPT not supported yet!");
        Thread.dumpStack();
        return false;
    }

    /**
     * Extracts the string cells of an Excel workbook and writes them to a
     * text file. Cell values are trimmed, stripped of line feeds and
     * separated by a single space; each sheet ends with a line feed.
     *
     * @param fromfile path of the Excel workbook to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> on success, <code>false</code> if any error
     *         occurred
     */
    public static boolean convertXLS(String fromfile, String tofile)
    {
        StringBuffer sb = new StringBuffer();
        FileInputStream fis = null;

        try
        {
            fis = new FileInputStream(new File(fromfile));
            HSSFWorkbook wb = new HSSFWorkbook(fis);

            int numSheets = wb.getNumberOfSheets();
            for (int i = 0; i < numSheets; ++i)
            {
                HSSFSheet sheet = wb.getSheetAt(i);
                // getLastRowNum() is the 0-based index of the last row, so
                // the bound is inclusive; an exclusive bound drops that row.
                int lastRowIndex = sheet.getLastRowNum();
                for (int j = 0; j <= lastRowIndex; ++j)
                {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null)
                    {
                        continue;
                    }

                    // getLastCellNum() is already "last index plus one".
                    int numCells = row.getLastCellNum();
                    for (int k = 0; k < numCells; ++k)
                    {
                        HSSFCell cell = row.getCell((short) k);
                        // Only string cells are kept: numbers, formulas,
                        // booleans and errors carry little meaning outside
                        // of their tabular context.
                        if (cell != null && cell.getCellType() == HSSFCell.CELL_TYPE_STRING)
                        {
                            String str = cell.getStringCellValue();
                            str = str.trim();
                            str = replace(str, "\n", "");
                            sb.append(str).append(" ");
                        }
                    } // cells
                } // rows
                sb.append("\n"); // break on each sheet
            } // sheets

            writeText(tofile, sb.toString());
        }
        catch (Throwable t)
        {
            reportFailure("convertXLS", fromfile, t);
            return false;   //something wrong during the conversion
        }
        finally
        {
            closeQuietly(fis);
        }

        return true;
    }

    /**
     * Writes <code>text</code> to <code>tofile</code> as UTF-8, replacing any
     * existing file content.
     *
     * <p>A character writer is used instead of
     * <code>DataOutputStream.writeBytes</code>, which keeps only the low
     * eight bits of each character and therefore corrupts non-ASCII text.</p>
     *
     * @param tofile path of the text file to create
     * @param text   text to write
     * @throws IOException if the file cannot be opened, written or closed
     */
    private static void writeText(String tofile, String text) throws IOException
    {
        FileOutputStream fos = new FileOutputStream(new File(tofile));
        Writer writer = null;
        try
        {
            writer = new OutputStreamWriter(fos, OUTPUT_ENCODING);
            writer.write(text);
            writer.flush();
        }
        finally
        {
            // Closing the writer also closes the underlying stream; fall back
            // to the raw stream if the writer could not be created.
            if (writer != null)
            {
                writer.close();
            }
            else
            {
                fos.close();
            }
        }
    }

    /**
     * Closes an input stream, logging (rather than propagating) any failure.
     *
     * @param stream stream to close; may be <code>null</code>
     */
    private static void closeQuietly(InputStream stream)
    {
        if (stream == null)
        {
            return;
        }
        try
        {
            stream.close();
        }
        catch (Exception ex)
        {
            Logger.getInstance().error(ex.toString());
        }
    }

    /**
     * Reports a conversion failure: logs out-of-memory conditions as fatal
     * and prints the error and its stack trace to standard error.
     *
     * @param method   name of the conversion method that failed
     * @param fromfile path of the document being converted
     * @param t        the error that aborted the conversion
     */
    private static void reportFailure(String method, String fromfile, Throwable t)
    {
        if (t instanceof OutOfMemoryError)
        {
            Logger.getInstance().fatal("OutOfMemoryError occurred in " + method + " for file: " + fromfile, t);
        }
        System.err.println("Exception occurred in " + method + ", t: " + t);
        t.printStackTrace();
    }

    /**
     * Replaces every occurrence of <code>oldString</code> in
     * <code>line</code> with <code>newString</code>.
     *
     * <p>Hand-rolled so the class does not depend on newer
     * <code>String</code> methods; it could be moved into a shared utility
     * class.</p>
     *
     * @param line      text to search; may be <code>null</code>
     * @param oldString text to look for
     * @param newString replacement text
     * @return the text with all occurrences replaced, <code>line</code>
     *         itself when there is nothing to replace, or <code>null</code>
     *         when <code>line</code> is <code>null</code>
     */
    private final static String replace(String line, String oldString, String newString)
    {
        if (line == null)
        {
            return null;
        }
        // An empty search string matches everywhere; treat it as "no match".
        if (oldString == null || oldString.length() == 0)
        {
            return line;
        }

        int match = line.indexOf(oldString);
        if (match < 0)
        {
            return line;
        }

        int oldLength = oldString.length();
        StringBuffer buf = new StringBuffer(line.length());
        int copiedUpTo = 0;
        while (match >= 0)
        {
            buf.append(line.substring(copiedUpTo, match)).append(newString);
            copiedUpTo = match + oldLength;
            match = line.indexOf(oldString, copiedUpTo);
        }
        buf.append(line.substring(copiedUpTo));
        return buf.toString();
    }

    /** Prints the command-line usage to standard error. */
    private static void printUsage()
    {
        System.err.println("Usage: DocumentConverter <pdf|doc|xls|ppt|html> <fromfile> <tofile>");
    }

    /**
     * Command-line entry point.
     *
     * @param args three arguments: the document format (<code>pdf</code>,
     *             <code>doc</code>, <code>xls</code>, <code>ppt</code> or
     *             <code>html</code>), the path of the document to convert,
     *             and the path of the text file to create
     */
    public static void main(String[] args)
    {
        if (args == null || args.length < 3)
        {
            printUsage();
            return;
        }

        String action = args[0];
        String f1 = args[1];
        String f2 = args[2];

        long start = System.currentTimeMillis();
        boolean converted;
        if (action.equals("pdf"))
        {
            converted = convertPDF(f1, f2);
        }
        else if (action.equals("doc"))
        {
            converted = convertDOC(f1, f2);
        }
        else if (action.equals("xls"))
        {
            converted = convertXLS(f1, f2);
        }
        else if (action.equals("ppt"))
        {
            converted = convertPPT(f1, f2);
        }
        else if (action.equals("html"))
        {
            converted = convertHTML(f1, f2);
        }
        else
        {
            System.err.println("Unknown document format: " + action);
            printUsage();
            return;
        }

        long end = System.currentTimeMillis();
        System.out.println(action + " convert " + f1 + " took " + ((end-start)/1000) + " seconds.");
        if (!converted)
        {
            System.err.println(action + " convert " + f1 + " failed.");
        }
    }

}

main方法主要是输入三个参数：第一个是转换文档的格式，第二个是文档存放的路径，第三个是要输出的文档存放的位置，

然后对输出的文档进行索引, 平均每个文档在1M-5M之间，

问题：在进行文档转换的时候，pdf、word、xls 都非常慢。本来想写一个 threadpool 来进行文档的转换，可是测试数据表明多线程转换还不如单线程快，而且也容易出现 OutOfMemory。后来我又想了一个办法，把大的 pdf、word、xls 进行切分，可是写了一个 java 的切分成小文档的方法，只能对 txt 文档进行转换；word 和 pdf 因为里面有很多格式和样式的东西都是二进制的，再合成一个大的文档就合并不回去了（c++ 或 .net 倒是有办法切分）。所以希望有过索引大量 pdf、word、xls 文档经验的朋友给些帮助，能快速处理。目前的数据量大概是1T(大概是100G)，服务器配置大概是4个cpu、4G内存，虚拟机开到了1.2个G，用的是jdk1.4，再大也开不了了，谢谢帮助