java pdf

引用 :http://www.iteye.com/post/599330
各位好:在javaeye好长时间了,一直在各大网站学习各位的经验很感谢各位,目前我遇到一个关于lucene索引的问题,在国内和国外的网站上找了很久也没找到一个比较满意的解决办法,所以在这里想问问大家,希望有过这方面的经验的朋友给些帮助,最好能有些比较好的代码或可行性建议,我的代码大概如下



import com.messagesolution.message.viewer.util.HtmlDocument;
import com.messagesolution.util.logger.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.textmining.text.extraction.WordExtractor;

import java.io.*;


/**
 * Command-line helper that extracts the plain text of PDF, Word, Excel and
 * HTML documents and writes it to a text file (e.g. for later indexing).
 *
 * <p>Every {@code convertXXX} method takes the path of the source document
 * and the path of the text file to create, and returns {@code true} only
 * when the conversion completed. Failures are logged and reported through
 * the return value; they are never thrown to the caller.
 *
 * <p>Text files written directly by this class are encoded as
 * {@link #OUTPUT_ENCODING}. The code deliberately sticks to JDK 1.4 language
 * features and APIs.
 */
public class DocumentConverter
{
    /**
     * Charset used for the text files written by this class.
     * An explicit Unicode encoding is required so that non-ASCII text
     * survives the conversion.
     */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /**
     * Extracts the text of a PDF file and writes it to {@code tofile}.
     *
     * @param fromfile path of the PDF document to read
     * @param tofile   path of the text file to create
     * @return {@code true} if text was extracted and written; {@code false}
     *         if the PDF contains no text or any step failed
     */
    public static boolean convertPDF(String fromfile, String tofile)
    {
        PDFParser parser = null;
        FileInputStream in = null;
        String s = null;
        try
        {
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                in = new FileInputStream(new File(fromfile));
                parser = new PDFParser(in);
                parser.parse();
                s = stripper.getText(parser.getDocument());
                if (StringToolKit.isEmpty(s)) {
                    Logger.getInstance().error("read string of pdf is empty");
                    return false; // nothing to write
                }
            } catch (Exception e) {
                Logger.getInstance().error("read pdf or convert it error");
                e.printStackTrace();
                return false;
            }

            try {
                writeText(tofile, s);
            } catch (Exception e) {
                Logger.getInstance().error("write converted txt error");
                e.printStackTrace();
                return false;
            }
        }
        catch (Throwable t)
        {
            // Errors such as OutOfMemoryError are reported instead of
            // propagated so that a single bad document does not abort a batch.
            if (t instanceof OutOfMemoryError) {
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertPDF for file: " + fromfile, t);
            }
            System.err.println("Exception occurred in convertPDF, t: " + t);
            t.printStackTrace();
            return false; // something wrong during the conversion
        }
        finally
        {
            // Each resource is released in its own try block so that a
            // failure while closing the document cannot leak the stream.
            if (parser != null) {
                try {
                    parser.getDocument().close();
                } catch (Exception ex) {
                    Logger.getInstance().error(ex.toString());
                }
            }
            closeInput(in);
        }

        return true;
    }

    /**
     * Extracts the text of a Word document and writes it to {@code tofile}.
     *
     * @param fromfile path of the Word document to read
     * @param tofile   path of the text file to create
     * @return {@code true} on success, {@code false} if any step failed
     */
    public static boolean convertDOC(String fromfile, String tofile)
    {
        FileInputStream fis = null;

        try
        {
            fis = new FileInputStream(new File(fromfile));
            WordExtractor extractor = new WordExtractor();
            String s = extractor.extractText(fis);

            // now write this string to a file
            writeText(tofile, s);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError) {
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertDOC for file: " + fromfile, t);
            }
            System.err.println("Exception occurred in convertDOC, t: " + t);
            t.printStackTrace();
            return false; // something wrong during the conversion
        }
        finally
        {
            closeInput(fis);
        }

        return true;
    }

    /**
     * Converts an HTML file to text by delegating to
     * {@code HtmlDocument.convertHtml} and prints the charset it reports.
     *
     * @param fromfile path of the HTML document to read
     * @param tofile   path of the text file to create
     * @return {@code true} on success, {@code false} if the delegate threw
     */
    public static boolean convertHTML(String fromfile, String tofile)
    {
        try
        {
            String htmlCharset = HtmlDocument.convertHtml(fromfile, tofile);
            System.out.println("htmlCharset: " + htmlCharset);
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError) {
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertHTML for file: " + fromfile, t);
            }
            System.err.println("Exception occurred in convertHTML, t: " + t);
            t.printStackTrace();
            return false; // something wrong during the conversion
        }

        return true;
    }

    /**
     * Placeholder for PowerPoint conversion, which is not implemented.
     * Prints a notice and the current stack trace to help locate the caller.
     *
     * @param fromfile ignored
     * @param tofile   ignored
     * @return always {@code false}
     */
    public static boolean convertPPT(String fromfile, String tofile)
    {
        System.err.println("convertPPT not supported yet!");
        Thread.dumpStack();
        return false;
    }

    /**
     * Extracts the string cells of every sheet of an Excel workbook and
     * writes them to {@code tofile}. Cell values are trimmed, stripped of
     * line feeds and separated by a single space; each sheet ends with a
     * line feed. Numeric, formula, boolean and error cells are skipped
     * because they carry little meaning outside their tabular context.
     *
     * @param fromfile path of the .xls workbook to read
     * @param tofile   path of the text file to create
     * @return {@code true} on success, {@code false} if any step failed
     */
    public static boolean convertXLS(String fromfile, String tofile)
    {
        StringBuffer sb = new StringBuffer();
        FileInputStream fis = null;

        try
        {
            fis = new FileInputStream(new File(fromfile));
            HSSFWorkbook wb = new HSSFWorkbook(fis);

            int numSheets = wb.getNumberOfSheets();
            for (int i = 0; i < numSheets; ++i)
            {
                HSSFSheet sheet = wb.getSheetAt(i);
                // getLastRowNum() is the 0-based index of the last row, so
                // the bound must be inclusive or the last row is dropped.
                int lastRow = sheet.getLastRowNum();
                for (int j = 0; j <= lastRow; ++j)
                {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        continue;
                    }

                    // getLastCellNum() is already "last index plus one".
                    int numCells = row.getLastCellNum();
                    for (int k = 0; k < numCells; ++k)
                    {
                        HSSFCell cell = row.getCell((short) k);
                        if (cell == null || cell.getCellType() != HSSFCell.CELL_TYPE_STRING) {
                            continue;
                        }
                        String str = cell.getStringCellValue();
                        str = str.trim();
                        str = replace(str, "\n", "");
                        sb.append(str).append(" ");
                    } // cells
                } // rows
                sb.append("\n"); // break on each sheet
            } // sheets

            // now write this string to a file
            writeText(tofile, sb.toString());
        }
        catch (Throwable t)
        {
            if (t instanceof OutOfMemoryError) {
                Logger.getInstance().fatal("OutOfMemoryError occurred in convertXLS for file: " + fromfile, t);
            }
            System.err.println("Exception occurred in convertXLS, t: " + t);
            t.printStackTrace();
            return false; // something wrong during the conversion
        }
        finally
        {
            closeInput(fis);
        }

        return true;
    }

    /**
     * Writes {@code text} to {@code tofile}, replacing any existing file.
     *
     * <p>The text is encoded with {@link #OUTPUT_ENCODING}.
     * {@code DataOutputStream.writeBytes} must not be used here: it keeps
     * only the low byte of every character and therefore destroys all
     * non-ASCII text.
     *
     * @param tofile path of the file to create
     * @param text   text to write; {@code null} causes a
     *               {@code NullPointerException}
     * @throws IOException if the file cannot be created or written
     */
    private static void writeText(String tofile, String text) throws IOException
    {
        FileOutputStream fos = new FileOutputStream(new File(tofile));
        try {
            Writer out = new BufferedWriter(new OutputStreamWriter(fos, OUTPUT_ENCODING));
            out.write(text);
            // Flush explicitly so that write errors surface here rather
            // than being lost; afterwards closing fos is sufficient.
            out.flush();
        } finally {
            fos.close();
        }
    }

    /**
     * Closes {@code in} if it is not {@code null}; a failure to close is
     * logged rather than thrown so that it cannot mask the conversion result.
     */
    private static void closeInput(InputStream in)
    {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ex) {
            Logger.getInstance().error(ex.toString());
        }
    }

    /**
     * Replaces every occurrence of {@code oldString} in {@code line} with
     * {@code newString}, without using regular expressions.
     *
     * This would be better placed in a shared utility class; it is included
     * here to keep the converter self-contained.
     *
     * @return the resulting string; {@code line} itself when it contains no
     *         occurrence, or {@code null} when {@code line} is {@code null}
     */
    private final static String replace(String line, String oldString, String newString)
    {
        if (line == null) {
            return null;
        }
        int i = line.indexOf(oldString, 0);
        if (i < 0) {
            return line;
        }

        char[] lineChars = line.toCharArray();
        char[] newChars = newString.toCharArray();
        int oldLength = oldString.length();

        StringBuffer buf = new StringBuffer(lineChars.length);
        buf.append(lineChars, 0, i).append(newChars);
        i += oldLength;
        int j = i; // start of the text not yet copied

        // "> 0" (not ">= 0") also guarantees termination for an empty
        // oldString, where indexOf keeps returning the same position.
        while ((i = line.indexOf(oldString, i)) > 0) {
            buf.append(lineChars, j, i - j).append(newChars);
            i += oldLength;
            j = i;
        }
        buf.append(lineChars, j, lineChars.length - j);
        return buf.toString();
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the format ({@code pdf}, {@code doc},
     *             {@code xls}, {@code ppt} or {@code html}), {@code args[1]}
     *             the source document and {@code args[2]} the text file to
     *             create
     */
    public static void main(String[] args)
    {
        if (args == null || args.length < 3) {
            System.err.println("Usage: DocumentConverter <pdf|doc|xls|ppt|html> <fromfile> <tofile>");
            return;
        }

        String action = args[0];
        String f1 = args[1];
        String f2 = args[2];

        long start = System.currentTimeMillis();
        boolean converted;
        if (action.equals("pdf")) {
            converted = convertPDF(f1, f2);
        } else if (action.equals("doc")) {
            converted = convertDOC(f1, f2);
        } else if (action.equals("xls")) {
            converted = convertXLS(f1, f2);
        } else if (action.equals("ppt")) {
            converted = convertPPT(f1, f2);
        } else if (action.equals("html")) {
            // This branch previously tested "ppt" a second time, which made
            // HTML conversion unreachable.
            converted = convertHTML(f1, f2);
        } else {
            System.err.println("Unknown action: " + action);
            return;
        }
        long end = System.currentTimeMillis();

        if (!converted) {
            System.err.println(action + " convert " + f1 + " failed.");
        }
        System.out.println(action + " convert " + f1 + " took " + ((end - start) / 1000) + " seconds.");
    }

}





main方法主要是输入三个参数 第一个是转换文档的格式,第二个是文档存放的路径,第三个是要输出的文档存放的位置,

然后对输出的文档进行索引, 平均每个文档在1M-5M之间,

问题: 在进行文档转换的时候 pdf、word、xls 都非常慢。本来想写一个 threadpool 来进行文档的转换,可是测试数据表明多线程转换还不如单线程的快,而且也容易出现 OutOfMemory。后来我又想了一个办法,把大的 pdf、word、xls 进行切分,可是写了一个 java 的切分成小文档的方法,只能对 txt 文档进行转换;word 和 pdf 因为里面有很多格式和样式的东西都是二进制的,再合成一个大的文档就合并不回去了(c++ 或 .net 倒是有办法切分)。所以希望有过索引大量 pdf、word、xls 文档的朋友给些帮助,能快速处理。目前的数据量大概是1T(大概是100G),服务器配置大概是4个cpu、4G内存,虚拟机开到了1.2个G,用的是jdk1.4,再大也开不了了,谢谢帮助
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值