// Purpose: convert Word, Excel, text, PDF and similar files into strings.
package lucene.service;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
 * Static helpers that extract the text content of legacy Office documents
 * (.doc, .ppt, .xls) and PDF files as a plain {@code String}.
 *
 * <p>Every method opens the given file itself and closes the stream (and, for
 * PDF, the parsed document) before returning, whether extraction succeeded
 * or failed.
 */
public class File2StringUtils {

    /**
     * Extracts the text of a Word (.doc) document.
     *
     * @param file the Word file to read
     * @return the text reported by {@code WordExtractor.getText()}
     * @throws IOException if the file cannot be opened or read
     */
    public static String wordTranslation(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            WordExtractor wordExtractor = new WordExtractor(in);
            return wordExtractor.getText();
        } finally {
            in.close();
        }
    }

    /**
     * Extracts the text of a PowerPoint (.ppt) presentation.
     *
     * <p>For each slide, the text of every text run is appended, followed by
     * the slide title when the slide has one. No separator is inserted
     * between the pieces.
     *
     * @param file the PowerPoint file to read
     * @return the concatenated text of all slides
     * @throws IOException if the file cannot be opened or read
     */
    public static String powerPointTranslation(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            StringBuilder sb = new StringBuilder();
            SlideShow ss = new SlideShow(new HSLFSlideShow(in));
            Slide[] slides = ss.getSlides();
            for (int i = 0; i < slides.length; i++) {
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    sb.append(runs[j].getText());
                }
                // A slide without a title yields null; appending it directly
                // would add the literal text "null" to the output.
                String title = slides[i].getTitle();
                if (title != null) {
                    sb.append(title);
                }
            }
            return sb.toString();
        } finally {
            in.close();
        }
    }

    /**
     * Extracts the cell contents of an Excel (.xls) workbook.
     *
     * <p>All sheets are visited row by row; each existing cell's string
     * representation is appended followed by a single space. Missing rows and
     * missing cells are skipped.
     *
     * <p>This method is deliberately best-effort: it never throws. If the file
     * cannot be opened or parsing fails part-way, whatever text was collected
     * up to that point (possibly the empty string) is returned.
     *
     * @param file the Excel file to read
     * @return the space-separated cell text collected from the workbook
     */
    public static String excelTranlation(File file) {
        StringBuilder sb = new StringBuilder();
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            HSSFWorkbook workbook = new HSSFWorkbook(in);
            int numberOfSheets = workbook.getNumberOfSheets();
            for (int i = 0; i < numberOfSheets; i++) {
                HSSFSheet sheet = workbook.getSheetAt(i);
                // getLastRowNum() is the 0-based index of the last row, so the
                // bound must be inclusive or the final row is lost.
                int lastRowIndex = sheet.getLastRowNum();
                for (int j = 0; j <= lastRowIndex; j++) {
                    HSSFRow row = sheet.getRow(j);
                    if (row == null) {
                        continue; // row was never created in the sheet
                    }
                    // getLastCellNum() is the last cell index plus one (or -1
                    // for a row without cells), so an exclusive bound is right.
                    int cellCount = row.getLastCellNum();
                    for (int k = 0; k < cellCount; k++) {
                        HSSFCell cell = row.getCell(k);
                        if (cell == null) {
                            continue; // gap between populated cells
                        }
                        // toString() covers every cell type; getStringCellValue()
                        // fails on non-string cells such as numbers.
                        sb.append(cell.toString()).append(' ');
                    }
                }
            }
        } catch (Exception ignored) {
            // Best-effort contract kept from the original: callers receive the
            // text gathered so far instead of an exception.
        } finally {
            closeQuietly(in);
        }
        return sb.toString();
    }

    /**
     * Extracts the text of a PDF document.
     *
     * @param file the PDF file to read
     * @return the text produced by {@code PDFTextStripper}
     * @throws IOException if the file cannot be opened, parsed or read
     */
    public static String pdfTranslation(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            PDFParser parser = new PDFParser(in);
            parser.parse();
            PDDocument doc = parser.getPDDocument();
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            } finally {
                // Release the resources held by the parsed document.
                doc.close();
            }
        } finally {
            in.close();
        }
    }

    /** Closes the stream if it was opened, ignoring any failure on close. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close here.
        }
    }

    /**
     * Manual smoke test: prints the extracted text of one file.
     *
     * <p>The file is taken from the first command-line argument, falling back
     * to {@code D:\test\pdf.pdf} when none is given. The extractor is chosen
     * by file extension (.doc, .ppt, .xls); anything else is treated as PDF.
     *
     * @param args optional single element: path of the file to extract
     * @throws IOException if the selected extractor fails to read the file
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "D:\\test\\pdf.pdf";
        File file = new File(path);
        String lowerName = file.getName().toLowerCase(java.util.Locale.ROOT);

        String text;
        if (lowerName.endsWith(".doc")) {
            text = wordTranslation(file);
        } else if (lowerName.endsWith(".ppt")) {
            text = powerPointTranslation(file);
        } else if (lowerName.endsWith(".xls")) {
            text = excelTranlation(file);
        } else {
            text = pdfTranslation(file);
        }
        System.out.println(text);
    }
}