// 所用到的jar 包 可以到 http://download.csdn.net/detail/zhuhongming123/6888019 下载jar文件
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import util.interf.DocumentReader;
/**
 * Utility for extracting plain text from common document formats so the text
 * can be fed to a Lucene full-text index.
 *
 * <p>Most methods report failures with {@code printStackTrace()} and return
 * whatever text was collected so far (usually an empty string); the two
 * Excel cell-walking methods propagate their exceptions instead. Every
 * stream opened by a method is closed before that method returns.
 */
public class IndexReaderUtil implements DocumentReader {

    /**
     * Extracts the text of a PDF file.
     *
     * @param path path of the PDF file
     * @return the trimmed text, or an empty string if reading or parsing failed
     */
    @Override
    public String readerPDF(String path) {
        StringBuilder text = new StringBuilder();
        FileInputStream in = null;
        try {
            System.out.println("此处可以获取到path:" + path);
            in = new FileInputStream(path);
            PDFParser parser = new PDFParser(in);
            System.out.println("此处可以解析pdf文件1");
            parser.parse();
            try {
                System.out.println("此处可以解析pdf文件2");
                PDFTextStripper stripper = new PDFTextStripper();
                System.out.println("此处可以解析pdf文件3");
                text.append(stripper.getText(parser.getPDDocument()));
            } finally {
                // Release the parsed document; closing the input stream alone
                // does not free what the parser allocated for it.
                parser.getDocument().close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return text.toString().trim();
    }

    /**
     * Extracts the text of a Word 97-2003 (.doc) file paragraph by paragraph.
     *
     * @param path path of the .doc file
     * @return the trimmed concatenation of all paragraph texts, or an empty
     *         string if the file could not be read
     */
    @Override
    public String readerWord2003(String path) {
        StringBuilder text = new StringBuilder();
        FileInputStream in = null;
        try {
            System.out.println("此处可以获取到path:" + path);
            in = new FileInputStream(path);
            HWPFDocument doc = new HWPFDocument(in);
            System.out.println("此处可以读取word 1");
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                text.append(paragraph.text());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return text.toString().trim();
    }

    /**
     * Extracts the text of an Excel 97-2003 (.xls) workbook using POI's
     * {@code ExcelExtractor}. Formulas are emitted as formulas (not results)
     * and sheet names are omitted.
     *
     * @param path path of the .xls file
     * @return the trimmed extracted text, or an empty string on failure
     */
    @Override
    public String readerExcel(String path) {
        StringBuilder text = new StringBuilder();
        FileInputStream in = null;
        try {
            in = new FileInputStream(path);
            HSSFWorkbook workbook = new HSSFWorkbook(in);
            ExcelExtractor extractor = new ExcelExtractor(workbook);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            text.append(extractor.getText());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return text.toString().trim();
    }

    /**
     * Reads an HTML file as UTF-8 text. The markup is returned as-is; no tags
     * are stripped.
     *
     * @param path path of the HTML file
     * @return the trimmed file content with lines joined by {@code "\n"}, or
     *         the part read so far if an error occurred
     */
    @Override
    public String readerHtml(String path) {
        StringBuilder text = new StringBuilder();
        FileInputStream in = null;
        BufferedReader reader = null;
        try {
            in = new FileInputStream(new File(path));
            // The charset must match the one declared by the HTML document,
            // otherwise the decoded text is garbled.
            reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            closeQuietly(in);
        }
        return text.toString().trim();
    }

    /**
     * Reads a plain-text file using the platform default charset (that is what
     * {@code FileReader} uses).
     *
     * @param path path of the text file
     * @return the trimmed file content with every line followed by
     *         {@code "\r"}, or the part read so far if an error occurred
     */
    @Override
    public String readerText(String path) {
        StringBuilder text = new StringBuilder();
        FileReader fileReader = null;
        BufferedReader reader = null;
        try {
            fileReader = new FileReader(path);
            reader = new BufferedReader(fileReader);
            String line;
            while ((line = reader.readLine()) != null) {
                // NOTE(review): a bare "\r" separator looks unusual but is kept
                // because callers may rely on the current output.
                text.append(line).append("\r");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            closeQuietly(fileReader);
        }
        return text.toString().trim();
    }

    /**
     * Extracts plain text from a Word 97-2003 (.doc) file using POI's
     * {@code WordExtractor}.
     *
     * @param path path of the .doc file
     * @return the trimmed text, or an empty string if an {@link IOException}
     *         occurred
     */
    public static String extractTextFromDOC(String path) {
        StringBuilder text = new StringBuilder();
        FileInputStream in = null;
        try {
            in = new FileInputStream(path);
            WordExtractor extractor = new WordExtractor(in);
            text.append(extractor.getText());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return text.toString().trim();
    }

    /**
     * Extracts plain text from a Word 2007 (.docx) file.
     *
     * @param fileName path of the .docx file
     * @return the trimmed text, or an empty string on failure
     */
    public static String extractTextFromDOC2007(String fileName) {
        StringBuilder text = new StringBuilder();
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(fileName);
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            text.append(extractor.getText());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (opcPackage != null) {
                // revert() releases the package without saving; close() would
                // write the package back to the file it was opened from.
                opcPackage.revert();
            }
        }
        return text.toString().trim();
    }

    /**
     * Extracts plain text from an Excel 97-2003 (.xls) workbook by visiting
     * every cell of every sheet. Cell values are concatenated without any
     * separator.
     *
     * @param path path of the .xls file
     * @return the concatenated cell values (not trimmed)
     * @throws IOException if the file cannot be opened or read
     */
    public String extractTextFromXLS(String path) throws IOException {
        StringBuilder content = new StringBuilder();
        FileInputStream in = new FileInputStream(path);
        try {
            HSSFWorkbook workbook = new HSSFWorkbook(in);
            for (int s = 0; s < workbook.getNumberOfSheets(); s++) {
                HSSFSheet sheet = workbook.getSheetAt(s);
                if (sheet == null) {
                    continue;
                }
                for (int r = 0; r <= sheet.getLastRowNum(); r++) {
                    HSSFRow row = sheet.getRow(r);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is the last cell index PLUS ONE.
                    int cellEnd = row.getLastCellNum();
                    for (int c = 0; c < cellEnd; c++) {
                        HSSFCell cell = row.getCell(c);
                        if (cell != null) {
                            appendCellText(content, cell);
                        }
                    }
                }
            }
        } finally {
            closeQuietly(in);
        }
        return content.toString();
    }

    /**
     * Extracts plain text from an Excel 2007 (.xlsx) workbook by visiting
     * every cell of every sheet. Cell values are concatenated without any
     * separator.
     *
     * @param path path of the .xlsx file
     * @return the concatenated cell values (not trimmed)
     * @throws Exception if the file cannot be opened or parsed
     */
    public String extractTextFromXLS2007(String path) throws Exception {
        StringBuilder content = new StringBuilder();
        FileInputStream in = new FileInputStream(path);
        try {
            XSSFWorkbook workbook = new XSSFWorkbook(in);
            for (int s = 0; s < workbook.getNumberOfSheets(); s++) {
                XSSFSheet sheet = workbook.getSheetAt(s);
                if (sheet == null) {
                    continue;
                }
                for (int r = 0; r <= sheet.getLastRowNum(); r++) {
                    XSSFRow row = sheet.getRow(r);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is the last cell index PLUS ONE.
                    int cellEnd = row.getLastCellNum();
                    for (int c = 0; c < cellEnd; c++) {
                        XSSFCell cell = row.getCell(c);
                        if (cell != null) {
                            appendCellText(content, cell);
                        }
                    }
                }
            }
        } finally {
            closeQuietly(in);
        }
        return content.toString();
    }

    /**
     * Extracts the text of every text run on every slide of a PowerPoint
     * 97-2003 (.ppt) file.
     *
     * @param path path of the .ppt file
     * @return the concatenated text (not trimmed), or the part collected so
     *         far if an error occurred
     */
    public String readPowerPoint(String path) {
        StringBuilder content = new StringBuilder();
        FileInputStream in = null;
        try {
            in = new FileInputStream(path);
            SlideShow slideShow = new SlideShow(new HSLFSlideShow(in));
            Slide[] slides = slideShow.getSlides();
            for (int i = 0; i < slides.length; i++) {
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    content.append(runs[j].getText());
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return content.toString();
    }

    /**
     * Extracts plain text from an RTF file.
     *
     * @param path path of the RTF file
     * @return the extracted text, or {@code null} if the file could not be
     *         read or parsed
     */
    public String readRtf(String path) {
        String result = null;
        InputStream in = null;
        try {
            DefaultStyledDocument styledDoc = new DefaultStyledDocument();
            in = new FileInputStream(new File(path));
            new RTFEditorKit().read(in, styledDoc, 0);
            String raw = styledDoc.getText(0, styledDoc.getLength());
            // Re-decode the text as GBK so that Chinese content is not garbled.
            result = new String(raw.getBytes("iso8859-1"), "gbk");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result;
    }

    /**
     * Appends the value of an .xls cell. Formula cells contribute their cached
     * result; error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, HSSFCell cell) {
        int type = cell.getCellType();
        if (type == HSSFCell.CELL_TYPE_FORMULA) {
            // Reading a formula cell with the wrong typed getter throws, so
            // dispatch on the type of the cached result instead.
            type = cell.getCachedFormulaResultType();
        }
        if (type == HSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type == HSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type != HSSFCell.CELL_TYPE_ERROR) {
            out.append(cell.getStringCellValue());
        }
    }

    /**
     * Appends the value of an .xlsx cell. Formula cells contribute their
     * cached result; error cells contribute nothing.
     */
    private static void appendCellText(StringBuilder out, XSSFCell cell) {
        int type = cell.getCellType();
        if (type == XSSFCell.CELL_TYPE_FORMULA) {
            // Reading a formula cell with the wrong typed getter throws, so
            // dispatch on the type of the cached result instead.
            type = cell.getCachedFormulaResultType();
        }
        if (type == XSSFCell.CELL_TYPE_BOOLEAN) {
            out.append(cell.getBooleanCellValue());
        } else if (type == XSSFCell.CELL_TYPE_NUMERIC) {
            out.append(cell.getNumericCellValue());
        } else if (type != XSSFCell.CELL_TYPE_ERROR) {
            out.append(cell.getStringCellValue());
        }
    }

    /**
     * Closes a resource if it is non-null, printing (not propagating) any
     * {@link IOException} raised by the close itself.
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}