/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package stringtest;
import java.io.*;
import java.io.FileInputStream;
import java.io.File;
import org.apache.poi.hssf.extractor.*;
import org.apache.poi.hssf.usermodel.*;//包含生成Excel文档的各个类.
import org.apache.poi.hwpf.extractor.*;//对word文档进行处理的包
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;//对ppt文档进行处理的包
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;//对pdf文档进行处理的包
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.queryParser.*;//lucene包
/**
 * Small desktop-search demo: walks a folder tree, extracts the text of
 * Excel (.xls), Word (.doc), PowerPoint (.ppt), PDF (.pdf) and plain-text
 * (.txt) files, stores it in a Lucene index and runs a query against it.
 *
 * Every indexed document carries two fields:
 * "content" (the extracted text) and "path" (the absolute file path).
 *
 * @author hp
 */
public class StringTest {

    /** Root folder whose files are indexed by {@link #main(String[])}. */
    public static String INDEX_FILE_PATH = "F://index2";

    /** Folder in which the Lucene index is stored. */
    public static String INDEX_STORE_PATH = "F://store2";

    /**
     * Rebuilds the index at {@link #INDEX_STORE_PATH} from the files below
     * {@link #INDEX_FILE_PATH} and then runs a sample query.
     *
     * @param args the command line arguments (unused)
     * @throws Exception if indexing or searching fails
     */
    public static void main(String[] args) throws Exception {
        StringTest test = new StringTest();
        IndexWriter writer = new IndexWriter(INDEX_STORE_PATH, new StandardAnalyzer(), true);
        try {
            test.writeToIndex(INDEX_FILE_PATH, writer);
        } finally {
            // Always release the writer, even when indexing fails part-way.
            writer.close();
        }
        test.indexSearcher("雒琛");
    }

    /**
     * Recursively adds every supported file below {@code path} to the index
     * and optimizes the index once when the whole tree has been processed.
     * Files with an unsupported (or no) extension are skipped. Extensions are
     * matched case-insensitively.
     *
     * @param path   directory to crawl
     * @param writer open index writer that receives the documents
     * @throws Exception if a directory cannot be listed, a file cannot be
     *                   read, or the index cannot be written
     */
    public void writeToIndex(String path, IndexWriter writer) throws Exception {
        indexDirectory(new File(path), writer);
        // Optimizing is expensive; do it once per crawl instead of once per file.
        writer.optimize();
    }

    /** Indexes all files in {@code folder} and descends into its sub-directories. */
    private void indexDirectory(File folder, IndexWriter writer) throws Exception {
        File[] entries = folder.listFiles();
        if (entries == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IOException("Cannot list directory: " + folder.getAbsolutePath());
        }
        for (int i = 0; i < entries.length; i++) {
            File entry = entries[i];
            if (entry.isDirectory()) {
                indexDirectory(entry, writer);
            } else {
                indexFile(entry, writer);
            }
        }
    }

    /** Adds a single file to the index if its extension is supported; otherwise does nothing. */
    private void indexFile(File file, IndexWriter writer) throws Exception {
        String absolutePath = file.getAbsolutePath();
        // Derive the extension from the file name only, so that dots in parent
        // folder names or earlier in the file name do not confuse the lookup.
        String extension = extensionOf(file.getName());
        if (extension.equalsIgnoreCase("xls")) {
            writer.addDocument(readExcel(absolutePath));
        } else if (extension.equalsIgnoreCase("doc")) {
            writer.addDocument(readDoc(absolutePath));
        } else if (extension.equalsIgnoreCase("ppt")) {
            writer.addDocument(readPpt(absolutePath));
        } else if (extension.equalsIgnoreCase("pdf")) {
            writer.addDocument(readPdf(absolutePath));
        } else if (extension.equalsIgnoreCase("txt")) {
            indexTextFile(file, writer);
        }
    }

    /**
     * Indexes a plain-text file by streaming it into the "content" field.
     * The content is read with the platform default charset and is not stored.
     */
    private void indexTextFile(File file, IndexWriter writer) throws Exception {
        Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        try {
            Document doc = new Document();
            doc.add(new Field("content", reader));
            doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.ANALYZED));
            // The reader is consumed while the document is added, so it may
            // only be closed after addDocument() has returned.
            writer.addDocument(doc);
        } finally {
            reader.close();
        }
    }

    /** Returns the text after the last '.' of {@code fileName}, or "" when there is no dot. */
    private static String extensionOf(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? "" : fileName.substring(dot + 1);
    }

    /** Builds the Lucene document shared by all extracted-text file types. */
    private static Document buildDocument(String content, String path) {
        Document doc = new Document();
        doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("path", path, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Parses {@code s} as a query on the "content" field, searches the index
     * at {@link #INDEX_STORE_PATH} and prints the "path" of every hit to
     * standard output, one per line.
     *
     * @param s query string in Lucene query-parser syntax
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public void indexSearcher(String s) throws Exception {
        QueryParser parser = new QueryParser("content", new StandardAnalyzer());
        Query query = parser.parse(s);
        Searcher searcher = new IndexSearcher(INDEX_STORE_PATH);
        try {
            // Hits loads documents lazily, so it must be consumed before the searcher is closed.
            Hits hits = searcher.search(query);
            for (int i = 0; i < hits.length(); i++) {
                Document d = hits.doc(i);
                String dname = d.get("path");
                System.out.println(dname + " ");
            }
        } finally {
            searcher.close();
        }
    }

    /**
     * Extracts the text of an Excel workbook (formulas rather than their
     * results, without sheet names) and wraps it in a Lucene document.
     *
     * @param xls path of the .xls file
     * @return document with the "content" and "path" fields
     * @throws Exception if the file cannot be read or parsed
     */
    public Document readExcel(String xls) throws Exception {
        String text;
        InputStream in = new FileInputStream(xls);
        try {
            HSSFWorkbook workbook = new HSSFWorkbook(in);
            ExcelExtractor extractor = new ExcelExtractor(workbook);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } finally {
            in.close();
        }
        return buildDocument(text, xls);
    }

    /**
     * Extracts the text of a Word document and wraps it in a Lucene document.
     *
     * @param doc path of the .doc file
     * @return document with the "content" and "path" fields
     * @throws Exception if the file cannot be read or parsed
     */
    public Document readDoc(String doc) throws Exception {
        String text;
        FileInputStream in = new FileInputStream(doc);
        try {
            WordExtractor extractor = new WordExtractor(in);
            text = extractor.getText();
        } finally {
            in.close();
        }
        return buildDocument(text, doc);
    }

    /**
     * Concatenates the text runs of every slide of a PowerPoint presentation
     * (in slide order, without separators) and wraps the result in a Lucene
     * document.
     *
     * @param ppt path of the .ppt file
     * @return document with the "content" and "path" fields
     * @throws Exception if the file cannot be read or parsed
     */
    public Document readPpt(String ppt) throws Exception {
        StringBuffer text = new StringBuffer();
        FileInputStream is = new FileInputStream(ppt);
        try {
            SlideShow ss = new SlideShow(new HSLFSlideShow(is));
            Slide[] slides = ss.getSlides();
            for (int i = 0; i < slides.length; i++) {
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    text.append(runs[j].getText());
                }
            }
        } finally {
            is.close();
        }
        return buildDocument(text.toString(), ppt);
    }

    /**
     * Extracts the text of a PDF file and wraps it in a Lucene document.
     * Extraction is best-effort: if the PDF cannot be read or parsed, the
     * error is reported on standard error and the document is returned with
     * empty "content", so the file is still findable by its "path" field.
     *
     * @param pdf path of the .pdf file
     * @return document with the "content" and "path" fields
     */
    public Document readPdf(String pdf) {
        String result = "";
        FileInputStream is = null;
        PDDocument document = null;
        try {
            is = new FileInputStream(pdf);
            PDFParser parser = new PDFParser(is);
            parser.parse();
            document = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(document);
        } catch (Exception e) {
            // Deliberately not rethrown: one broken PDF must not abort the whole crawl.
            System.err.println("Failed to extract text from PDF: " + pdf);
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (document != null) {
                try {
                    document.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return buildDocument(result, pdf);
    }
}