package luch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
 * Small Lucene 3.0 demo: indexes every {@code .txt} file found in a fixed
 * directory and then runs a fixed keyword query against the resulting index,
 * printing the stored fields of each hit.
 */
public class DoSearch {

    /** Location of the Lucene index files. */
    private static final String INDEX_PATH = "d://index";

    /** Location of the text documents to be indexed. */
    private static final String CONTENT_PATH = "d://content";

    /** Charset used to decode the text documents. */
    private static final String CONTENT_CHARSET = "GBK";

    /** Stored, non-analyzed field holding the file name. */
    private static final String FIELD_NAME = "name";

    /** Stored, non-analyzed field holding the file path. */
    private static final String FIELD_PATH = "path";

    /** Stored, analyzed field holding the file text; this is the field that is searched. */
    private static final String FIELD_CONTENT = "content";

    /** Keyword the demo searches for. */
    private static final String QUERY_STRING = "difficult";

    /** Maximum number of hits requested from the searcher. */
    private static final int MAX_HITS = 100;

    /**
     * Rebuilds the index, then searches the content field for the demo keyword
     * and prints every hit.
     *
     * @param args ignored
     * @throws Exception if building the index or reading from it fails
     */
    public static void main(String[] args) throws Exception {
        createIndex();

        // Open the directory that contains the index.
        Directory dir = new SimpleFSDirectory(new File(INDEX_PATH));
        try {
            IndexSearcher searcher = new IndexSearcher(dir);
            try {
                // QueryParser arguments: Lucene version, default field to search,
                // analyzer used to tokenize the query text.
                QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                        FIELD_CONTENT, new StandardAnalyzer(Version.LUCENE_30));
                // Turn the keyword into a Query object.
                Query query = queryParser.parse(QUERY_STRING);
                // TopDocs.scoreDocs holds the ids of the matching documents.
                TopDocs hits = searcher.search(query, MAX_HITS);
                // hits.totalHits is the total number of matching documents.
                System.out.println("索引个数:" + searcher.maxDoc() + "个 包含关键字的有:" + hits.totalHits + "个");
                // Load each matching Document by id and print its stored fields.
                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = searcher.doc(scoreDoc.doc);
                    System.out.println("文件名为:" + doc.get(FIELD_NAME));
                    System.out.println("内容为:" + doc.get(FIELD_CONTENT));
                    System.out.println("路径为:" + doc.get(FIELD_PATH));
                }
            } catch (ParseException e) {
                // A malformed query is reported but does not abort the program.
                e.printStackTrace();
            } finally {
                // Always release the searcher, including on parse or I/O failure.
                searcher.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Creates a fresh index from every regular {@code .txt} file directly inside
     * the documents directory and prints how long the build took.
     *
     * @throws Exception if the documents directory cannot be listed, a file
     *         cannot be read, or the index cannot be written
     */
    private static void createIndex() throws Exception {
        long startMillis = System.currentTimeMillis();

        // List the source documents before touching the index, so a missing
        // directory fails with a clear message and leaves any existing index intact.
        File docDir = new File(CONTENT_PATH);
        File[] textFiles = docDir.listFiles();
        if (textFiles == null) {
            throw new IOException("Cannot list documents directory: " + docDir.getPath());
        }

        Directory indexDirectory = FSDirectory.open(new File(INDEX_PATH));
        try {
            // create == true: any existing index at this location is replaced.
            IndexWriter writer = new IndexWriter(indexDirectory,
                    new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.LIMITED);
            try {
                for (File textFile : textFiles) {
                    if (textFile.isFile() && textFile.getName().endsWith(".txt")) {
                        // Append one index record per text file.
                        writer.addDocument(buildDocument(textFile));
                    }
                }
                writer.optimize();
            } finally {
                // Closing flushes the index to disk and releases the write lock,
                // even when indexing failed part-way through.
                writer.close();
            }
        } finally {
            indexDirectory.close();
        }

        System.out.println("建立索引用时:" + (System.currentTimeMillis() - startMillis) + "毫秒");
    }

    /**
     * Builds the Lucene document for one text file.
     *
     * @param textFile the file to index
     * @return a document with stored name, path and content fields
     * @throws IOException if the file cannot be read
     */
    private static Document buildDocument(File textFile) throws IOException {
        Document doc = new Document();
        // File name and path: stored and searchable as single, un-tokenized terms.
        doc.add(new Field(FIELD_NAME, textFile.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(FIELD_PATH, textFile.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        // The content is read into a String (rather than handed over as a Reader,
        // which would index without storing) so it can be printed with each hit.
        // It must be ANALYZED: with NOT_ANALYZED the whole text is one term and
        // keyword queries never match.
        doc.add(new Field(FIELD_CONTENT, readContent(textFile), Field.Store.YES,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
        return doc;
    }

    /**
     * Reads a whole text file, decoding it with {@link #CONTENT_CHARSET}.
     * Each line is terminated with {@code '\n'} so that the last word of one
     * line is not fused with the first word of the next.
     *
     * @param textFile the file to read
     * @return the file's text
     * @throws IOException if the file cannot be opened or read
     */
    private static String readContent(File textFile) throws IOException {
        FileInputStream in = new FileInputStream(textFile);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, CONTENT_CHARSET));
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
            return content.toString();
        } finally {
            // Closing the underlying stream releases the file handle on every path.
            in.close();
        }
    }
}