首先是建立索引的类文件:
package com.jereh.lucene;
import java.io.*;
import java.util.Date;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds a Lucene index (Lucene 2.x API — {@code FSDirectory.getDirectory},
 * {@code Field.Index.TOKENIZED} etc. were removed in 3.0) over every file in a
 * source directory, using the Paoding Chinese analyzer.
 *
 * @author Administrator
 */
public class Indexer {
	/**
	 * Indexes every file found directly under {@code dateDir} into a Lucene
	 * index created at {@code indexDir}. Files are assumed to be UTF-8 text.
	 *
	 * @param dateDir  directory whose files are read and indexed
	 * @param indexDir directory where the Lucene index is written
	 * @throws IOException if {@code dateDir} is not a readable directory, a
	 *                     file cannot be read, or the index cannot be written
	 */
	public static void index(String dateDir, String indexDir)
			throws IOException {
		Directory dir = FSDirectory.getDirectory(new File(indexDir));
		IndexWriter indexWriter = new IndexWriter(dir, new PaodingAnalyzer());
		try {
			File[] files = new File(dateDir).listFiles();
			if (files == null) {
				// listFiles() returns null (not an empty array) when dateDir
				// does not exist or is not a directory.
				throw new IOException("Not a readable directory: " + dateDir);
			}
			for (int i = 0; i < files.length; i++) {
				Document doc = new Document();
				// Body text must be TOKENIZED so individual terms are
				// searchable; UN_TOKENIZED would index the whole file content
				// as one opaque term and no content query could ever match.
				doc.add(new Field("contents", readContents(files[i], "UTF-8"),
						Field.Store.YES, Field.Index.TOKENIZED));
				doc.add(new Field("filename", files[i].getName(), Field.Store.YES,
						Field.Index.TOKENIZED));
				// The day-resolution date string is a single keyword; do not
				// run it through the analyzer.
				doc.add(new Field("indexDate", DateTools.dateToString(new Date(),
						DateTools.Resolution.DAY), Field.Store.YES,
						Field.Index.UN_TOKENIZED));
				indexWriter.addDocument(doc);
			}
			// maxDoc() counts every document added; numRamDocs() only counts
			// documents still buffered in RAM and reports 0 after a flush.
			System.out.println("numDocs:" + indexWriter.maxDoc());
			indexWriter.optimize();
		} finally {
			// Always release the index write lock, even if indexing fails.
			indexWriter.close();
		}
	}

	/**
	 * Reads a whole file into a String using the given charset. A newline is
	 * kept between lines so that tokens on adjacent lines do not merge into
	 * one word when the text is analyzed.
	 *
	 * @param file    file to read
	 * @param charset character set name, e.g. "UTF-8"
	 * @return the file content with '\n' between lines
	 * @throws IOException if the file cannot be opened or read
	 */
	public static String readContents(File file, String charset)
			throws IOException {
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				new FileInputStream(file), charset));
		try {
			// StringBuilder avoids the O(n^2) cost of repeated String +=.
			StringBuilder sb = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line).append('\n');
			}
			return sb.toString();
		} finally {
			reader.close();
		}
	}
}
其次是进行搜索的类:
package com.jereh.lucene;
import java.io.File;
import java.io.IOException;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Searches a file-system Lucene index (Lucene 2.x API — {@code Hits} and
 * {@code FSDirectory.getDirectory} were removed in 3.0) over the "filename"
 * field using the Paoding Chinese analyzer.
 *
 * @author Administrator
 */
public class Searcher {
	/**
	 * Backward-compatible entry point: searches the index with the original
	 * hard-coded demo phrase.
	 *
	 * @param indexDir directory containing the Lucene index
	 * @throws IOException    if the index cannot be opened or read
	 * @throws ParseException if the query text cannot be parsed
	 */
	public static void search(String indexDir) throws IOException,
			ParseException {
		search(indexDir, "滑移装载机");
	}

	/**
	 * Searches the "filename" field of the index in {@code indexDir} for
	 * {@code queryText} and prints each matching file name.
	 *
	 * @param indexDir  directory containing the Lucene index
	 * @param queryText query string, analyzed with PaodingAnalyzer
	 * @throws IOException    if the index cannot be opened or read
	 * @throws ParseException if {@code queryText} cannot be parsed
	 */
	public static void search(String indexDir, String queryText)
			throws IOException, ParseException {
		Directory dir = FSDirectory.getDirectory(new File(indexDir));
		// IndexSearcher only needs the index directory, unlike IndexWriter.
		IndexSearcher indexSearch = new IndexSearcher(dir);
		try {
			// Parser arguments: default field to search, analyzer to use.
			QueryParser queryParser = new QueryParser("filename",
					new PaodingAnalyzer());
			Query query = queryParser.parse(queryText);
			// 2.x-style search; in 3.0+ this becomes search(query, n)/TopDocs.
			Hits hits = indexSearch.search(query);
			System.out.println("找到了" + hits.length() + "个");
			// hits.doc(i) rehydrates the stored Document for each match.
			for (int i = 0; i < hits.length(); i++) {
				Document doc = hits.doc(i);
				System.out.print(doc.get("filename"));
			}
		} finally {
			// Close the searcher even if parsing or searching throws.
			indexSearch.close();
		}
	}
}
最后是运行的类:
package com.jereh.lucene;
import java.io.File;
import java.io.IOException;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.examples.gettingstarted.BoldFormatter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
/**
 * Demo driver: for every file under a fixed directory, builds a one-document
 * in-memory index with the Paoding analyzer, searches it for a fixed phrase,
 * and prints the matching file names.
 */
public class Test {
	public static void main(String[] args) throws IOException, ParseException {
		String dateDir = "E:/code/jrcms_liugong/website/about/";
		// Paoding wrapped as a standard Lucene Analyzer.
		Analyzer analyzer = new PaodingAnalyzer();
		File[] files = new File(dateDir).listFiles();
		if (files == null) {
			// listFiles() returns null when dateDir is missing or not a
			// directory; the original code would NPE here.
			System.err.println("Not a readable directory: " + dateDir);
			return;
		}
		for (File f : files) {
			String content = Indexer.readContents(f, "UTF-8");
			Directory ramDir = new RAMDirectory();
			// --- index phase ---
			IndexWriter writer = new IndexWriter(ramDir, analyzer);
			try {
				Document doc = new Document();
				doc.add(new Field("filename", f.getName(), Field.Store.YES,
						Field.Index.UN_TOKENIZED));
				// Term vectors with positions/offsets are what the highlighter
				// fragment API needs if best-fragment extraction is enabled.
				doc.add(new Field("contents", content, Field.Store.YES,
						Field.Index.TOKENIZED,
						Field.TermVector.WITH_POSITIONS_OFFSETS));
				writer.addDocument(doc);
				writer.optimize();
			} finally {
				// Release the write lock even if adding the document fails.
				writer.close();
			}
			// --- search phase ---
			IndexReader reader = IndexReader.open(ramDir);
			// org.apache.lucene.search.Searcher is explicitly imported, so it
			// shadows the same-package com.jereh.lucene.Searcher class here.
			Searcher searcher = new IndexSearcher(ramDir);
			try {
				String queryString = "国家级企业技术中心";
				QueryParser parser = new QueryParser("contents", analyzer);
				Query query = parser.parse(queryString);
				// Rewriting expands multi-term queries so the scorer used by
				// the highlighter sees concrete terms.
				query = query.rewrite(reader);
				Hits hits = searcher.search(query);
				BoldFormatter formatter = new BoldFormatter();
				Highlighter highlighter = new Highlighter(formatter,
						new QueryScorer(query));
				highlighter.setTextFragmenter(new SimpleFragmenter(50));
				for (int i = 0; i < hits.length(); i++) {
					System.out.println(hits.doc(i).get("filename"));
				}
			} finally {
				// The original leaked the searcher; close both handles.
				searcher.close();
				reader.close();
			}
		}
	}
}