建立索引:
package paoding;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
public class IndexFiles {

    /**
     * Builds a Lucene index over the files under a hard-coded directory using
     * the Paoding Chinese analyzer, then reports the elapsed time in ms.
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        IndexWriter writer = null;
        try {
            // Paoding Chinese word-segmentation analyzer.
            Analyzer analyzer = new PaodingAnalyzer();
            // Analyzer analyzer = new StandardAnalyzer();
            // true = create a fresh index, overwriting any existing one.
            writer = new IndexWriter("f:\\indexpaoding", analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            indexDocs(writer, new File("F:\\徐剛:28tel(繁firfox)"));
            writer.optimize();
            System.out.println("用时:" + (System.currentTimeMillis() - start)
                    + " 毫秒");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close the writer so the index lock is released even if
            // indexing failed part-way through.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Recursively walks {@code file} and adds one indexed Document for every
     * readable file whose name ends with .htm/.html/.jsp/.php/.txt.
     *
     * @param writer open IndexWriter to add documents to
     * @param file   file or directory to index
     * @throws IOException on index write or path resolution errors
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] children = file.list();
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    indexDocs(writer, new File(file, children[i]));
                }
            }
            return;
        }
        String name = file.getName();
        if (name.endsWith(".htm") || name.endsWith(".html")
                || name.endsWith(".jsp") || name.endsWith(".php")
                || name.endsWith(".txt")) {
            System.out.println("添加 " + file);
            try {
                // One Document per file — the analogue of one "record".
                Document doc = new Document();
                // Both fields are stored, analyzed (tokenized), and carry
                // positions/offsets so search results can be highlighted.
                doc.add(new Field("filename", file.getCanonicalPath(),
                        Field.Store.YES, Field.Index.ANALYZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
                doc.add(new Field("contents", ReadFile(file),
                        Field.Store.YES, Field.Index.ANALYZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.addDocument(doc);
            } catch (FileNotFoundException fnfe) {
                // File disappeared between listing and reading; skip it but
                // say so instead of swallowing the error silently.
                System.err.println("skipped missing file: " + file);
            }
        }
    }

    /**
     * Reads the whole file as UTF-8 and returns its lines concatenated.
     * Line separators are dropped — this matches the original indexing
     * behavior. Returns what was read so far (possibly "") on I/O error.
     */
    public static String ReadFile(File f) {
        StringBuilder content = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(f), "utf-8"));
            String line;
            while ((line = br.readLine()) != null) {
                content.append(line);
            }
        } catch (IOException e) { // FileNotFoundException is an IOException
            e.printStackTrace();
        } finally {
            // Fix: the original leaked this reader (never closed).
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return content.toString();
    }
}
用来搜索:带简单分页效果
package paoding;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenGroup;
import org.apache.lucene.search.highlight.TokenSources;
public class SearchFiles {

    /** Number of result pages collected (cached) per search pass. */
    int CACHE_PAGE = 3;

    /**
     * Searches the "filename" and "contents" fields for {@code key} and
     * prints one page of hits, highlighting matched terms with &lt;b&gt; tags.
     *
     * @param key     the query string
     * @param perPage how many hits per page
     * @param begin   1-based page number to display
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on index access errors
     * @throws ParseException        if the query string cannot be parsed
     */
    public void search(String key, int perPage, int begin)
            throws CorruptIndexException, IOException, ParseException {
        String indexPath = "f:\\indexpaoding"; // index directory (typo IDNEX fixed)
        int totalPages = 0;
        // Paoding Chinese analyzer — must match the analyzer used at index time.
        Analyzer analyzer = new PaodingAnalyzer();
        // Analyzer analyzer = new StandardAnalyzer();
        IndexReader reader = IndexReader.open(indexPath);
        try {
            Searcher searcher = new IndexSearcher(reader);
            // Search both fields; a match in either is enough (boolean OR).
            BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,
                    BooleanClause.Occur.SHOULD };
            Query query = MultiFieldQueryParser.parse(key, new String[] {
                    "filename", "contents" }, clauses, analyzer);
            // QueryParser parser = new QueryParser("contents", analyzer);
            // Query query = parser.parse(key);
            // Collect at most perPage * CACHE_PAGE hits up front.
            TopDocCollector collector = new TopDocCollector(perPage * CACHE_PAGE);
            searcher.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            int numTotalHits = collector.getTotalHits();
            System.out.println("符合查询词的文件数:" + numTotalHits);
            // Total number of pages, rounding up.
            if (numTotalHits % perPage != 0) {
                totalPages = numTotalHits / perPage + 1;
            } else {
                totalPages = numTotalHits / perPage;
            }
            if (begin > totalPages) {
                System.err.println("超出范围");
            } else if (begin > CACHE_PAGE) {
                // Requested page lies beyond the cached window: widen the cache
                // to cover every page and search again recursively.
                // NOTE: this permanently grows CACHE_PAGE for later calls on
                // this instance.
                CACHE_PAGE = totalPages;
                search(key, perPage, begin);
            } else {
                // Hit indices [from, to) for the requested page.
                int from = (begin - 1) * perPage;
                int to = Math.min(from + perPage, numTotalHits);
                for (int i = from; i < to; i++) {
                    System.out.println(i);
                    int docId = hits[i].doc;
                    Document doc = searcher.doc(docId);
                    String filename = doc.get("filename");
                    System.out.println("filename=" + filename);
                    // Highlight matched terms in the stored contents, using the
                    // term vector (positions/offsets) saved at index time.
                    String text = doc.get("contents");
                    TermPositionVector tpv = (TermPositionVector) reader
                            .getTermFreqVector(docId, "contents");
                    TokenStream ts = TokenSources.getTokenStream(tpv);
                    Formatter formatter = new Formatter() {
                        public String highlightTerm(String srcText, TokenGroup g) {
                            if (g.getTotalScore() <= 0) {
                                return srcText;
                            }
                            return "<b>" + srcText + "</b>";
                        }
                    };
                    Highlighter highlighter = new Highlighter(formatter,
                            new QueryScorer(query));
                    String result = highlighter.getBestFragments(ts, text, 5,
                            "…");
                    System.out.println("result:\n\t" + result);
                }
                System.out.println("循环结束");
            }
        } finally {
            // Fix: the original skipped this close if anything above threw.
            reader.close();
            System.out.println("关闭reader");
        }
    }

    public static void main(String[] args) throws Exception {
        SearchFiles sf = new SearchFiles();
        sf.search("vvczvxcxz", 5, 1);
    }
}