package lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class TestIK {
    // Location of the index files
    File dataFile = new File("D://lucene");
    // To use the Paoding analyzer instead:
    // Analyzer analyzer = new PaodingAnalyzer();
    // Use the IK analyzer; the IK analyzer JAR must be on the classpath (the latest version at the time of writing is 3.2.8)
    Analyzer analyzer = new IKAnalyzer();
    /**
     * @param args
     */
    public static void main(String[] args) {
        new TestIK().buildIndex(1, "apple");
        new TestIK().buildIndex(2, "apples");
        new TestIK().buildIndex(3, "apple pie");
        new TestIK().search("apple");
    }
    /**
     * Build (add to) the index.
     */
    public void buildIndex(Integer id, String text) {
        Directory directory = null;
        IndexWriter writer = null;
        try {
            directory = FSDirectory.open(dataFile);
            IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            writer = new IndexWriter(directory, writerConfig); // create the index writer
            // add the record
            addIndexDocument(writer, id, text);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.forceMerge(1); // merges segments; expensive, so use it sparingly when adding data in bulk
                    writer.close();
                }
                if (directory != null) {
                    directory.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    /**
     * Add a document to the index.
     */
    public void addIndexDocument(IndexWriter writer, Integer id, String content) { // writer, index key, index value
        try {
            Document doc = new Document(); // a Document is one record
            /**
             * In Lucene 4.6, StringField is indexed as a single token (not analyzed);
             * TextField is analyzed by the configured analyzer.
             */
            doc.add(new StringField("id", String.valueOf(id), Store.YES));
            doc.add(new TextField("content", content, Store.YES));
            writer.addDocument(doc);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * Update the index.
     */
    public void update(Integer id, String content) {
        try {
            Directory directory = FSDirectory.open(dataFile);
            IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            IndexWriter writer = new IndexWriter(directory, writerConfig);
            Document doc = new Document();
            doc.add(new StringField("id", String.valueOf(id), Store.YES));
            doc.add(new TextField("content", content, Store.YES));
            Term term = new Term("id", String.valueOf(id));
            /**
             * Looks up documents matching the term and replaces them (if several match, only one
             * remains after the update); if none matches, the document is simply added.
             * A database update can change a single column; Lucene can only replace a whole document.
             */
            writer.updateDocument(term, doc);
            writer.close();
            directory.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Delete from the index.
     */
    public void delete(Integer id) {
        try {
            Directory directory = FSDirectory.open(dataFile);
            IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            IndexWriter writer = new IndexWriter(directory, writerConfig);
            Term term = new Term("id", String.valueOf(id));
            writer.deleteDocuments(term);
            writer.close();
            directory.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Search.
     * @param keyword the query string
     */
    public List<String> search(String keyword) {
        IndexSearcher isearcher = null;
        Directory directory = null;
        List<String> list = new ArrayList<String>();
        try {
            directory = FSDirectory.open(dataFile);
            // Open the index. Instantiating an IndexReader is expensive; it is thread-safe and tied to
            // one index directory, so normally a single instance is shared.
            IndexReader reader = DirectoryReader.open(directory);
            /**
             * Create the (read-only) index searcher.
             */
            isearcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(Version.LUCENE_46, "content", analyzer);
            Query query = parser.parse(keyword);
            /**
             * Run the search and fetch the top 10 hits.
             */
            TopDocs topDocs = isearcher.search(query, 10);
            ScoreDoc[] hits = topDocs.scoreDocs;
            // number of hits
            System.out.println("hits: " + hits.length);
            /**
             * Keyword highlighting.
             */
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            /**
             * Set the fragment length (the default is 100 characters).
             */
            Fragmenter fragmenter = new SimpleFragmenter(100);
            highlighter.setTextFragmenter(fragmenter);
            for (ScoreDoc scoreDoc : hits) {
                Document hitDoc = isearcher.doc(scoreDoc.doc);
                String id = hitDoc.get("id");
                String text = hitDoc.get("content");
                TokenStream tokens = new IKAnalyzer().tokenStream("content", new StringReader(text));
                String content = highlighter.getBestFragment(tokens, hitDoc.get("content"));
                list.add(content);
                System.out.println("id:" + id + "\ncontent:" + content);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (InvalidTokenOffsetsException e) {
            e.printStackTrace();
        } finally {
            try {
                /*
                 * With new IndexSearcher(IndexReader) there is no close() to call on the searcher itself;
                 * opening a new IndexReader and closing it again for every search would be very costly.
                 */
                // isearcher.close();
                if (directory != null) {
                    directory.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }
}
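
As the comments in search() point out, opening a new IndexReader for every query is expensive. Below is a minimal sketch of one way to keep a single searcher and refresh it on demand; it is an illustration only, not part of the original class, and assumes Lucene 4.6's SearcherManager (org.apache.lucene.search):

import java.io.File;
import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

// Illustrative sketch (assumes Lucene 4.6); not part of the original TestIK class.
public class SearcherHolder {
    private final SearcherManager manager;

    public SearcherHolder(File indexDir) throws IOException {
        Directory directory = FSDirectory.open(indexDir);
        // The manager owns the underlying reader and reopens it only when necessary.
        manager = new SearcherManager(directory, new SearcherFactory());
    }

    public IndexSearcher acquire() throws IOException {
        manager.maybeRefresh();   // reopen only if the index has changed
        return manager.acquire(); // borrow a searcher for one query
    }

    public void release(IndexSearcher searcher) throws IOException {
        manager.release(searcher); // must be called after every acquire()
    }
}

Each search would then acquire() a searcher, run the query exactly as in search() above, and release() it in a finally block instead of closing the directory each time.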
If the error "Token 3 exceeds length of provided text sized" appears, the fix may be to create a fresh analyzer at that point in the loop:
TokenStream tokens = new IKAnalyzer().tokenStream("content", new StringReader(text));
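
Another option, shown here only as a sketch (it assumes Lucene 4.6's highlight module and is not what the code above does), is to let TokenSources rebuild the token stream for each hit instead of re-analyzing the stored text by hand; reader, scoreDoc, and analyzer are the variables already defined in search():

// sketch: obtain a TokenStream for highlighting from the index itself
TokenStream tokens = org.apache.lucene.search.highlight.TokenSources.getAnyTokenStream(reader, scoreDoc.doc, "content", analyzer);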