Lucene
1.什么是lucene
lucene是一款高性能的、可扩展,纯java语言编写的信息检索(IR)工具库。
它适合几乎任何需要全文本搜索(特别是跨平台)的应用程序。
下载地址 : http://lucene.apache.org/java
官网:http://lucene.apache.org/
2. IKAnalyzer中文分词包
IKAnalyzer是一个开源的,基于java语言开发的轻量级的中文分词工具包。
特点(来自百度):
采用了特有的“正向迭代最细粒度切分算法”,具有60万字/秒的高速处理能力。
采用了多子处理器分析模式,支持:英文字母(IP地址、Email、URL)、数字(日期,常用中文数量词,罗马数字,科学计数法),中文词汇(姓名、地名处理)等分词处理。
对中英文混合文本的支持不是很好,这方面的处理比较麻烦,需再做一次查询;同时支持对个人词条优化的词典存储,内存占用更小。
支持用户词典扩展定义。
针对Lucene全文检索优化的查询分析器IKQueryParser;采用歧义分析算法优化查询关键字的搜索排列组合,能极大的提高Lucene检索的命中率。
Maven依赖:
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
3.主要代码
/**
 * 创建索引库: appends a single document with one stored/indexed "food"
 * field to the index at {@code url}.
 *
 * <p>Uses try-with-resources so the IndexWriter and Directory are closed
 * even if indexing throws (the original leaked both on exception).
 *
 * @param foodname the text to index under the "food" field
 * @throws IOException if the index directory cannot be opened or written
 */
public static void write(String foodname) throws IOException {
    try (Directory directory = FSDirectory.open(new File(url));
         IndexWriter iwriter = new IndexWriter(directory,
                 new IndexWriterConfig(Version.LUCENE_47, analyzer))) {
        Document doc = new Document();
        // TYPE_STORED: tokenized, indexed, and the raw value is retrievable.
        doc.add(new Field("food", foodname, TextField.TYPE_STORED));
        iwriter.addDocument(doc);
        iwriter.commit();
    }
}
/**
 * 搜索索引库: parses {@code name} against the "food" field and prints the
 * stored "food" value of up to 10 matching documents.
 *
 * <p>Reader and directory are closed via try-with-resources, so they are
 * released even when parse/search throws (the original leaked them then).
 *
 * @param name query text, parsed with the shared analyzer
 * @throws IOException    if the index cannot be read
 * @throws ParseException if {@code name} is not a valid query
 */
public static void search(String name) throws IOException, ParseException {
    try (Directory directory = FSDirectory.open(new File(url));
         DirectoryReader ireader = DirectoryReader.open(directory)) {
        IndexSearcher isearcher = new IndexSearcher(ireader);
        QueryParser parser = new QueryParser(Version.LUCENE_47, "food", analyzer);
        Query query = parser.parse(name);
        // Top 10 hits only; null filter.
        ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;
        for (ScoreDoc hit : hits) {
            Document hitDoc = isearcher.doc(hit.doc);
            System.out.println(hitDoc.getField("food").stringValue());
        }
    }
}
/**
 * 查看分词结果: runs {@code str} through the given analyzer and returns the
 * produced terms in order.
 *
 * <p>Follows the full TokenStream consume contract: reset -&gt;
 * incrementToken* -&gt; end -&gt; close (the original omitted {@code end()}).
 * On IOException the partial result collected so far is returned, matching
 * the original best-effort behavior.
 *
 * @param str      text to tokenize
 * @param analyzer analyzer to tokenize with
 * @return list of token texts, possibly empty; never null
 */
public static List<String> getIndex(String str, Analyzer analyzer) {
    List<String> result = new ArrayList<String>();
    try (TokenStream stream = analyzer.tokenStream("content", new StringReader(str))) {
        CharTermAttribute attr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(attr.toString());
        }
        stream.end(); // required by the TokenStream workflow before close()
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}
Lucene进阶
批量建索引:
/**
 * 批量建索引: pages through all food rows ({@code rows} per fetch) and
 * indexes one Document per row via {@code write(Document)}.
 *
 * <p>Uses primitive {@code int} counters (the original boxed every
 * comparison/increment through {@code Integer}).
 *
 * @return status message on success
 * @throws IOException if index writing fails
 */
public String create() throws IOException {
    final int rows = 1000;           // page size per DB fetch
    final int total = fd.getCount(); // total number of food rows
    for (int start = 0; start <= total; start += rows) {
        List<Map<String, Object>> list = fd.getFood(start, rows);
        for (Map<String, Object> row : list) {
            Document doc = new Document();
            // All fields stored so they can be displayed from search hits.
            doc.add(new Field("foodid", row.get("foodid").toString(), TextField.TYPE_STORED));
            doc.add(new Field("foodname", row.get("foodname").toString(), TextField.TYPE_STORED));
            doc.add(new Field("price", row.get("price").toString(), TextField.TYPE_STORED));
            doc.add(new Field("imagepath", row.get("imagepath").toString(), TextField.TYPE_STORED));
            write(doc);
        }
    }
    return "创建索引成功";
}
高亮
/**
 * 高亮搜索: queries the "foodname" field, wraps matched terms in
 * {@code <font color='green'>...</font>}, and returns up to 10 hits as
 * maps with keys foodid / foodname (highlighted) / price / imagepath.
 *
 * <p>Fixes the original's redundant second {@code isearcher.doc(...)}
 * fetch per hit and closes reader/directory via try-with-resources so
 * they are released even if parsing or highlighting throws.
 *
 * <p>Note: only the LAST scoring fragment is kept (same as the original);
 * a non-matching stored value yields an empty "foodname" string.
 *
 * @param name query text for the "foodname" field
 * @return list of hit maps; empty if nothing matches
 * @throws IOException                  on index read failure
 * @throws ParseException               if {@code name} is not a valid query
 * @throws InvalidTokenOffsetsException if highlighting hits bad offsets
 */
public List<Map> search(String name) throws IOException, ParseException, InvalidTokenOffsetsException {
    try (Directory directory = FSDirectory.open(new File(url));
         DirectoryReader ireader = DirectoryReader.open(directory)) {
        IndexSearcher isearcher = new IndexSearcher(ireader);
        QueryParser parser = new QueryParser(Version.LUCENE_47, "foodname", analyzer);
        Query query = parser.parse(name);
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<font color='green'>", "</font>");
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;
        List<Map> list = new ArrayList<Map>();
        for (ScoreDoc hit : hits) {
            int id = hit.doc;
            Document hitDoc = isearcher.doc(id); // single fetch per hit
            String foodname = hitDoc.get("foodname");
            TokenStream tokenStream = TokenSources.getAnyTokenStream(
                    isearcher.getIndexReader(), id, "foodname", analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, foodname, false, 10);
            String highlighted = "";
            for (TextFragment f : frag) {
                if (f != null && f.getScore() > 0) {
                    highlighted = f.toString(); // keep last scoring fragment
                }
            }
            Map map = new HashMap();
            map.put("foodid", hitDoc.get("foodid"));
            map.put("foodname", highlighted);
            map.put("price", hitDoc.get("price"));
            map.put("imagepath", hitDoc.get("imagepath"));
            list.add(map);
        }
        return list;
    }
}