搜索引擎SOLR
1.Lucene的介绍
lucene是一款高性能的、可扩展,纯java语言编写的信息检索(IR)工具库。
它适合几乎任何需要全文本搜索(特别是跨平台)的应用程序。
官网:http://lucene.apache.org/java
下载地址:http://mirror.bit.edu.cn/apache/lucene/java/
所有版本:http://archive.apache.org/dist/lucene/java/
2.Lucene jar包及说明
lucene-core-3.1.0.jar 核心包
lucene-highlighter-3.1.0.jar 高亮包
lucene-analyzers-3.1.0.jar Lucene自带的分词解析包
lucene-queries-3.1.0.jar 搜索条件包
IKAnalyzer3.2.3Stable.jar IK中文分词包
3.Lucene原理
Lucene 基于关键词进行索引和查询。
全文分析:把文本解析为一个个关键字存储到索引文件中。
倒排索引: (英语:Inverted index),也常被称为反向索引、置入档案或反向档案,是一种索引方法,被用来存储在全文搜索下某个单词在一个文档或者一组文档中的存储位置的映射。它是文档检索系统中最常用的数据结构。
设有两篇文章1和2
文章1的内容为:
Tom lives in Guangzhou,I live in Guangzhou too.
文章2的内容为:
He once lived in Shanghai.
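倒排索引的原理图(此处缺少原图,以下用文字示意;假设分词后统一转为小写、还原词形,并忽略 in、too、once 等停用词):
关键词 → 出现该关键词的文章编号
guangzhou → 1
he → 2
i → 1
live → 1, 2
shanghai → 2
tom → 1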
加入依赖(Maven)。注意:下面的示例代码使用的是 Lucene 4.7 的 API(Version.LUCENE_47),与上文列出的 3.1.0 版 jar 包不是同一版本。
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
简单的分词、建立索引与搜索示例
package cn.et;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKTokenizer;
/**
 * Minimal Lucene demo: builds a two-document index with the IK analyzer and
 * runs a single hard-coded query against it.
 *
 * <p>{@link #write()} must have been run at least once before {@link #search()},
 * otherwise there is no index to open in {@link #dir}.
 */
public class IndexDemo {
    /** Filesystem directory that holds the index. */
    static String dir="D:\\index";
    /** Analyzer used both when indexing and when parsing the query. */
    static Analyzer analyzer = new IKAnalyzer();

    /**
     * Entry point: runs the search. Uncomment {@code write()} to (re)build the index first.
     *
     * @param args unused
     * @throws IOException if the index cannot be read
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // write();
        search();
        // Example of tokenizing a string directly with IK:
        // String str="张三来自湖南是一位程序员";
        // Tokenizer token=new IKTokenizer(new StringReader(str),true);
    }

    /**
     * Searches the {@code userDesc} field for the term "来" and prints the stored
     * {@code userName} of each of the top 10 hits to standard output.
     *
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query string cannot be parsed
     */
    public static void search() throws IOException, ParseException {
        Directory directory = FSDirectory.open(new File(dir));
        try {
            DirectoryReader ireader = DirectoryReader.open(directory);
            try {
                IndexSearcher isearcher = new IndexSearcher(ireader);
                // The parser needs the default field to search and the analyzer to tokenize the query.
                QueryParser parser = new QueryParser(Version.LUCENE_47, "userDesc", analyzer);
                Query query = parser.parse("来");
                // No filter; return at most 10 documents.
                ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;
                for (ScoreDoc hit : hits) {
                    Document hitDoc = isearcher.doc(hit.doc);
                    // Document.get returns null for a missing field instead of throwing.
                    System.out.println(hitDoc.get("userName"));
                }
            } finally {
                // Close the reader even when parsing or searching fails.
                ireader.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Creates the index: adds two sample user documents and commits them.
     *
     * @throws IOException if the index cannot be opened or written
     */
    public static void write() throws IOException{
        Directory directory = FSDirectory.open(new File(dir));
        try {
            // Binds the Lucene compatibility version to the analyzer used for indexing.
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
            IndexWriter iwriter = new IndexWriter(directory, config);
            try {
                iwriter.addDocument(newUserDocument("张三", "张三来自湖南是一位程序员"));
                iwriter.addDocument(newUserDocument("李四", "李四来自湖南是一位白领"));
                iwriter.commit();
            } finally {
                // Close the writer even when adding a document fails.
                iwriter.close();
            }
        } finally {
            directory.close();
        }
    }

    /** Builds a document with stored, analyzed {@code userName} and {@code userDesc} fields. */
    private static Document newUserDocument(String userName, String userDesc) {
        Document doc = new Document();
        doc.add(new Field("userName", userName, TextField.TYPE_STORED));
        doc.add(new Field("userDesc", userDesc, TextField.TYPE_STORED));
        return doc;
    }
}
分词和高亮查询
从数据库获取
package cn.et.food.dao.impl;
import java.util.List;
import java.util.Map;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Repository;
import cn.et.food.dao.FoodDao;
/**
 * JDBC-backed implementation of {@link FoodDao} that reads from the {@code food} table.
 */
@Repository
public class FoodDaoImpl implements FoodDao {
    @Autowired
    private JdbcTemplate jdbc;

    /**
     * Counts the rows in the {@code food} table.
     *
     * @return the number of rows; 0 if the query unexpectedly yields no value
     * @see cn.et.food.dao.FoodDao#queryFoodCount()
     */
    @Override
    public int queryFoodCount(){
        String sql="select count(*) as foodCount from food";
        // Read the single scalar directly instead of building a list of maps and re-parsing a string.
        Integer count = jdbc.queryForObject(sql, Integer.class);
        return count == null ? 0 : count.intValue();
    }

    /**
     * Fetches one page of rows from the {@code food} table.
     *
     * @param start zero-based offset of the first row to return
     * @param rows maximum number of rows to return
     * @return one map per row, keyed by column name
     * @see cn.et.food.dao.FoodDao#queryFood(int, int)
     */
    @Override
    public List<Map<String, Object>> queryFood(int start,int rows){
        // Bound parameters instead of string concatenation keep the statement text constant.
        String sql="select * from food limit ?,?";
        return jdbc.queryForList(sql, start, rows);
    }
}
高亮查询和分词
package cn.et.food.util;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Helpers for writing food documents to the Lucene index in {@link #dir} and
 * for searching them with highlighted food names.
 */
public class LuceneUtils {
    /** Filesystem directory that holds the index. */
    static String dir ="D:\\index";
    /** Analyzer used for indexing, query parsing and highlighting. */
    static Analyzer analyzer=new IKAnalyzer();

    /** Name of the stored field whose text is highlighted in search results. */
    private static final String FOOD_NAME_FIELD = "foodname";
    /** Maximum number of hits returned by a search. */
    private static final int MAX_HITS = 10;

    /**
     * Searches the index and returns up to 10 hits, each as a map with the keys
     * {@code foodid}, {@code foodname}, {@code price} and {@code imagepath}.
     * The {@code foodname} value has matched terms wrapped in
     * {@code <font color=red>...</font>}; if nothing could be highlighted the
     * plain stored name is returned instead.
     *
     * @param field default field the query is parsed against; {@code foodname} is used when null or empty
     * @param value query string in Lucene query syntax; a null or blank value yields an empty result
     * @return the hits in the order Lucene returned them, never null
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if {@code value} is not a valid query
     * @throws InvalidTokenOffsetsException if highlighting fails on inconsistent token offsets
     */
    public static List<Map> search(String field,String value) throws IOException, ParseException, InvalidTokenOffsetsException {
        List<Map> list = new ArrayList<Map>();
        if (value == null || value.trim().isEmpty()) {
            // Nothing to search for; avoids a parser failure on missing request parameters.
            return list;
        }
        String queryField = (field == null || field.isEmpty()) ? FOOD_NAME_FIELD : field;

        Directory directory = FSDirectory.open(new File(dir));
        try {
            DirectoryReader ireader = DirectoryReader.open(directory);
            try {
                IndexSearcher isearcher = new IndexSearcher(ireader);
                // The parser needs the default field to search and the analyzer to tokenize the query.
                QueryParser parser = new QueryParser(Version.LUCENE_47, queryField, analyzer);
                Query query = parser.parse(value);
                // Formatter supplies the prefix/suffix wrapped around each matched term.
                SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<font color=red>","</font>");
                Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
                // No filter; return at most MAX_HITS documents.
                ScoreDoc[] hits = isearcher.search(query, null, MAX_HITS).scoreDocs;
                for (ScoreDoc hit : hits) {
                    Document hitDoc = isearcher.doc(hit.doc);
                    Map<String, String> map = new HashMap<String, String>();
                    map.put("foodid", hitDoc.get("foodid"));
                    map.put("foodname", highlightFoodName(highlighter, isearcher, hit.doc, hitDoc.get(FOOD_NAME_FIELD)));
                    map.put("price", hitDoc.get("price"));
                    map.put("imagepath", hitDoc.get("imagepath"));
                    list.add(map);
                }
            } finally {
                // Close the reader even when parsing, searching or highlighting fails.
                ireader.close();
            }
        } finally {
            directory.close();
        }
        return list;
    }

    /**
     * Returns {@code foodname} with the query's matched terms highlighted, or the
     * unchanged text when it is null or no fragment scores above zero.
     */
    private static String highlightFoodName(Highlighter highlighter, IndexSearcher isearcher, int docId, String foodname)
            throws IOException, InvalidTokenOffsetsException {
        if (foodname == null) {
            return null;
        }
        TokenStream tokenStream = TokenSources.getAnyTokenStream(isearcher.getIndexReader(), docId, FOOD_NAME_FIELD, analyzer);
        TextFragment[] fragments = highlighter.getBestTextFragments(tokenStream, foodname, false, 10);
        String highlighted = foodname;
        for (TextFragment fragment : fragments) {
            // As before, the last fragment with a positive score wins.
            if (fragment != null && fragment.getScore() > 0) {
                highlighted = fragment.toString();
            }
        }
        return highlighted;
    }

    /**
     * Adds a single document to the index and commits it.
     *
     * <p>A new {@link IndexWriter} is opened and closed on every call.
     *
     * @param doc the document to add
     * @throws IOException if the index cannot be opened or written
     */
    public static void write(Document doc) throws IOException{
        Directory directory = FSDirectory.open(new File(dir));
        try {
            // Binds the Lucene compatibility version to the analyzer used for indexing.
            IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_47,analyzer);
            IndexWriter iwriter=new IndexWriter(directory, config);
            try {
                iwriter.addDocument(doc);
                iwriter.commit();
            } finally {
                // Close the writer even when adding the document fails.
                iwriter.close();
            }
        } finally {
            directory.close();
        }
    }
}
控制层
package cn.et.food.controller;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import cn.et.food.dao.FoodDao;
import cn.et.food.util.LuceneUtils;
/**
 * REST endpoints for searching foods through the Lucene index and for
 * (re)building that index from the database.
 */
@RestController
public class FoodController {
    /** Number of database rows fetched per page while building the index. */
    private static final int PAGE_SIZE = 1000;

    @Autowired
    FoodDao dao;

    /**
     * Searches the index by food name.
     *
     * @param keyWord the query string taken from the request
     * @return the matching foods as returned by {@link LuceneUtils#search(String, String)}
     * @throws IOException if the index cannot be read
     * @throws ParseException if {@code keyWord} is not a valid query
     * @throws InvalidTokenOffsetsException if highlighting fails
     */
    @GetMapping("/searchFood")
    public List<Map> getFood(String keyWord) throws IOException, ParseException, InvalidTokenOffsetsException{
        return LuceneUtils.search("foodname", keyWord);
    }

    /**
     * Reads every row of the food table page by page and writes each one to the index.
     *
     * <p>NOTE(review): documents are only added, never deleted, so calling this
     * endpoint repeatedly presumably duplicates entries in the index — confirm.
     *
     * @return {@code "1"} on success, {@code "0"} if any step threw an exception
     */
    @GetMapping("createIndex")
    public String createIndex(){
        try {
            int queryFoodCount=dao.queryFoodCount();
            // Offsets are zero-based, so pages start at 0, 1000, 2000, ...
            // Advancing by exactly PAGE_SIZE ensures no row between pages is skipped.
            for (int startIndex = 0; startIndex < queryFoodCount; startIndex += PAGE_SIZE) {
                List<Map<String,Object>> queryFood=dao.queryFood(startIndex, PAGE_SIZE);
                for (Map<String,Object> row : queryFood) {
                    LuceneUtils.write(toDocument(row));
                }
            }
        } catch (Exception e) {
            // Boundary catch: the endpoint reports failure through its return value.
            e.printStackTrace();
            return "0";
        }
        return "1";
    }

    /** Converts one database row into a document with stored, analyzed text fields. */
    private static Document toDocument(Map<String,Object> row) {
        Document doc=new Document();
        doc.add(new Field("foodid", text(row, "foodid"), TextField.TYPE_STORED));
        doc.add(new Field("foodname", text(row, "foodname"), TextField.TYPE_STORED));
        doc.add(new Field("price", text(row, "price"), TextField.TYPE_STORED));
        doc.add(new Field("imagepath", text(row, "imagepath"), TextField.TYPE_STORED));
        return doc;
    }

    /** Returns the column value as text, or an empty string when the column is null or missing. */
    private static String text(Map<String,Object> row, String column) {
        Object value = row.get(column);
        return value == null ? "" : value.toString();
    }
}