引言
最近项目需要使用到中文分词,所以想到了使用lucene+ik分词来实现
步骤
导入依赖
在 pom.xml 中引入以下依赖
<!-- lucene核心库 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>7.6.0</version>
</dependency>
<!-- Lucene的查询解析器 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>7.6.0</version>
</dependency>
<!-- lucene的默认分词器库 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>7.6.0</version>
</dependency>
<!-- lucene的高亮显示 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>7.6.0</version>
</dependency>
<!-- ik分词器 -->
<dependency>
<groupId>com.github.magese</groupId>
<artifactId>ik-analyzer</artifactId>
<version>7.6.0</version>
</dependency>
建立索引
在配置文件中配置索引的存放路径
lucence.CUSTOMER_ALL=d:\\indexDir\\customer
lucence.SALE_PRODUCT_ALL=d:\\indexDir\\saleproduct
因为需要生成多个索引,而且每个环境所生成的路径都可能不同,所以采用配置文件动态配置
@Data
public class LucenceType {

    // Filesystem path of the customer index directory.
    public static String CUSTOMER_ALL;
    // Filesystem path of the sale-product index directory.
    public static String SALE_PRODUCT_ALL;

    // Static initializer: runs once when the class is first referenced,
    // i.e. at application startup, so the paths are ready before any use.
    static {
        load();
    }

    /**
     * Reads the Lucene index directory paths from
     * {@code wxArticleConfiguration.properties} into the static fields.
     *
     * @throws RuntimeException if the properties file cannot be loaded
     */
    public static void load() {
        try {
            final Configuration properties =
                    new PropertiesConfiguration("wxArticleConfiguration.properties");
            CUSTOMER_ALL = properties.getString("lucence.CUSTOMER_ALL");
            SALE_PRODUCT_ALL = properties.getString("lucence.SALE_PRODUCT_ALL");
        } catch (ConfigurationException e) {
            throw new RuntimeException("获取配置文件失败,", e);
        }
    }
}
编写生成索引的主方法
@ResponseBody
@RequestMapping("/createCustomerIndex")
public String createCustomerIndex() {
    try {
        List<TSaleCustomer> customers = tSaleCustomerDao.tSaleCustomerList(new HashMap<>());
        // One Lucene Document per customer; presized to avoid resizing.
        Collection<Document> docs = new ArrayList<>(customers.size());
        for (TSaleCustomer customer : customers) {
            Document document = new Document();
            // StringField is indexed but NOT tokenized; TextField is indexed AND tokenized.
            document.add(new StringField("customerId", customer.getId() + "", Field.Store.YES));
            // Index the name twice: after tokenization an exact whole-word query may
            // no longer match, so keep a tokenized field for partial matches and an
            // untokenized field for exact matches.
            document.add(new TextField("namelike", customer.getName() + "", Field.Store.YES)); // tokenized match
            document.add(new StringField("name", customer.getName() + "", Field.Store.YES));   // exact match
            docs.add(document);
        }
        LucenceUtils.createIndex(docs, LucenceType.CUSTOMER_ALL);
        log.info("----创建索引完成----");
        return "success";
    } catch (Exception e) {
        // Log through SLF4J only; the original also duplicated this to System.out.
        log.error("----创建索引异常----", e);
        return "erro";
    }
}
执行完成之后,在配置的目录下会生成如下几个文件,这表示索引创建成功
查询数据
@RequestMapping("/tSaleCustomerList2")
public String[] luceneTest(@RequestBody Map<String,Object> map) throws Exception {
int pageNow= org.apache.commons.collections.MapUtils.getIntValue(map,"page");
int pageSize= org.apache.commons.collections.MapUtils.getIntValue(map,"limit");
String keyword= org.apache.commons.collections.MapUtils.getString(map,"keyword");
Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(LucenceType.CUSTOMER_ALL));
// 索引读取工具
IndexReader reader = DirectoryReader.open(directory);
BooleanQuery.Builder builder = new BooleanQuery.Builder();
// 索引搜索工具
IndexSearcher searcher = new IndexSearcher(reader);
// 指定多默认域数组
String[] arr=new String[]{"name","namelike"};
//搜索时设置权重
//如果不设置权重有可能全字匹配会在分词匹配后面
Map<String,Float> boosts = new HashMap<String,Float>();
boosts.put("name", 50.0f);//权重默认是1
boosts.put("namelike", 10.0f);
MultiFieldQueryParser queryParser = new MultiFieldQueryParser(arr, new IKAnalyzer(),boosts);//指定搜索权重
Query parse = queryParser.parse(keyword);//查询所有默认域里有关键字的文档
builder.add(parse, BooleanClause.Occur.MUST);
BooleanQuery booleanQuery = builder.build();
//分页查询
TopDocs topDocs = searcher.search(booleanQuery, pageSize*pageNow);
// 获取总条数
System.out.println("本次搜索共找到" + topDocs.totalHits + "条数据");
// 获取得分文档对象(ScoreDoc)数组.SocreDoc中包含:文档的编号、文档的得分
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
int start = (pageNow -1)*pageSize ;
int end = pageSize*pageNow;
String[] customerIds=new String[topDocs.scoreDocs.length];
for(int i=start;i<end&&i<scoreDocs.length;i++){
if(scoreDocs[i]!=null){
Document doc2 = reader.document(scoreDocs[i].doc);
customerIds[i]=doc2.get("customerId");
}
}
//可以根据这些id去查询详情后返回
return customerIds
}
更新数据
/**
 * Re-indexes a single customer in the Lucene customer index.
 *
 * @param customerId id of the customer whose document should be replaced
 * @return "success" on success, "erro" on failure (kept identical to the
 *         other endpoints so existing callers keep working)
 */
public String updateCustomerIndex(String customerId) {
    try {
        TSaleCustomer customer = tSaleCustomerDao.findByCustomerId(customerId);
        Directory directory = FSDirectory.open(FileSystems.getDefault().getPath(LucenceType.CUSTOMER_ALL));
        // BUGFIX: the original used StandardAnalyzer here while index creation and
        // search use IKAnalyzer, so updated documents were tokenized differently
        // and Chinese keyword queries could miss them.
        // try-with-resources also fixes a leak: on any exception the original left
        // the IndexWriter open, keeping the index write lock held.
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new IKAnalyzer()))) {
            Document document = new Document();
            // StringField is indexed but NOT tokenized; TextField is indexed AND tokenized.
            document.add(new StringField("customerId", customer.getId() + "", Field.Store.YES));
            document.add(new TextField("namelike", customer.getName() + "", Field.Store.YES)); // tokenized match
            document.add(new StringField("name", customer.getName() + "", Field.Store.YES));   // exact match
            // Atomically delete the document whose customerId term matches and add the new one.
            writer.updateDocument(new Term("customerId", customer.getId() + ""), document);
            writer.commit();
        }
        log.info("----索引更新完成----" + customerId);
        return "success";
    } catch (Exception e) {
        log.error("----更新索引异常----" + customerId, e);
        return "erro";
    }
}