// ===== Main test code (SearchIndex) =====
package cn.tedu.test2;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
import cn.tedu.test.IKAnalyzer6x;
public class SearchIndex {
//单个词项匹配
@Test
public void termquery() throws Exception {
//1.指定文件夹索引位置
FSDirectory dir = FSDirectory.open(Paths.get(“c://index01”));
//2.根据文件夹位置生成reader流
DirectoryReader reader = DirectoryReader.open(dir);
//3.创建搜索对象
IndexSearcher search = new IndexSearcher(reader);
//4.准备搜索条件
Term term = new Term(“productCat”,“小”);
TermQuery query = new TermQuery(term);
/*
* 搜索封装了document大量标识数据对象(没有源数据)
* document 评分等,根据查询条件不同,自动计算评分
* 词项中,根据字符串匹配长度,长度大,评分越高
* 查询的条数一共多少条
*/
//4.查询前10条
TopDocs topDoc = search.search(query, 10);
System.out.println(“最高分:”+topDoc.getMaxScore());
System.out.println(“一共获取数据:”+topDoc.totalHits);
//5.利用浅查询得到的评分对象拿到doucumentId
ScoreDoc[] scoreDocs = topDoc.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
//每次循环都获取返回结果中一个document评分相关内容
System.out.println(“当前docid:”+scoreDoc.doc);
System.out.println(“当前doc评分:”+scoreDoc.score);
//利用documentId获取源数据, 拿不到Store.NO的数据
Document doc = search.doc(scoreDoc.doc);
System.out.println(“productName”+doc.get(“productName”));
System.out.println(“productImage”+doc.get(“productImage”));
}
}
//根据在url输入的字符,对字符进行分词,匹配luence分好的词
@Test
public void multilFieldQuery() throws Exception{
Directory dir=FSDirectory
.open(Paths.get("c://index01"));
IndexReader reader=DirectoryReader.open(dir);
IndexSearcher search=new IndexSearcher(reader);
//生成多域查询条件
//生成解析器 解析字符串形成多个分词的结果
String[] fields={"productName","productCat"};
Analyzer analyzer=new IKAnalyzer6x();
MultiFieldQueryParser parser=new
MultiFieldQueryParser(fields, analyzer);
//利用解析器生成多域查询条件
Query query=parser.parse("大小功率的灯泡节能效果不一样");
/* productName productCat
* 功率 term(productName ,功率) term(productCat,功率)
...
以上任何一个词项查询的结果集 做并集处理
*/
TopDocs topDoc = search.search(query, 10);
System.out.println("最高分:"+topDoc.getMaxScore());
System.out.println("一共获取数据:"+topDoc.totalHits);
ScoreDoc[] scoreDocs = topDoc.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
//每次循环都获取返回结果中一个document评分相关内容
System.out.println("当前docid:"+scoreDoc.doc);
System.out.println("当前doc评分:"+scoreDoc.score);
//利用documentId获取源数据 拿不到Store.NO的数据
Document doc=search.doc(scoreDoc.doc);
//解析所有属性值
System.out.println("productName"+doc.get("productName"));
System.out.println("productImage"+doc.get("productImage"));
}
}
@Test
public void booleanQuery() throws Exception{
Directory dir=FSDirectory
.open(Paths.get("c://index01"));
IndexReader reader=DirectoryReader.open(dir);
IndexSearcher search=new IndexSearcher(reader);
//准备boolean条件的子条件termQuery
Query query1=new TermQuery(new Term("productName","灯泡"));
Query query2=new TermQuery(new Term("productName","小功率灯泡"));
//利用query1 2 封装子条件
BooleanClause bc1=new BooleanClause(query1,Occur.FILTER);
BooleanClause bc2=new BooleanClause(query2,Occur.MUST_NOT);
//occur决定了查询结果与当前条件的逻辑关系
/*MUST:查询结果必须包含这个条件的结果
*MUST_NOT:查询结果必须不包含这个条件的结果
*SHOULD:可包含可不包含,当他与MUST同时存在时,不生效
*FILTER:MUST效果一样的,但是通过FILTER子条件查询的结果没有评分
*/
Query query=new BooleanQuery.
Builder().add(bc1).add(bc2).build();
TopDocs topDoc = search.search(query, 10);
System.out.println("最高分:"+topDoc.getMaxScore());
System.out.println("一共获取数据:"+topDoc.totalHits);
ScoreDoc[] scoreDocs = topDoc.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
//每次循环都获取返回结果中一个document评分相关内容
System.out.println("当前docid:"+scoreDoc.doc);
System.out.println("当前doc评分:"+scoreDoc.score);
//利用documentId获取源数据 拿不到Store.NO的数据
Document doc=search.doc(scoreDoc.doc);
//解析所有属性值
System.out.println("productName"+doc.get("productName"));
System.out.println("productImage"+doc.get("productImage"));
}
}
//通过范围进行 匹配,比如京东的价格区间
@Test
public void rangeQuery() throws Exception{
Directory dir=FSDirectory
.open(Paths.get("c://index01"));
IndexReader reader=DirectoryReader.open(dir);
IndexSearcher search=new IndexSearcher(reader);
//生成对price价钱做范围查询的query
Query query=DoublePoint.newRangeQuery
("productPrice", 555, 8000);
TopDocs topDoc = search.search(query, 10);
System.out.println("最高分:"+topDoc.getMaxScore());
System.out.println("一共获取数据:"+topDoc.totalHits);
ScoreDoc[] scoreDocs = topDoc.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
//每次循环都获取返回结果中一个document评分相关内容
System.out.println("当前docid:"+scoreDoc.doc);
System.out.println("当前doc评分:"+scoreDoc.score);
//利用documentId获取源数据 拿不到Store.NO的数据
Document doc=search.doc(scoreDoc.doc);
//解析所有属性值
System.out.println("productName"+doc.get("productName"));
System.out.println("productImage"+doc.get("productImage"));
System.out.println("price"+doc.get("productPrice"));
}
}
}
// ===== Index-creation code (CreateIndex) =====
package cn.tedu.test2;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
import cn.tedu.test.IKAnalyzer6x;
/*
 * Creates the test index (at c://index01) used by the search tests.
 */
public class CreateIndex {
@Test
public void createIndex() throws Exception {
//1.准备索引文件夹
Path path = Paths.get(“c://index01”);
//2.将路径传递给lucene对象使用
FSDirectory dir = FSDirectory.open(path);
//3.封装数据doucument
Document doc1 = new Document();
Document doc2 = new Document();
//productName productImage productPrice productCat(属性名称)S
doc1.add(new TextField(“productName”, “小功率节能灯泡”, Store.YES));
doc1.add(new StringField(“productImage”, “www.image.com”, Store.YES));
doc1.add(new DoublePoint(“productPrice”, 5000));
doc1.add(new TextField(“productCat”, “家居用品”, Store.YES));doc2.add(new TextField("productName", "大功率节能灯泡",Store.YES)); doc2.add(new StringField("productImage", "www.easymall.com", Store.YES)); doc2.add(new DoublePoint("productPrice", 355)); doc2.add(new StringField("productPrice", "355元", Store.YES)); doc2.add(new TextField("productCat", "家居用品",Store.NO)); //TextField和StringField什么区别 //TextField分词,stringField不分词 //Store.YES/NO什么作用 YES存储值,NO不存储值 //数字特性的field和字符串有什么关系;数字特性和stringField配合使用才能保留值 IndexWriterConfig config = new IndexWriterConfig(new IKAnalyzer6x()); /* * create:每次调用都覆盖原有内容 * append:每次调用都将新数据追加到原有内容索引 * create_or_append:无则建,有则追加 */ config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); writer.addDocument(doc1); writer.addDocument(doc2); writer.commit();
}
}
// ===== IK analyzer utility classes (IKAnalyzer6x / IKTokenizer6x) =====
package cn.tedu.test;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Lucene {@link Analyzer} adapter for the IK Chinese word segmenter,
 * implementing the Lucene 6.x createComponents contract.
 */
public class IKAnalyzer6x extends Analyzer {
	// true = smart (coarse-grained) segmentation; false = fine-grained.
	private boolean useSmart;

	/** Defaults to the fine-grained segmentation algorithm. */
	public IKAnalyzer6x() {
		this(false);
	}

	/** @param useSmart selects smart (coarse) vs fine-grained segmentation */
	public IKAnalyzer6x(boolean useSmart) {
		super();
		this.useSmart = useSmart;
	}

	public boolean useSmart() {
		return useSmart;
	}

	public void setUseSmart(boolean useSmart) {
		this.useSmart = useSmart;
	}

	/** Builds the per-field token-stream components around an IK tokenizer. */
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		Tokenizer ikTokenizer = new IKTokenizer6x(this.useSmart);
		return new TokenStreamComponents(ikTokenizer);
	}
}
package cn.tedu.test;
import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
 * Lucene {@link Tokenizer} adapter around the IK segmenter.
 *
 * Fixes applied: the original set the token's *type* attribute to the token
 * *text* ({@code getLexemeText()}) instead of the lexeme's type string, and
 * {@code end()} did not call {@code super.end()} as the TokenStream contract
 * requires.
 */
public class IKTokenizer6x extends Tokenizer {
	// the underlying IK segmenter
	private IKSegmenter _IKImplement;
	// token text attribute
	private final CharTermAttribute termAtt;
	// token start/end offset attribute
	private final OffsetAttribute offsetAtt;
	// token type attribute
	private final TypeAttribute typeAtt;
	// end offset of the last token produced, reported by end()
	private int endPosition;

	/** @param useSmart selects smart (coarse) vs fine-grained segmentation */
	public IKTokenizer6x(boolean useSmart) {
		super();
		offsetAtt = addAttribute(OffsetAttribute.class);
		termAtt = addAttribute(CharTermAttribute.class);
		typeAtt = addAttribute(TypeAttribute.class);
		_IKImplement = new IKSegmenter(input, useSmart);
	}

	@Override
	public final boolean incrementToken() throws IOException {
		// reset all token attributes before populating the next token
		clearAttributes();
		Lexeme nextLexeme = _IKImplement.next();
		if (nextLexeme != null) {
			// copy the lexeme into the Lucene attributes
			termAtt.append(nextLexeme.getLexemeText());
			termAtt.setLength(nextLexeme.getLength());
			offsetAtt.setOffset(nextLexeme.getBeginPosition(),
					nextLexeme.getEndPosition());
			// remember the last token's end for end()
			endPosition = nextLexeme.getEndPosition();
			// the lexeme's type string, not its text
			typeAtt.setType(nextLexeme.getLexemeTypeString());
			return true; // another token is available
		}
		return false; // no more tokens
	}

	@Override
	public void reset() throws IOException {
		super.reset();
		// re-bind the segmenter to the (possibly new) input reader
		_IKImplement.reset(input);
	}

	@Override
	public final void end() throws IOException {
		// contract: call super.end() so final attribute state is set correctly
		super.end();
		int finalOffset = correctOffset(this.endPosition);
		offsetAtt.setOffset(finalOffset, finalOffset);
	}
}