浅谈文本检索工具Lucene

最新推荐文章于 2024-03-08 10:18:48 发布

chuyuan2324

最新推荐文章于 2024-03-08 10:18:48 发布

阅读量104

点赞数

原文链接：https://my.oschina.net/u/1458170/blog/1647805

版权

最近项目要用到文本检索工具，于是了解到了 Lucene。我查阅了很多 Lucene 的资料，发现它的版本比较多，而且各版本之间的 API 变动比较大，最后确定使用 Lucene 4.7.2。理论部分就不多说了，网上已有很多介绍；这里直接给出可以使用的代码。

1.首先建立索引

/**
 * Builds a Lucene index in {@code indexPath} from the given books.
 *
 * <p>The writer is opened in {@code OpenMode.CREATE}, so an index that already
 * exists in the directory is replaced rather than appended to. Any failure is
 * printed and swallowed, matching the original best-effort behaviour.
 *
 * @param indexPath directory in which the index files are stored
 * @param datas     records to index (may come from text files, tables, etc.)
 */
public static void indexBuilding(String indexPath, List<Book> datas) {
    IndexWriter indexWriter = null;
    try {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        // CREATE mode: start a new index, overwriting any existing one.
        indexWriterConfig.setOpenMode(OpenMode.CREATE);
        indexWriter = new IndexWriter(FSDirectory.open(new File(indexPath)), indexWriterConfig);
        for (Book book : datas) {
            // Convert each record to a Lucene document and write it to the index.
            indexWriter.addDocument(Document(book));
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always close the writer; otherwise a failure mid-way would leave it
        // open and the index write lock held.
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}

/**
 * Converts a {@link Book} into the Lucene document that gets indexed.
 *
 * <ul>
 *   <li>{@code name}: indexed as a {@code StringField} and stored.</li>
 *   <li>{@code title}: indexed as a {@code StringField}, not stored, so it can be
 *       searched but not read back from a hit.</li>
 *   <li>{@code author}: indexed as a {@code TextField} and stored.</li>
 * </ul>
 *
 * <p>NOTE(review): {@code StringField} indexes the whole value as a single
 * un-analyzed token, so {@code name}/{@code title} only match exact values.
 * If tokenized full-text search on {@code name} is intended, {@code TextField}
 * is probably what is wanted - confirm against the query side.
 *
 * @param book the record to convert
 * @return a new document carrying the book's name, title and author
 */
public static Document Document(Book book) {
    Document doc = new Document();
    doc.add(new StringField("name", book.getName(), Field.Store.YES));
    // The constant is Field.Store.NO (upper case).
    doc.add(new StringField("title", book.getTitle(), Field.Store.NO));
    // The author goes into its own "author" field, not a second "name" field.
    doc.add(new TextField("author", book.getAuthor(), Field.Store.YES));
    return doc;
}

//数据封装对象

/**
 * Plain data holder for one book record that is fed into the index.
 */
public class Book implements Serializable {

    private static final long serialVersionUID = 1L;

    private int id;
    private String name;
    private String title;
    private String author;

    /** Creates an empty book; populate it through the setters. */
    public Book() {
    }

    public void setId(int id) {
        this.id = id;
    }

    public int getId() {
        return id;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }

}

2.建立完索引，就可以进行查询了

/**
 * Tokenizes {@code keyword} with the IK analyzer and searches the "name" field
 * with the resulting terms.
 *
 * @param keyword raw user input
 * @return the matching results, or {@code null} when tokenization yields nothing,
 *         when there are no hits, or when any exception occurs (it is logged)
 */
public List<QueryResult> query(String keyword) {
    try {
        Date begin = new Date();
        IndexSearcher searcher = initIndexSearch();
        String tokens = IK_Analyzer(keyword);
        // Nothing usable came out of tokenization: give up early.
        if (StringUtil.isEmpty(tokens)) {
            return null;
        }
        List<QueryResult> hits = queryByOneKey(searcher, "name", tokens);
        if (CollectionUtil.isEmpty(hits)) {
            return null;
        }
        Date finish = new Date();
        long elapsedMs = finish.getTime() - begin.getTime();
        logger.info("queryByOneField 耗时：" + elapsedMs + "ms");
        for (QueryResult hit : hits) {
            logger.info("queryByOneField查询结果：" + hit.toString());
        }
        return hits;
    } catch (Exception e) {
        logger.error("queryByOneField error", e);
        return null;
    }
}

//拿到索引，并创建IndexSearcher

/**
 * Returns the shared {@code IndexSearcher}, opening the index at
 * {@code Lucene.Path.indexFilePath} on first use.
 *
 * <p>The searcher is kept in the {@code indexSearcher} field (declared outside
 * this method) and reused by later calls.
 * NOTE(review): the lazy initialization is not synchronized, and a reader
 * opened once will presumably not see later index changes - confirm whether
 * either matters for the callers.
 *
 * @return the searcher, or {@code null} if the index could not be opened
 */
protected static IndexSearcher initIndexSearch() {
    if (indexSearcher != null) {
        return indexSearcher;
    }
    try {
        DirectoryReader directoryReader =
                DirectoryReader.open(FSDirectory.open(new File(Lucene.Path.indexFilePath)));
        indexSearcher = new IndexSearcher(directoryReader);
    } catch (IOException e) {
        // Report under this method's own name; the logger already records the
        // stack trace, so it is not printed a second time.
        logger.error("initIndexSearch error", e);
        return null;
    }
    return indexSearcher;
}

//利用IK进行分词

/**
 * Tokenizes {@code str} with the IK analyzer and returns the tokens joined by
 * single spaces (each kept token is followed by one space).
 *
 * <p>Only tokens whose offset span is longer than one character are kept.
 * If an {@code IOException} occurs, the tokens collected so far are returned.
 *
 * @param str text to tokenize; {@code null} yields an empty result
 * @return the space-separated tokens, or an empty string when nothing is kept
 */
protected static String IK_Analyzer(String str) {
    if (str == null) {
        // StringReader rejects null; treat it as "no tokens".
        return "";
    }
    StringBuilder results = new StringBuilder();
    Analyzer ikAnalyzer = new IKAnalyzer();
    try {
        TokenStream tokenStream = ikAnalyzer.tokenStream("", new StringReader(str));
        try {
            // addAttribute returns the existing instance or registers a new one,
            // so it is safe whether or not the tokenizer already declared it.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                int startOffSet = offsetAttribute.startOffset();
                int endOffSet = offsetAttribute.endOffset();
                // Skip single-character tokens.
                if (endOffSet - startOffSet > 1) {
                    results.append(charTermAttribute.toString()).append(' ');
                }
            }
            // Complete the consumer workflow: end() before close().
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
    } catch (IOException e) {
        logger.error("IK_Analyzer error", e);
    } finally {
        ikAnalyzer.close();
    }
    logger.info("IK_Analyzer字典分词结果:[" + results + "]");
    return results.toString();
}

//查询结果

/**
 * Parses {@code keys} against a single field and returns the best three hits.
 *
 * <p>The query is parsed with a {@code StandardAnalyzer}, and terms are combined
 * with {@code Operator.OR} (logical OR; {@code Operator.AND} would require all
 * terms to match).
 *
 * @param indexSearcher searcher to run the query on
 * @param field         default field the query is parsed against
 * @param keys          query text
 * @return the hits converted to {@code QueryResult}, or {@code null} if an
 *         {@code IOException} occurs while searching
 * @throws ParseException if {@code keys} cannot be parsed
 */
protected static List<QueryResult> queryByOneKey(IndexSearcher indexSearcher, String field, String keys)
        throws ParseException {
    try {
        Date begin = new Date();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);
        parser.setDefaultOperator(Operator.OR);
        Query parsed = parser.parse(keys);
        // Fetch the three best-matching documents.
        TopDocs top = indexSearcher.search(parsed, 3);
        Date finish = new Date();
        long elapsedMs = finish.getTime() - begin.getTime();
        logger.info("queryByOneKey 总共为您找到 " + top.totalHits + "条记录\t耗时：" + elapsedMs + "ms");
        List<QueryResult> hits = new ArrayList<QueryResult>();
        ScoreDoc[] ranked = top.scoreDocs;
        for (int i = 0; i < ranked.length; i++) {
            ScoreDoc hit = ranked[i];
            hits.add(getIndexResult(indexSearcher.doc(hit.doc), hit.score));
        }
        return hits;
    } catch (IOException e) {
        logger.error("queryByOneKey error:{}", e);
        e.printStackTrace();
        return null;
    }
}

/**
 * Builds a {@code QueryResult} from a matched document and its score.
 *
 * <p>Only fields written with {@code Field.Store.YES} at indexing time can be
 * read back from a hit. In this file those are {@code name} and {@code author};
 * {@code title} is indexed but not stored, so it is not read here.
 *
 * @param doc   the stored document of a hit
 * @param score the relevance score of that hit
 * @return the populated result object
 */
private static QueryResult getIndexResult(Document doc, float score) {
    QueryResult indexResult = new QueryResult();
    indexResult.setName(doc.get("name"));
    indexResult.setAuthor(doc.get("author"));
    indexResult.setScore(score);
    return indexResult;
}

//结果对象封装

/**
 * One search hit: the values read back from the index plus the relevance score.
 */
public class QueryResult {

    // NOTE(review): cid and res are not written by the indexing code shown in
    // this file; they are kept so callers that use them keep compiling.
    // Their meaning is not visible here - confirm or remove.
    private String cid;
    private String res;
    private String name;
    private String title;
    private String author;
    private float score;

    public void setCid(String cid) {
        this.cid = cid;
    }

    public String getCid() {
        return cid;
    }

    public void setRes(String res) {
        this.res = res;
    }

    public String getRes() {
        return res;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }

    public void setScore(float score) {
        this.score = score;
    }

    public float getScore() {
        return score;
    }

    /** Readable form, so that logging a result shows its content. */
    @Override
    public String toString() {
        return "QueryResult{cid=" + cid
                + ", res=" + res
                + ", name=" + name
                + ", title=" + title
                + ", author=" + author
                + ", score=" + score + "}";
    }

}

3.利用IK分词器

IK分词器对中文分词的支持比较好，并且可以自行扩展词典。

自定义字典配置

/**
 * IK analyzer configuration that adds a custom extension dictionary on top of
 * the default main and quantifier dictionaries.
 */
public class MyConfig implements Configuration {

    /** Default main dictionary path of the analyzer. */
    private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";

    /** Default quantifier dictionary path of the analyzer. */
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";

    /** Whether smart segmentation is used. */
    private boolean useSmart;

    /**
     * Returns the useSmart flag: {@code true} selects the smart segmentation
     * strategy, {@code false} selects fine-grained segmentation.
     *
     * @return the useSmart flag
     */
    @Override
    public boolean useSmart() {
        return this.useSmart;
    }

    /**
     * Sets the useSmart flag: {@code true} selects the smart segmentation
     * strategy, {@code false} selects fine-grained segmentation.
     *
     * @param useSmart the new flag value
     */
    @Override
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Returns the main dictionary path.
     *
     * @return the main dictionary path
     */
    @Override
    public String getMainDictionary() {
        return PATH_DIC_MAIN;
    }

    /**
     * Returns the quantifier dictionary path.
     *
     * @return the quantifier dictionary path
     */
    @Override
    public String getQuantifierDicionary() {
        return PATH_DIC_QUANTIFIER;
    }

    /**
     * Returns the extension dictionary paths.
     *
     * <p>Author's note: this is the custom dictionary location. IK resolves the
     * file through the ClassLoader, so it is best placed at the same level as
     * this class; otherwise the extension dictionary has no effect.
     *
     * @return a list holding the single custom dictionary path
     */
    @Override
    public List<String> getExtDictionarys() {
        List<String> paths = new ArrayList<String>(2);
        paths.add(Lucene.IKAnalyzer.myDicPath);
        return paths;
    }

    /**
     * Returns the extension stop-word dictionary paths; none are configured,
     * so this returns {@code null}.
     */
    @Override
    public List<String> getExtStopWordDictionarys() {
        return null;
    }

}

/**
 * Entry point that installs the custom IK configuration.
 */
public class InitMyDictionary {

    /**
     * Initializes the IK dictionary with a new {@code MyConfig}.
     */
    public static void init() {
        MyConfig customConfig = new MyConfig();
        Dictionary.initial(customConfig);
    }

}