Lucene 4.7 already ships with CJKAnalyzer, an analyzer that handles Chinese, Japanese, and Korean text. Its CJKBigramFilter can either emit unigrams and bigrams at the same time, or emit bigrams only.
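For reference, the stock analyzer can be used directly; by default it emits bigrams only. A minimal sketch (the class name CjkAnalyzerDemo and the sample string are just illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CjkAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        // stock CJKAnalyzer: StandardTokenizer + CJKWidthFilter + LowerCaseFilter + CJKBigramFilter (bigrams only) + StopFilter
        Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_47);
        TokenStream ts = analyzer.tokenStream("f", "我是中国人");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // expected: 我是 / 是中 / 中国 / 国人
        }
        ts.end();
        ts.close();
    }
}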
However, CJKAnalyzer is a final class and cannot be customized, so its pipeline has to be copied out:
//test emitting unigrams and bigrams at the same time
private static Analyzer newCjkAnalyzer() {
    return new StopwordAnalyzerBase(Version.LUCENE_47) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            if (this.matchVersion.onOrAfter(Version.LUCENE_36)) {
                Tokenizer source = new StandardTokenizer(this.matchVersion, reader);
                TokenStream result = new CJKWidthFilter(source);
                result = new LowerCaseFilter(this.matchVersion, result);
                // 15 = HAN | HIRAGANA | KATAKANA | HANGUL; true = also output unigrams
                result = new CJKBigramFilter(result, 15, true);
                return new TokenStreamComponents(source, new StopFilter(this.matchVersion, result, this.stopwords));
            } else {
                Tokenizer source = new CJKTokenizer(reader);
                return new TokenStreamComponents(source, new StopFilter(this.matchVersion, source, this.stopwords));
            }
        }
    };
}
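If bigram-only output is wanted instead (the stock CJKAnalyzer behaviour), the same pipeline can simply pass false for outputUnigrams; the hard-coded 15 above is just CJKBigramFilter.HAN | HIRAGANA | KATAKANA | HANGUL. A sketch, assuming it sits next to newCjkAnalyzer in the same class (newBigramOnlyCjkAnalyzer is a name made up here):

//bigram-only variant of the pipeline above, kept separate for comparison
private static Analyzer newBigramOnlyCjkAnalyzer() {
    return new StopwordAnalyzerBase(Version.LUCENE_47) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new StandardTokenizer(this.matchVersion, reader);
            TokenStream result = new CJKWidthFilter(source);
            result = new LowerCaseFilter(this.matchVersion, result);
            // these flags OR together to 15, the value used in newCjkAnalyzer
            result = new CJKBigramFilter(result,
                    CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA
                            | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL,
                    false); // false = do not emit unigrams
            return new TokenStreamComponents(source, new StopFilter(this.matchVersion, result, this.stopwords));
        }
    };
}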
@Test
public void testTokenStream() throws Exception {
    //the custom analyzer
    Analyzer analyzer = newCjkAnalyzer();
    //run the analyzer over a test string
    TokenStream tokenStream = analyzer.tokenStream("test", "我是中国人The Spring Framework provides a spring programming and configuration model.");
    //attribute holding the text of each token
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    //attribute holding the start and end offset of each token
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    //rewind the stream to the beginning
    tokenStream.reset();
    //iterate over the tokens; incrementToken returns false at the end of the stream
    while (tokenStream.incrementToken()) {
        System.out.println(charTermAttribute
                + " start->" + offsetAttribute.startOffset()
                + " end->" + offsetAttribute.endOffset());
    }
    tokenStream.close();
}
Test string: 我是中国人The Spring Framework
Output:
我 start->0 end->1
我是 start->0 end->2
是 start->1 end->2
是中 start->1 end->3
中 start->2 end->3
中国 start->2 end->4
国 start->3 end->4
国人 start->3 end->5
人 start->4 end->5
the start->5 end->8
spring start->9 end->15
framework start->16 end->25
Unigrams and bigrams are emitted side by side. The segmentation itself is quite naive.
But does it actually make a difference to search results? The test below checks.
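Before the search test, a small helper makes it easy to compare analyzers on the same string; it just extracts the printing loop from the test above (printTokens is a name made up here, and it assumes the same imports as the listing below):

//print every token an analyzer produces for a string, with start/end offsets
private static void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("test", text);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term + " start->" + offset.startOffset() + " end->" + offset.endOffset());
    }
    ts.end();
    ts.close();
}

Calling it as printTokens(new StandardAnalyzer(Version.LUCENE_47), "我是中国人") should print one token per Han character, since StandardTokenizer emits each ideograph as its own token; that behaviour is exactly what the search test below exercises.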
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.jupiter.api.Test;
import org.knziha.metaline.Metaline; // Metaline module, for defining multi-line strings in code
import test.CMN; // logging/printing helper class
public static void main(String[] args) throws IOException, ParseException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    // analyzer = newCjkAnalyzer();
    Directory index = FSDirectory.open(new File("G:/lucene-demo-index")); // not sure how to use MemoryIndex, so index on disk
    // indexing
    // 1 create index-writer
    String[][] entries = new String[][]{
            new String[]{"0", ""}
            , new String[]{"1", "人民可以得到更多实惠"}
            , new String[]{"2", "中国人民银行"}
            , new String[]{"2", "洛杉矶人,洛杉矶居民"}
            , new String[]{"2", "民族,人民"}
            , new String[]{"2", "工人居民"}
    };
    // 2 write index
    if (true)
    {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        config.setOpenMode(OpenMode.CREATE);
        CMN.rt();
        IndexWriter writer = new IndexWriter(index, config);
        for (int i = 0; i < entries.length; i++) {
            Document doc = new Document();
            doc.add(new TextField("entry", entries[i][0], Field.Store.YES));
            doc.add(new StringField("bookName", "NAME", Field.Store.YES));
            // CMN.Log(text);
            doc.add(new TextField("content", entries[i][1], Field.Store.YES));
            writer.addDocument(doc);
        }
        writer.close();
        CMN.pt("indexing time: ");
    }
    // search
    if (true)
    {
        CMN.rt();
        IndexReader reader = DirectoryReader.open(index);
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new QueryParser(Version.LUCENE_47, "content", analyzer).parse("人民");
        CMN.Log("query: ", query);
        int hitsPerPage = 100;
        // 3 do search
        TopDocs docs = searcher.search(query, hitsPerPage);
        ScoreDoc[] hits = docs.scoreDocs;
        CMN.Log("found " + hits.length + " results", docs.totalHits);
        QueryScorer scorer = new QueryScorer(query); //used to pick the highest-scoring fragment (the summary)
        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
        highlighter.setTextFragmenter(fragmenter);
        for (ScoreDoc hit : hits) {
            CMN.Log("<br/><br/>\r\n");
            int docId = hit.doc;
            Document doc = searcher.doc(docId);
            String text = doc.get("content");
            //CMN.Log("<h1 class='title'><a href=''>"+doc.get("entry")+"</a> </h1>");
            if (text != null) {
                //if(false)
                try {
                    String bookName = doc.get("bookName");
                    //bookName = "简明英汉汉英词典";
                    String dt = "<span class='dt'>" + bookName + "</span>";
                    /*highlight the highest-weighted fragment*/
                    TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(text));
                    String str = highlighter.getBestFragment(tokenStream, text);
                    CMN.Log("<div class='preview'>"/*+dt*/ + str + (" (" + hit.score + ") ") + "</div>");
                    continue;
                } catch (InvalidTokenOffsetsException e) {
                    e.printStackTrace();
                }
                CMN.Log("<br/>---first 15 chars: ", text.substring(0, Math.min(15, text.length())));
            }
        }
        CMN.pt("search time: ");
    }
}
Test results:
StandardAnalyzer
民族,人民 (0.85355335)
工人居民 (0.70710677)
中国人民银行 (0.53033006)
人民可以得到更多实惠 (0.44194174)
洛杉矶人,洛杉矶居民 (0.44194174)

newCjkAnalyzer
民族,人民 (1.1007859)
中国人民银行 (0.7476838)
人民可以得到更多实惠 (0.62306976)
工人居民 (0.5015489)
洛杉矶人,洛杉矶居民 (0.31346804)

CJKAnalyzer
民族,人民 (0.8784157)
中国人民银行 (0.614891)
人民可以得到更多实惠 (0.43920785)
Searching for the keyword 人民, StandardAnalyzer surprisingly ranks the entry 工人居民 ahead of entries such as 中国人民银行.
So although CJKAnalyzer's segmentation is fairly crude, it does help when searching for two-character words: the ranking is no longer as messy as with StandardAnalyzer. Emitting unigrams alongside bigrams also retrieves more results (the bigram-only CJKAnalyzer misses 工人居民 and 洛杉矶人,洛杉矶居民, which contain 人 and 民 separately but never the bigram 人民).
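The ranking difference can also be seen directly from the parsed queries. A hedged sketch (printQuery is a name made up here; it assumes the same class and imports as the test code above):

//show how an analyzer turns the query string into Lucene terms
private static void printQuery(String label, Analyzer analyzer, String queryText) throws ParseException {
    Query query = new QueryParser(Version.LUCENE_47, "content", analyzer).parse(queryText);
    System.out.println(label + " -> " + query);
}

For "人民", StandardAnalyzer is expected to yield the two unigram terms content:人 and content:民, so any document containing either character (such as 工人居民) can score well, whereas the bigram analyzers yield a query containing the term content:人民, which only matches whole two-character words.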