lucene常用分词器——代码

IKAnalyzer需要自己去下载对应版本的jar包

package com.team.lucene;

import java.io.StringReader;

import lombok.extern.slf4j.Slf4j;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * @ClassName:AnalyzerTest.java
 * @Description: 分词器学习
 * @author gaoguangjin
 * @Date 2015-7-14 上午9:49:40
 */
@Slf4j
public class AnalyzerTest {
    public static void main(String[] args) {
        String content = "gaoguangjin is handsome boy 刘德华没有高广金帅气";
        // 标准分词器
        Analyzer analyzer1 = new StandardAnalyzer(Version.LUCENE_40);
        // 简单分词器
        Analyzer analyzer2 = new SimpleAnalyzer(Version.LUCENE_40);
        // 二元切分
        Analyzer analyzer3 = new CJKAnalyzer(Version.LUCENE_40);
        // 中文语意分词 需要自己引入IKAnalyzer架包
        Analyzer analyzer4 = new IKAnalyzer(true);

        display(analyzer1, "StandardAnalyzer", content);
        display(analyzer2, "SimpleAnalyzer", content);
        display(analyzer3, "CJKAnalyzer", content);
        display(analyzer4, "IKAnalyzer", content);
    }

    /**
     * @Description: 展示
     * @param analyzer
     * @param analyzerName
     * @return:void
     */
    private static void display(Analyzer analyzer, String analyzerName, String content) {
        try {
            TokenStream tokenstream = analyzer.tokenStream("content", new StringReader(content));

            // 为token设置属性类
            CharTermAttribute termAttribute = tokenstream.addAttribute(CharTermAttribute.class);
            // 重新设置
            tokenstream.reset();
            log.info("*******************" + analyzerName + "开始分词********************");
            // 遍历得到token
            while (tokenstream.incrementToken()) {
                log.info(new String(termAttribute.buffer(), 0, termAttribute.length()) + "  ");
            }
        } catch (Exception e) {
            log.error(analyzerName + "分词失败!" + e.getLocalizedMessage());
        }

    }
}

打印

2015-07-14 13:30:42,623  INFO [main] (AnalyzerTest.java:55) - *******************StandardAnalyzer开始分词********************
2015-07-14 13:30:42,627  INFO [main] (AnalyzerTest.java:58) - gaoguangjin  
2015-07-14 13:30:42,628  INFO [main] (AnalyzerTest.java:58) - handsome  
2015-07-14 13:30:42,628  INFO [main] (AnalyzerTest.java:58) - boy  
2015-07-14 13:30:42,630  INFO [main] (AnalyzerTest.java:58) - 刘  
2015-07-14 13:30:42,630  INFO [main] (AnalyzerTest.java:58) - 德  
2015-07-14 13:30:42,631  INFO [main] (AnalyzerTest.java:58) - 华  
2015-07-14 13:30:42,631  INFO [main] (AnalyzerTest.java:58) - 没  
2015-07-14 13:30:42,632  INFO [main] (AnalyzerTest.java:58) - 有  
2015-07-14 13:30:42,632  INFO [main] (AnalyzerTest.java:58) - 高  
2015-07-14 13:30:42,632  INFO [main] (AnalyzerTest.java:58) - 广  
2015-07-14 13:30:42,632  INFO [main] (AnalyzerTest.java:58) - 金  
2015-07-14 13:30:42,632  INFO [main] (AnalyzerTest.java:58) - 帅  
2015-07-14 13:30:42,632  INFO [main] (AnalyzerTest.java:58) - 气  
2015-07-14 13:30:42,634  INFO [main] (AnalyzerTest.java:55) - *******************SimpleAnalyzer开始分词********************
2015-07-14 13:30:42,634  INFO [main] (AnalyzerTest.java:58) - gaoguangjin  
2015-07-14 13:30:42,637  INFO [main] (AnalyzerTest.java:58) - is  
2015-07-14 13:30:42,637  INFO [main] (AnalyzerTest.java:58) - handsome  
2015-07-14 13:30:42,637  INFO [main] (AnalyzerTest.java:58) - boy  
2015-07-14 13:30:42,637  INFO [main] (AnalyzerTest.java:58) - 刘德华没有高广金帅气  
2015-07-14 13:30:42,643  INFO [main] (AnalyzerTest.java:55) - *******************CJKAnalyzer开始分词********************
2015-07-14 13:30:42,643  INFO [main] (AnalyzerTest.java:58) - gaoguangjin  
2015-07-14 13:30:42,643  INFO [main] (AnalyzerTest.java:58) - handsome  
2015-07-14 13:30:42,644  INFO [main] (AnalyzerTest.java:58) - boy  
2015-07-14 13:30:42,644  INFO [main] (AnalyzerTest.java:58) - 刘德  
2015-07-14 13:30:42,644  INFO [main] (AnalyzerTest.java:58) - 德华  
2015-07-14 13:30:42,644  INFO [main] (AnalyzerTest.java:58) - 华没  
2015-07-14 13:30:42,644  INFO [main] (AnalyzerTest.java:58) - 没有  
2015-07-14 13:30:42,645  INFO [main] (AnalyzerTest.java:58) - 有高  
2015-07-14 13:30:42,645  INFO [main] (AnalyzerTest.java:58) - 高广  
2015-07-14 13:30:42,645  INFO [main] (AnalyzerTest.java:58) - 广金  
2015-07-14 13:30:42,645  INFO [main] (AnalyzerTest.java:58) - 金帅  
2015-07-14 13:30:42,645  INFO [main] (AnalyzerTest.java:58) - 帅气  
useSmart = true
2015-07-14 13:30:42,871  INFO [main] (AnalyzerTest.java:55) - *******************IKAnalyzer开始分词********************
2015-07-14 13:30:42,884  INFO [main] (AnalyzerTest.java:58) - gaoguangjin  
2015-07-14 13:30:42,884  INFO [main] (AnalyzerTest.java:58) - is  
2015-07-14 13:30:42,885  INFO [main] (AnalyzerTest.java:58) - handsome  
2015-07-14 13:30:42,885  INFO [main] (AnalyzerTest.java:58) - boy  
2015-07-14 13:30:42,885  INFO [main] (AnalyzerTest.java:58) - **刘德华**  
2015-07-14 13:30:42,885  INFO [main] (AnalyzerTest.java:58) - 没  
2015-07-14 13:30:42,885  INFO [main] (AnalyzerTest.java:58) - 有  
2015-07-14 13:30:42,885  INFO [main] (AnalyzerTest.java:58) - 高  
2015-07-14 13:30:42,885  INFO [main] (AnalyzerTest.java:58) - 广  
2015-07-14 13:30:42,886  INFO [main] (AnalyzerTest.java:58) - 金  
2015-07-14 13:30:42,886  INFO [main] (AnalyzerTest.java:58) - 帅  
2015-07-14 13:30:42,886  INFO [main] (AnalyzerTest.java:58) - 气  
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值