Using HanLP tokenization with Lucene

Maven dependencies:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>ff</groupId>
  <artifactId>dd</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>dd</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <lucene.version>5.0.0</lucene.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>${lucene.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>${lucene.version}</version>
    </dependency>

    <!-- Analyzers -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-smartcn</artifactId>
      <version>${lucene.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>${lucene.version}</version>
    </dependency>

    <!-- HanLP Lucene plugin: provides HanLPAnalyzer and HanLPIndexAnalyzer -->
    <dependency>
      <groupId>com.hankcs.nlp</groupId>
      <artifactId>hanlp-lucene-plugin</artifactId>
      <version>1.1.2</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
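
With these dependencies in place, the HanLP core library should be available transitively through the plugin. Before wiring it into Lucene, a quick standalone smoke test can confirm the segmenter itself works; this is a minimal sketch, and the class name and sample sentence are illustrative, not from the original post:

import com.hankcs.hanlp.HanLP;

// Hypothetical demo class: verifies HanLP segmentation independently of Lucene
public class HanLPSmokeTest {
    public static void main(String[] args) {
        // Prints a List<Term>; with a stock model this looks roughly like [商品/n, 和/cc, 服务/vn]
        System.out.println(HanLP.segment("商品和服务"));
    }
}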

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

import com.hankcs.lucene.HanLPAnalyzer;
import com.hankcs.lucene.HanLPIndexAnalyzer;

// Demo harness (class name arbitrary)
public class AnalyzerDemo {
    public static void main(String[] args) throws ParseException, IOException {
        long time = System.currentTimeMillis();

        String text = "以前发布过HanLP的Lucene插件,后来很多人跟我说山东人比武汉人听说过吃一线,长一智更好其实Solr更流行(反正我是觉得既然Solr是Lucene的子项目,那么稍武汉轻工大学微改改配置就能红安以及黄石路支持Solr),于是就抽空做了个Solr插件出来,开源在Github上,欢迎改进来自王宝强";

        // Standard analyzer: keeps long words whole instead of splitting them further
        Analyzer analyzer = new HanLPAnalyzer();
        TokenStream ts = analyzer.tokenStream("field", text);
        ts.reset();
        while (ts.incrementToken()) {
            CharTermAttribute attribute = ts.getAttribute(CharTermAttribute.class);        // term text of the token
            OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class);      // start/end character offsets
            PositionIncrementAttribute positionIncrementAttribute =
                    ts.getAttribute(PositionIncrementAttribute.class);                     // position increment
            System.out.println(attribute + "  "
                    + offsetAttribute.startOffset() + "  " + offsetAttribute.endOffset() + "  "
                    + positionIncrementAttribute.getPositionIncrement());
        }
        ts.end();
        ts.close();

        // Index analyzer: fully splits long words into all indexable sub-terms
        Analyzer indexAnalyzer = new HanLPIndexAnalyzer();
        TokenStream indexTs = indexAnalyzer.tokenStream("field", text);
        indexTs.reset();
        while (indexTs.incrementToken()) {
            CharTermAttribute attribute = indexTs.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = indexTs.getAttribute(OffsetAttribute.class);
            PositionIncrementAttribute positionIncrementAttribute =
                    indexTs.getAttribute(PositionIncrementAttribute.class);
            System.out.println(attribute + "  "
                    + offsetAttribute.startOffset() + "  " + offsetAttribute.endOffset() + "  "
                    + positionIncrementAttribute.getPositionIncrement());
        }
        indexTs.end();
        indexTs.close();

        // Inspect each analyzer's segmentation by parsing the text with QueryParser
        QueryParser queryParser = new QueryParser("txt", analyzer);
        Query query = queryParser.parse(text);
        System.out.println(query.toString("txt"));

        queryParser = new QueryParser("txt", indexAnalyzer);
        query = queryParser.parse(text);
        System.out.println(query.toString("txt"));

        System.out.println(System.currentTimeMillis() - time);
    }
}

Output of HanLPAnalyzer (columns: term, start offset, end offset, position increment); the listing is truncated:

以前  0  2  1
发布  2  4  1
过  4  5  1
HanLP  5  10  1
的  10  11  1
Lucene  11  17  1
插件  17  19  1
,  19  20  1
后来  20  22  1
很多  22  24  1
人  24  25  1
跟  25  26  1
我  26  27  1
说  27  28  1
山东  28  30  1
人  30  31  1
比  31  32  1
武汉  32  34  1
人  34  35  1
听说  35  37  1
过  37  38  1
吃  38  39  1
一线  39  41  1
,  41  42  1
长一智  42  45  1
更好  45  47  1
其实  47  49  1
Solr  49  53  1
更  53  54  1
流行  54  56  1
(  56  57  1
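
The demo above only prints token streams. To put HanLP behind real search, hand the analyzer to an IndexWriter and reuse it at query time. Below is a minimal sketch against the Lucene 5.0 API; the class name, field name, and sample document are illustrative assumptions, not from the original post. Using the same analyzer on both the index and query side keeps query terms aligned with indexed terms:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import com.hankcs.lucene.HanLPIndexAnalyzer;

// Hypothetical end-to-end example: index one document with HanLP, then search it
public class HanLPIndexDemo {
    public static void main(String[] args) throws Exception {
        // HanLPIndexAnalyzer at index time emits all sub-terms of long words, improving recall
        Analyzer analyzer = new HanLPIndexAnalyzer();
        Directory dir = new RAMDirectory();  // in-memory index, for demonstration only

        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
        Document doc = new Document();
        doc.add(new TextField("txt", "以前发布过HanLP的Lucene插件", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();

        // Query with the same analyzer so the query terms match what was indexed
        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
        Query query = new QueryParser("txt", analyzer).parse("插件");
        for (ScoreDoc sd : searcher.search(query, 10).scoreDocs) {
            System.out.println(searcher.doc(sd.doc).get("txt"));
        }
    }
}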
Article originally published on the blog of the WeChat official account 火炎一笑倾城.

Reposted from: https://my.oschina.net/u/3984125/blog/2964182
