Lucene4.10.4版本 IKAnalyzer 中文分词

最新推荐文章于 2021-08-10 12:12:02 发布

bear_cab

最新推荐文章于 2021-08-10 12:12:02 发布

阅读量637

点赞数

本文链接：https://blog.csdn.net/phone13144830339/article/details/79048007

版权

pom.xml

<!--
  Build descriptor for the Lucene 4.10.4 + IK Analyzer demo web application.
  Dependency coordinates are written without surrounding whitespace so that
  they match exactly for every tool that reads this file.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>cn.et</groupId>
  <artifactId>LuceneIkAnalyzer</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>1.5.9.RELEASE</version>
  </parent>
  <properties>
    <!-- Single version shared by all Lucene modules below. -->
    <lucene.version>4.10.4</lucene.version>
  </properties>
  <dependencies>
    <!-- Version is inherited from the Spring Boot parent. -->
    <dependency>
      <groupId>org.springframework.boot</groupId>
      <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-highlighter</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <!-- IK Analyzer (Chinese word segmentation). -->
    <dependency>
      <groupId>com.janeluo</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>2012_u6</version>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">  
<properties>  
    <comment>IK Analyzer 扩展配置</comment>
    <!-- Users can configure their own extension dictionaries here -->
    <entry key="ext_dict">ext.dic;</entry> 
    <!-- Users can configure their own extension stop-word dictionaries here -->
    <entry key="ext_stopwords">stopword.dic;chinese_stopword.dic</entry>
</properties>

ext.dic

博客
QQ
Phone
Email

chinese_stopword.dic

喔
啊
呀
吐
哈
嗯
啪
呼
噗
嗨
嘿

stopword.dic

a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
such
that
the
their
then
there
these
they
this
to
was
will
with

package cn.et;

import org.springframework.boot.SpringApplication;  
import org.springframework.boot.autoconfigure.SpringBootApplication;  
/**
 * Entry point of the demo: starts Spring Boot with this class as the
 * application source.
 */
@SpringBootApplication
public class SpringBootMain {

    /**
     * Launches the application.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(SpringBootMain.class);
        application.run(args);
    }
}

package cn.et.control;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * REST controller that splits submitted text into tokens with IK Analyzer
 * and returns them as an HTML fragment, one token per line.
 */
@RestController
public class LueneIkControl {

	/**
	 * Analyzer instance shared by all requests instead of allocating (and
	 * never closing) a new one per call. Lucene's Analyzer is intended to be
	 * reused; its token-stream components are cached per thread.
	 */
	private final Analyzer ikAnalyzer = new IKAnalyzer();

	/**
	 * Tokenizes {@code str} with the given analyzer.
	 *
	 * <p>The stream is consumed in the order Lucene requires:
	 * {@code reset()}, repeated {@code incrementToken()}, {@code end()},
	 * then {@code close()} (performed by try-with-resources).
	 *
	 * @param str      text to tokenize; {@code null} yields an empty list
	 * @param analyzer analyzer that produces the token stream
	 * @return the tokens in stream order; on an I/O failure the tokens
	 *         collected so far are returned
	 */
	public List<String> getWords(String str, Analyzer analyzer) {
		List<String> result = new ArrayList<String>();
		if (str == null) {
			// A missing request parameter arrives here as null; StringReader
			// would throw NullPointerException on it.
			return result;
		}
		try (TokenStream stream = analyzer.tokenStream("content", new StringReader(str))) {
			CharTermAttribute attr = stream.addAttribute(CharTermAttribute.class);
			stream.reset();
			while (stream.incrementToken()) {
				result.add(attr.toString());
			}
			// Signals end-of-stream before close(), as the TokenStream workflow requires.
			stream.end();
		} catch (IOException e) {
			// Best effort, as before: report the failure and return what was collected.
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Handles {@code /getIkResult}: tokenizes the {@code content} request
	 * parameter and renders each token followed by {@code <br/>}.
	 *
	 * @param content text to segment; may be {@code null} when the parameter is absent
	 * @return HTML fragment with one escaped token per line, or an empty
	 *         string when there are no tokens
	 */
	@RequestMapping("/getIkResult")
	public String getIkResult(String content) {
		StringBuilder html = new StringBuilder();
		for (String token : getWords(content, ikAnalyzer)) {
			System.out.println(token);
			// Tokens originate from request input, so escape them before
			// embedding them in markup.
			html.append(escapeHtml(token)).append("<br/>");
		}
		return html.toString();
	}

	/** Replaces the HTML-significant characters of {@code text} with entities. */
	private static String escapeHtml(String text) {
		StringBuilder escaped = new StringBuilder(text.length());
		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			switch (c) {
				case '&':
					escaped.append("&amp;");
					break;
				case '<':
					escaped.append("&lt;");
					break;
				case '>':
					escaped.append("&gt;");
					break;
				case '"':
					escaped.append("&quot;");
					break;
				case '\'':
					escaped.append("&#39;");
					break;
				default:
					escaped.append(c);
			}
		}
		return escaped.toString();
	}
}

form.css

/* Base look shared by every form control. */
input {
  margin: 12px;
  font-size: 15px;
  color: SaddleBrown;
  vertical-align: middle;
  padding: 5px;
  font-family: 楷体;
  font-weight: bold;
}

/* Single-line text and password boxes. */
[type=text],
[type=password] {
  width: 180px;
  height: 30px;
  border-radius: 5px;
}

/* Submit and reset buttons. */
[type=submit],
[type=reset] {
  width: 80px;
  padding: 5px;
  background: Crimson;
  color: white;
  font-weight: bold;
  font-family: 楷体;
  border-radius: 7px;
}

/* Page-wide typography. */
body {
  font-weight: bold;
  font-family: 楷体;
}

/* Links: no underline, one color per state. */
a {
  text-decoration: none;
}

a:link {
  color: blueViolet;
}

a:visited {
  color: brown;
}

a:hover {
  color: orangeRed;
}

a:active {
  color: dimGray;
}

index.html

<html>
	<head>
		<meta http-equiv="content-type" content="text/html;charset=UTF-8" />
		<title>Lucene IKAnalyzer</title>
		<link rel="stylesheet" href="form.css" type="text/css">
		<script type="text/javascript">
			// Validation hook used by the form's onsubmit attribute.
			// Returns false (cancelling the submit) when the text box is
			// empty or contains only whitespace.
			function submitForm() {
				var field = document.getElementsByName("content")[0];
				return field.value.replace(/^\s+|\s+$/g, "") !== "";
			}
		</script>
	</head>
	<body>
		<!-- Sends the text to /getIkResult as the "content" query parameter. -->
		<form align="center" action="/getIkResult" method="get" onsubmit="return submitForm()">
			中  文<input type="text" name="content" placeholder="请输入中文"/><br> 
			<input type="submit" value="分词" />
			<input type="reset" value="重置"/><br/>
		</form>
	</body>
</html>