java鬼混笔记：lucene 3、单词分法，二分法，停用词

最新推荐文章于 2022-09-27 10:21:44 发布

最新推荐文章于 2022-09-27 10:21:44 发布

阅读量409

点赞数

分类专栏： lucene 文章标签： lucene二分法 lucene停用词 lucene单词分法

本文链接：https://blog.csdn.net/u013845177/article/details/78129115

版权

lucene 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

这次的笔记是玩玩lucene自带的两个分词器：StandardAnalyzer（单字分词，中文按一个字一个词切分）和CJKAnalyzer（二元分词，相邻两个字组成一个词），以及它们对停用词的处理（停用词：分词时会被过滤掉、不会出现在分词结果中的词）。

上代码：

package cn;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/**
 * Small console demo of two analyzers bundled with Lucene --
 * {@link StandardAnalyzer} and {@link CJKAnalyzer} -- and of their default
 * and user-defined stop-word sets.
 *
 * <p>Each step prints the produced tokens, one per line, to standard output.
 */
public class fenci {

	/**
	 * Runs the demo: tokenizes a Chinese sentence with both analyzers, prints
	 * the default stop-word sets, then shows the effect of a custom stop-word
	 * set on an English sentence.
	 *
	 * @param args unused
	 * @throws IOException if a token stream or analyzer fails while reading or closing
	 */
	public static void main(String[] args) throws IOException {

		String txt = "我是中国人，爱和平爱团结";

		// Single-character splitting: StandardAnalyzer emits every CJK character
		// as its own token (English text would come out one word per token).
		printTokens(new StandardAnalyzer(Version.LUCENE_40), txt);
		/* Console output:
		我
		是
		中
		国
		人
		爱
		和
		平
		爱
		团
		结*/

		// Bigram splitting: CJKAnalyzer pairs each character with its neighbour,
		// so the first clause becomes the four overlapping two-character tokens
		// listed below.
		printTokens(new CJKAnalyzer(Version.LUCENE_40), txt);
		/* Console output:
		我是
		是中
		中国
		国人
		爱和
		和平
		平爱
		爱团
		团结*/

		// Stop words: terms the analyzer drops from its output.
		CharArraySet cas = StandardAnalyzer.STOP_WORDS_SET;
		// The default set is entirely English.
		System.out.println("StandardAnalyzer:"+cas);

		cas = CJKAnalyzer.getDefaultStopSet();
		System.out.println("CJKAnalyzer:"+cas);
		/* Console output from the original run (element order presumably depends
		   on the set's internal hashing and may differ between runs/versions):
		StandardAnalyzer:[but, be, with, such, then, for, no, will, not, are, and, their, if, this, on, into, a, or, there, in, that, they, was, is, it, an, the, as, at, these, by, to, of]
		CJKAnalyzer:[but, be, with, such, if, for, no, will, not, are, and, their, then, this, on, into, a, or, there, in, that, they, was, is, it, at, the, as, s, t, these, by, to, of, www]*/

		// Custom stop words: the set can be extended, e.g. to filter profanity
		// or other unwanted terms.
		CharArraySet casDiy = new CharArraySet(Version.LUCENE_40, 0, true);
		casDiy.add("fuck");// two easily spotted words, so the effect shows up in the console
		casDiy.add("shit");
		casDiy.addAll(StandardAnalyzer.STOP_WORDS_SET);// keep the default stop words as well

		// Sample sentence (deliberately ungrammatical) used for both runs below.
		String sample = "apple fuck android, wp shit ";

		// Before: the default stop set lets all five words through.
		printTokens(new StandardAnalyzer(Version.LUCENE_40), sample);
		/* Console output: five lines -- apple, the first custom word, android,
		   wp, the second custom word. */

		// After: with the custom stop set the two added words are removed.
		printTokens(new StandardAnalyzer(Version.LUCENE_40, casDiy), sample);
		/* Console output:
		apple
		android
		wp*/
	}

	/**
	 * Tokenizes {@code text} with {@code analyzer} and prints each term on its
	 * own line, then closes both the token stream and the analyzer.
	 *
	 * <p>Follows the consumer workflow documented for {@link TokenStream}:
	 * register the attribute, {@code reset()}, loop on
	 * {@code incrementToken()}, {@code end()}, {@code close()}. The original
	 * code skipped {@code reset()}, {@code end()} and the stream's
	 * {@code close()}.
	 *
	 * @param analyzer analyzer to run; this method takes ownership and closes it
	 * @param text     text to tokenize
	 * @throws IOException if the token stream fails while reading or closing
	 */
	private static void printTokens(Analyzer analyzer, String text) throws IOException {
		try {
			TokenStream stream = analyzer.tokenStream("content", new StringReader(text));
			try {
				// Register the attribute once; the same instance is updated in
				// place on every incrementToken() call.
				CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
				stream.reset();
				while (stream.incrementToken()) {
					System.out.println(term.toString());
				}
				stream.end();
			} finally {
				// Release the stream even when tokenization throws.
				stream.close();
			}
		} finally {
			analyzer.close();
		}
	}

}