Lucene-分词器

最新推荐文章于 2021-08-10 12:12:02 发布

南栀_倾寒

最新推荐文章于 2021-08-10 12:12:02 发布

阅读量483

点赞数

分类专栏： lucene

本文链接：https://blog.csdn.net/xinxinqiu/article/details/21334333

版权

lucene 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

/**
	 * Prints every term that the given analyzer produces for the given text,
	 * one term per line on standard output.
	 *
	 * <p>The stream is consumed following the TokenStream consumer workflow:
	 * obtain the attribute, {@code reset()}, iterate with
	 * {@code incrementToken()}, {@code end()}, and finally {@code close()}.
	 *
	 * @param analyzer the analyzer used to tokenize {@code text}
	 * @param text     the text to tokenize
	 * @throws Exception if creating or consuming the token stream fails
	 */
	private void testAnalyzer(Analyzer analyzer, String text) throws Exception {
		TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
		try {
			// addAttribute returns the attribute instance, so it is fetched once
			// instead of being looked up again on every token.
			TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				System.out.println(termAttribute.term());
			}
			// Lets the stream perform its end-of-stream operations.
			tokenStream.end();
		} finally {
			// Always release the stream, even if tokenization throws.
			tokenStream.close();
		}
	}

/**
 * Runs several Lucene analyzers over sample text and prints the resulting
 * terms, so that their tokenization can be compared by eye.
 *
 * <p>Covers one English case and three Chinese cases; the focus is mainly on
 * Chinese text.
 *
 * @author Administrator
 */
public class AnalyzerTest {
	/**
	 * Tokenizes an English sentence with {@code StandardAnalyzer}.
	 *
	 * <p>Per the original author's notes, the analyzer is expected to:
	 * <ol>
	 *   <li>split the text into keywords,</li>
	 *   <li>drop stop words,</li>
	 *   <li>convert upper case to lower case.</li>
	 * </ol>
	 */
	@Test
	public void testEn() throws Exception{
		String text = "Creates a searcher searching the index in the named directory";
		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
		this.testAnalyzer(analyzer, text);
	}
	
	/**
	 * Tokenizes mixed Chinese/Latin text with {@code ChineseAnalyzer}
	 * (single-character segmentation, per the original author's note).
	 */
	@Test
	public void testZH() throws Exception{
		Analyzer analyzer = new ChineseAnalyzer();
		String text = "传智播客的黎活明是UFO";
		this.testAnalyzer(analyzer, text);
	}
	
	/**
	 * Tokenizes the same text with {@code CJKAnalyzer}.
	 *
	 * <p>NOTE(review): the original comment here said "single-character
	 * segmentation", but CJKAnalyzer is documented as emitting overlapping
	 * two-character tokens (bigrams) for CJK text — confirm against the output.
	 */
	@Test
	public void testZH2() throws Exception{
		Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
		String text = "传智播客的黎活明是UFO";
		this.testAnalyzer(analyzer, text);
	}
	
	/**
	 * Tokenizes a short Chinese phrase with {@code IKAnalyzer}.
	 */
	@Test
	public void testZH3() throws Exception{
		Analyzer analyzer = new IKAnalyzer();
		String text = "北京美女";
		this.testAnalyzer(analyzer, text);
	}

	/**
	 * Prints every term that the given analyzer produces for the given text,
	 * one term per line on standard output.
	 *
	 * <p>The stream is consumed following the TokenStream consumer workflow:
	 * obtain the attribute, {@code reset()}, iterate with
	 * {@code incrementToken()}, {@code end()}, and finally {@code close()}.
	 *
	 * @param analyzer the analyzer used to tokenize {@code text}
	 * @param text     the text to tokenize
	 * @throws Exception if creating or consuming the token stream fails
	 */
	private void testAnalyzer(Analyzer analyzer, String text) throws Exception {
		TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
		try {
			// addAttribute returns the attribute instance, so it is fetched once
			// instead of being looked up again on every token.
			TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				System.out.println(termAttribute.term());
			}
			// Lets the stream perform its end-of-stream operations.
			tokenStream.end();
		} finally {
			// Always release the stream, even if tokenization throws.
			tokenStream.close();
		}
	}
}