Rant:
1. The video tutorial I was following targets Lucene 3.5; iterating the tokenStream with that code blows up with ArrayIndexOutOfBoundsException... damn it!!!
I'm on Lucene 4.0. A look at the source shows the iteration workflow was reworked; per the Javadoc, traversal should now go like this:
The workflow of the new TokenStream API is as follows:
1. Instantiation of TokenStream/TokenFilters which add/get attributes to/from the AttributeSource.
2. The consumer calls reset().
3. The consumer retrieves attributes from the stream and stores local references to all attributes it wants to access.
4. The consumer calls incrementToken() until it returns false, consuming the attributes after each call.
5. The consumer calls end() so that any end-of-stream operations can be performed.
6. The consumer calls close() to release any resource when finished using the TokenStream.
Summary:
I. Steps for iterating a tokenStream to read its token information (a minimal self-contained sketch follows this list):
1. Create the TokenStream
2. Add the various Attributes you want to read
3. tokenStream.reset();
4. Loop with tokenStream.incrementToken();
5. When the loop ends, tokenStream.end();
6. Close the stream: tokenStream.close();
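To make the six steps concrete, here is a minimal sketch. The field name "f" and the sample string are arbitrary placeholders; the try/finally mirrors the reset/end/close obligations from the Javadoc quoted above.
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class TokenStreamSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        // 1. create the TokenStream (the field name is just a label here)
        TokenStream stream = analyzer.tokenStream("f", new StringReader("some sample text"));
        // 2. register the attributes you want BEFORE iterating; the references stay valid
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        try {
            // 3. reset() puts the stream into a consumable state (mandatory in 4.0)
            stream.reset();
            // 4. incrementToken() advances token by token, updating the attributes in place
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            // 5. end() performs any end-of-stream work (e.g. recording the final offset)
            stream.end();
        } finally {
            // 6. close() releases the underlying resources
            stream.close();
        }
    }
}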
Notes:
1. AnalyzerUtils below is just a small sample, an exercise in how Lucene 4 iterates over tokenization results
Test code:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.junit.Test;
import util.AnalyzerUtils;

public class MyTest {

    @Test
    public void test01() {
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_40);
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_40);
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_40);
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_40);
        String txt = " I'm from China.How about you? Do you mind giving your email to me ? " +
                " my is abc@163.com";
        AnalyzerUtils.displayToken(txt, a1);
        AnalyzerUtils.displayToken(txt, a2);
        AnalyzerUtils.displayToken(txt, a3);
        AnalyzerUtils.displayToken(txt, a4);
    }

    @Test
    public void test02() {
        Analyzer a1 = new StandardAnalyzer(Version.LUCENE_40);
        Analyzer a2 = new StopAnalyzer(Version.LUCENE_40);
        Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_40);
        Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_40);
        String txt = "I'm from China.How about you? Do you mind giving your email to me?" +
                "my is abc@163.com";
        System.out.println("Original -> " + txt);
        AnalyzerUtils.displayAllTokenInfo(txt, a1);
        System.out.println("-----------------------------------\n");
        AnalyzerUtils.displayAllTokenInfo(txt, a2);
        System.out.println("-----------------------------------\n");
        AnalyzerUtils.displayAllTokenInfo(txt, a3);
        System.out.println("-----------------------------------\n");
        AnalyzerUtils.displayAllTokenInfo(txt, a4);
    }
}
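For reference, the four analyzers used above behave quite differently (standard Lucene 4 behavior): StandardAnalyzer does grammar-based tokenization, lowercases, and removes English stop words; StopAnalyzer splits on non-letter characters, lowercases, and removes stop words; SimpleAnalyzer splits on non-letter characters and lowercases; WhitespaceAnalyzer only splits on whitespace, leaving case and punctuation intact. Comparing the four displayToken lines makes these differences easy to see.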
Utility code:
package util;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class AnalyzerUtils {

    /**
     * Print the tokens produced when analyzer a tokenizes str.
     * @param str the text to tokenize
     * @param a the analyzer to use
     */
    public static void displayToken(String str, Analyzer a) {
        try {
            // tokenStream(String fieldName, Reader reader): the fieldName is arbitrary here, just a label
            TokenStream stream = a.tokenStream("abc", new StringReader(str));
            // register an attribute with the stream; it is updated as the stream advances
            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
            // reset() is mandatory before iterating in Lucene 4 (this was the source of the crash above)
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
            }
            // end() after iterating
            stream.end();
            // finally, close the stream
            stream.close();
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Print detailed token information for str as tokenized by analyzer a.
     * @param str the text to tokenize
     * @param a the analyzer to use
     */
    public static void displayAllTokenInfo(String str, Analyzer a) {
        try {
            // tokenStream(String fieldName, Reader reader): the fieldName is arbitrary here, just a label
            TokenStream stream = a.tokenStream("abc", new StringReader(str));
            // position increment: the gap between this token and the previous one
            PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
            // character offsets of the token within the original text
            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
            // the token text itself
            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
            // the type string assigned by the tokenizer, e.g. StandardTokenizer emits <ALPHANUM>; the default is "word"
            TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
            // reset() before iterating
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print(cta.toString() + "----");
                System.out.print(pia.getPositionIncrement() + " [position increment]; ");
                System.out.print(oa.startOffset() + "-" + oa.endOffset() + " [offset]; ");
                System.out.print(ta.type() + " [token type]" + "\n");
            }
            // end() after iterating
            stream.end();
            // finally, close the stream
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
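One more illustration that may help with PositionIncrementAttribute: the raw increments become intuitive once you accumulate them, since each token's absolute position is the previous position plus its increment, and analyzers that drop stop words leave an increment greater than 1 at each deletion point. The method below is a hypothetical addition to AnalyzerUtils (same imports as above), not part of the original sample:
    public static void displayPositions(String str, Analyzer a) throws IOException {
        TokenStream stream = a.tokenStream("abc", new StringReader(str));
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        int position = 0;
        while (stream.incrementToken()) {
            // accumulate increments into an absolute position; a jump > 1 marks a removed token
            position += pia.getPositionIncrement();
            System.out.println(position + " : " + cta);
        }
        stream.end();
        stream.close();
    }
Running it with StandardAnalyzer or StopAnalyzer on the sample text should show gaps in the numbering where stop words were removed, while SimpleAnalyzer and WhitespaceAnalyzer produce consecutive positions.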