1. To build a custom tokenizer, first understand how Analyzer, Tokenizer, and TokenFilter relate to each other
An Analyzer is built from two core components: a Tokenizer and one or more TokenFilters. The difference between them is that the former processes the stream at the character level, while the latter processes it at the token level. The Tokenizer is the first stage of an Analyzer, and its constructor takes a Reader as a parameter; a TokenFilter acts like an interceptor on the resulting stream, and its constructor takes a TokenStream (or a Tokenizer, which is itself a TokenStream).
For a detailed walkthrough, see "Lucene源码解析–Analyzer之Tokenizer".
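To make the chain concrete, here is a minimal sketch of how a Tokenizer and a TokenFilter compose inside an Analyzer. It uses only stock Lucene 3.6 classes; the class name LowerCaseExampleAnalyzer is mine, not from the original post:
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
public class LowerCaseExampleAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // the Tokenizer consumes the Reader character by character and produces tokens
        TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_36, reader);
        // the TokenFilter wraps the TokenStream and transforms whole tokens
        return new LowerCaseFilter(Version.LUCENE_36, stream);
    }
}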
2. Define your own Analyzer: MyTokenSameWordAnalyzer
- Note that I am using the Lucene 3.6.2 jar, which ships with no Chinese tokenizer, so I added mmseg4j-all-1.8.5.jar; its MMSegAnalyzer provides Chinese word segmentation.
- We therefore model our MyTokenSameWordAnalyzer class on that MMSegAnalyzer class.
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
public class MyTokenSameWordAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // the segmentation dictionary shipped under the mmseg4j source package
        Dictionary dic = Dictionary.getInstance("F:\\jar包\\java各种jar包集合\\mmseg\\mmseg4j-1.8.5\\data");
        // MMSegTokenizer's newSeg() is protected, so we construct the Seg ourselves with
        // new MaxWordSeg(dic); the dictionary otherwise falls back to a default location,
        // which is why we point it at our own data directory explicitly
        return new MyTokenSameWordFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
    }
}
3. Define your own Filter: MyTokenSameWordFilter
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
public class MyTokenSameWordFilter extends TokenFilter {
    private static Map<String, String[]> mp = new HashMap<String, String[]>();
    private CharTermAttribute cta = null;
    private PositionIncrementAttribute pia = null;
    private Stack<String> stack = null; // Stack is a subclass of Vector implementing a standard last-in-first-out stack
    private AttributeSource.State current;
    static {
        File file = new File("D:\\LuceneData\\local05\\samewords"); // my own synonym dictionary directory
        if (file.exists()) {
            File[] files = file.listFiles();
            try {
                for (File file2 : files) {
                    if (file2.isFile()) {
                        List<String> readLines = FileUtils.readLines(file2, "GBK");
                        if (readLines != null) {
                            // each line has the form head=>synonym1,synonym2,...
                            for (String line : readLines) {
                                String head = StringUtils.substringBefore(line, "=>");
                                String[] end = StringUtils.split(StringUtils.substringAfter(line, "=>"), ",");
                                mp.put(head, end);
                            }
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    public MyTokenSameWordFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class); // the token's term text
        pia = this.addAttribute(PositionIncrementAttribute.class);
        stack = new Stack<String>(); // holds the pending synonyms of the current token
    }
    @Override
    public boolean incrementToken() throws IOException {
        // first drain any synonyms queued for the previous token
        while (stack.size() > 0) {
            // pop one synonym of the current term off the stack
            String sw = stack.pop();
            // restore the captured state of the original token (offsets, type, ...)
            restoreState(current);
            // overwrite the term text with the synonym
            cta.setEmpty();
            cta.append(sw);
            // a synonym occupies the same position as the original token
            pia.setPositionIncrement(0);
            return true; // returning true emits the synonym as a new token
        }
        if (!input.incrementToken()) return false;
        if (getSamesWords(cta.toString())) {
            // the current token has synonyms: capture its state so we can replay it above
            current = captureState();
        }
        return true;
    }
    public boolean getSamesWords(String name) {
        String[] obj = mp.get(name);
        if (obj != null) {
            for (String str : obj) {
                stack.push(str);
            }
            return true;
        }
        return false;
    }
}
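For reference, a samewords dictionary file is just plain GBK text with one mapping per line: the head word, the marker =>, then a comma-separated list of synonyms. The entries below are illustrative, not from the original post; note that for the later test query 俺叫 to match documents containing 我, the file needs at least the first line:
俺=>我,咱
我=>俺,咱
中国=>天朝,华夏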
4. Define your own TestSearcher
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
import com.bjxy.lucene5.util.AnalyzerUtil;
public class TestSearcher {
    private MyIndexMain searcher = null;
    private Directory directory = null;
    private Analyzer a1 = null;
    private Analyzer a2 = null;
    private Analyzer a3 = null;
    private Analyzer a4 = null;
    String field = "this is xy , welcome to my house,friends my qq is 1229396220 and mail is yzuchaoyang@foxmail.com";
    private static String pathname = "D:\\LuceneIndex\\Index05";
    @Before
    public void createDirectory() {
        try {
            directory = FSDirectory.open(new File(pathname));
            searcher = new MyIndexMain();
            // none of these built-in analyzers handles Chinese properly
            a1 = new StandardAnalyzer(Version.LUCENE_36);   // removes punctuation, special symbols, and English stop words (this, that, prepositions, ...)
            a2 = new StopAnalyzer(Version.LUCENE_36);       // like StandardAnalyzer, but also drops numbers
            a3 = new SimpleAnalyzer(Version.LUCENE_36);     // drops numbers, punctuation, and special symbols
            a4 = new WhitespaceAnalyzer(Version.LUCENE_36); // splits on whitespace only
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    @Test
    public void testSameWordsAnalyzer01() throws Exception {
        field = "我叫玉朝阳,来自中国的一个保定市的小农村里";
        AnalyzerUtil.displayAllTokenInfo(field, new MyTokenSameWordAnalyzer());
    }
    @Test
    public void testMyIndexWirter() throws Exception {
        searcher.myIndexWirter(directory, true);
    }
    @Test
    public void testMyIndexReader() throws Exception {
        // with the synonym analyzer, the query 俺叫 now finds documents containing 我
        searcher.myIndexReader(directory, "俺叫", 1, 5);
    }
}
5. Define your own AnalyzerUtil
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class AnalyzerUtil {
    public static void displayToken(String field, Analyzer a) {
        try {
            TokenStream tstream = a.tokenStream("content", new StringReader(field));
            CharTermAttribute cta = tstream.addAttribute(CharTermAttribute.class);
            while (tstream.incrementToken()) {
                System.out.println("cta: " + cta);
            }
            System.out.println("-----------------------------");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public static void displayAllTokenInfo(String field, Analyzer a) {
        try {
            TokenStream tstream = a.tokenStream("content", new StringReader(field));
            // position increment: the distance between this token and the previous one
            PositionIncrementAttribute pia = tstream.addAttribute(PositionIncrementAttribute.class);
            // start/end character offsets of the token
            OffsetAttribute oa = tstream.addAttribute(OffsetAttribute.class);
            // the token's type information
            TypeAttribute ta = tstream.addAttribute(TypeAttribute.class);
            // the token's term text
            CharTermAttribute cta = tstream.addAttribute(CharTermAttribute.class);
            while (tstream.incrementToken()) {
                System.out.println("cta: " + cta + " ta: " + ta.type() + " pia: "
                        + pia.getPositionIncrement() + " [" + oa.startOffset() + "-" + oa.endOffset() + "]");
            }
            System.out.println("-----------------------------");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
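For instance (a quick illustration, not from the original post), calling displayToken with a WhitespaceAnalyzer, which splits on whitespace and does not lowercase:
AnalyzerUtil.displayToken("Hello Lucene world", new WhitespaceAnalyzer(Version.LUCENE_36));
prints:
cta: Hello
cta: Lucene
cta: world
-----------------------------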
6. Define your own MyIndexMain
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import cn.bjxy.lucene2.util.LuceneUtil1;
import com.bjxy.lucene3.util.ReaderUtil;
import com.bjxy.lucene4.util.SearcherUtil;
public class MyIndexMain {
    public void myIndexWirter(Directory directory, boolean hasNew) {
        IndexWriter indexWriter = null;
        try {
            indexWriter = LuceneUtil1.createIndexWriter(directory, Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
            if (hasNew) {
                indexWriter.deleteAll(); // wipe the old documents before rebuilding the index
            }
            // create a Document for each file in the data directory
            File file = new File("D:\\LuceneData\\local05\\data\\");
            Document doc = null;
            for (File eFl : file.listFiles()) {
                // create the Fields for each Document
                String readFileToString = FileUtils.readFileToString(eFl, "GBK");
                System.out.println(readFileToString);
                doc = new Document();
                // store the content so myIndexReader can print it; with Field.Store.NO,
                // doc.get("content") would return null
                doc.add(new Field("content", readFileToString, Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("path", eFl.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("filename", eFl.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(eFl.lastModified()));
                doc.add(new NumericField("size", Field.Store.YES, true).setIntValue((int) (eFl.length() / 1024))); // bytes to KB
                // add the document to the index through the IndexWriter
                indexWriter.addDocument(doc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (indexWriter != null)
                try {
                    indexWriter.close();
                } catch (CorruptIndexException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
        }
    }
    public static ScoreDoc getLastScoreDoc(int pageIndex, int pageSize, Query query, IndexSearcher searcher) throws Exception {
        if (pageIndex == 1) return null; // the first page has no "last document of the previous page"
        int num = (pageIndex - 1) * pageSize;
        TopDocs docs = searcher.search(query, num);
        return docs.scoreDocs[num - 1];
    }
    public void myIndexReader(Directory directory, String queryName, int pageIndex, int pageSize) {
        IndexSearcher searcher = null;
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        try {
            // content is the default search field; the synonym analyzer expands the query terms
            QueryParser paser = new QueryParser(Version.LUCENE_36, "content", new MyTokenSameWordAnalyzer());
            Query query = paser.parse(queryName); // whitespace between terms defaults to OR
            searcher = ReaderUtil.getIndexSearcher(directory);
            ScoreDoc lastScoreDoc = SearcherUtil.getLastScoreDoc(pageIndex, pageSize, query, searcher);
            TopDocs searchAfter = searcher.searchAfter(lastScoreDoc, query, pageSize);
            ScoreDoc[] afters = searchAfter.scoreDocs;
            for (ScoreDoc sc : afters) {
                Document doc = searcher.doc(sc.doc);
                System.out.println(doc.get("filename") + "---->" + doc.get("content") + "---->" + format.format(Long.valueOf(doc.get("date"))));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
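The helper LuceneUtil1.createIndexWriter comes from an earlier post and is not shown here. A minimal sketch of what it presumably does with the standard Lucene 3.6 API (my reconstruction, not the original class):
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
public class LuceneUtil1 {
    public static IndexWriter createIndexWriter(Directory directory, Version version, Analyzer analyzer) throws Exception {
        // wrap the analyzer in the 3.x-style IndexWriterConfig and open a writer on the directory
        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
        return new IndexWriter(directory, config);
    }
}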
7. Define your own ReaderUtil
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
public class ReaderUtil {
    private static IndexReader indexReader = null;
    private ReaderUtil() {}
    public static IndexSearcher getIndexSearcher(Directory directory) throws Exception {
        synchronized (ReaderUtil.class) {
            if (indexReader == null) {
                indexReader = IndexReader.open(directory);
            } else {
                // reopen only if the index changed since the reader was opened;
                // openIfChanged returns null when nothing has changed
                IndexReader changed = IndexReader.openIfChanged(indexReader);
                if (changed != null) {
                    indexReader.close(); // release the stale reader
                    indexReader = changed;
                }
            }
            return new IndexSearcher(indexReader);
        }
    }
}
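A quick usage sketch (ReaderUtilDemo is a hypothetical demo class; the path is the one used earlier in this post). Repeated calls share one cached IndexReader, which is swapped out only when the index has actually been modified:
import java.io.File;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class ReaderUtilDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(new File("D:\\LuceneIndex\\Index05"));
        IndexSearcher s1 = ReaderUtil.getIndexSearcher(dir);
        // ... suppose myIndexWirter rebuilds the index here ...
        IndexSearcher s2 = ReaderUtil.getIndexSearcher(dir);
        // true only if the index was unchanged between the two calls
        System.out.println(s1.getIndexReader() == s2.getIndexReader());
    }
}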
8. Data source
My data source is D:\LuceneData\local05\data\company.txt, whose content is:
我是小玉来自一个神奇的国度,这个地方叫中国,在这里的人们每天都过得很嗨皮!
Because this document contains 我, the query 俺叫 in testMyIndexReader matches it, assuming the samewords dictionary maps 俺 to 我 (see the sample entries in section 3): the synonym filter expands 俺 to 我 at query time, and the OR between the parsed terms does the rest.