自定义分词器

最新推荐文章于 2024-05-07 00:24:47 发布

weixin_34184561

最新推荐文章于 2024-05-07 00:24:47 发布

阅读量180

点赞数

文章标签：数据库

原文链接：http://www.cnblogs.com/zhzcode/p/9838435.html

版权

【停用词分词器】

 1 /**
 2  * 自定义停用词分词器
 3  * @author Terry
 4  *
 5  */
 6 public class EnStopAnalyzer  extends Analyzer{
 7     private Version version = Version.LUCENE_35;
 8     //存储停用词集合
 9     private Set stopWords;
10     
11     /**
12      * 使用默认停用词词库
13      */
14     public EnStopAnalyzer(){
15         this.stopWords.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
16     }
17     
18     /**
19      * 使用自定义+默认停用词词库
20      * @param stopWords
21      */
22     public EnStopAnalyzer(String[] stopWords){
23         //参数2：停用词字符串数组（存储了停用词集合）
24         //参数3：是否忽略大小写
25         this.stopWords = StopFilter.makeStopSet(version, stopWords, true);
26         
27         //原有停用词列表
28         //System.out.println(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
29         //将默认停用词集合添加到现有集合中
30         this.stopWords.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
31     }
32     
33     /**
34      * 自定义是否使用停用词词库
35      * @param stopWords
36      * @param userDefault：true---使用默认停用词词库；false---不适用默认停用词词库
37      */
38     public EnStopAnalyzer(String[] stopWords, boolean userDefault){
39         this.stopWords = StopFilter.makeStopSet(version, stopWords, true);
40         
41         if(userDefault)
42             this.stopWords.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
43     }
44     
45     /**
46      * 对流中数据启用停用词设置
47      */
48     @Override
49     public TokenStream tokenStream(String fieldName, Reader reader) {
50         return new StopFilter(version, 
51                 new LowerCaseFilter(version, new LetterTokenizer(version, reader)),
52                 stopWords);
53     }
54 
55 }

 1     /**
 2      * 停用词分词器
 3      */
 4     @Test
 5     public void test04(){
 6         //定义停用词字符串
 7         String[] stopWords = new String[] {"to","of","him","and","take"};
 8         String str = "helped him survive the jungle of Hollywood made  pick up the phone and call for a car to take him to the airport";
 9         
10         Analyzer a =new  EnStopAnalyzer(stopWords);
11         
12         AnalyzerUtil.displayTokenStream(str,a);
13     }

【中文分词器】

1) 常用分词器

Paoding ：庖丁解牛
MMSeg4j ：传说使用了搜狗词库。另说：使用了Paoding的词库
IK_CAnalyzer

2) 分词器的技术点

词库是否强大
算法是否优化

【同义词】

1) 分词过程

2) 示例

 1 public class SynonymsAnalyzer extends Analyzer{
 2 
 3     /**
 4      * 将输入流对象转为TokenStream
 5      */
 6     @Override
 7     public TokenStream tokenStream(String fieldName, Reader reader) {
 8         //创建字典对象（词库）
 9         Dictionary dic = Dictionary.getInstance("e:\\lucene\\dic2");
10         
11         //返回处理后的TokenStream
12         return new SynonymsFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
13     }
14 
15 }

  1 public class SynonymsFilter  extends TokenFilter{
  2     //定义Attribute对象
  3     private CharTermAttribute charAttr = null;
  4     //存储位置信息
  5     private PositionIncrementAttribute positionAttr =null;
  6     //存储当前的状态
  7     private AttributeSource.State current;
  8         
  9     //定义栈存储同义词列表
 10     Stack<String> synmonyms = null;
 11 
 12     /**
 13      * 构造函数
 14      * @param input
 15      */
 16     protected SynonymsFilter(TokenStream input) {
 17         super(input);
 18         //向流中添加了CharTermAttribu属性
 19         charAttr = this.addAttribute(CharTermAttribute.class);
 20         positionAttr = this.addAttribute(PositionIncrementAttribute.class);
 21         this.synmonyms = new Stack<String>();
 22     }
 23 
 24     @Override
 25     public boolean incrementToken() throws IOException {
 26         /*
 27         //第一次修改：直接添加同义词：为"我"添加同义词
 28         if("我".equals(charAttr.toString())){
 29             //先清除charAttr中的内容
 30             charAttr.setEmpty();
 31             //向charAttr中添加同义词
 32             charAttr.append("咱");
 33             charAttr.append("俺");
 34             System.out.println("yes");
 35             System.out.println(charAttr.toString());
 36         }
 37         */
 38         
 39         /*2第二次修改：通过方法添加同义词
 40         String[] arr = getSynonyms("我"); 
 41         if(arr!= null && arr.length>0){
 42             for(String str : arr){
 43                 charAttr.append(str);
 44             }
 45         }
 46         */
 47         
 48         /*第三次修改：使用修改后的方法
 49         //判断是否包含同义词
 50         if(getSynonyms("我")){            
 51             //清空当前charAttr对象
 52             charAttr.setEmpty();
 53             while(this.synmonyms.elements().hasMoreElements()){
 54                 charAttr.append(this.synmonyms.pop());
 55             }
 56         }
 57         
 58         System.out.println(charAttr.toString());
 59         */
 60         
 61         //第四次修改：
 62         //判断是否具有同义词，有则还原状态并将同义词追加到charAttr
 63         while(this.synmonyms.size()>0){
 64             //还原状态
 65             restoreState(current);
 66             
 67             String word = this.synmonyms.pop();
 68             
 69             //清空当前charAttr对象
 70             charAttr.setEmpty();
 71             charAttr.append(word);
 72             
 73             //设置位置信息为0（0表示同义词）
 74             positionAttr.setPositionIncrement(0);
 75             
 76             //同义词需要加入到TokenStream中，以便生成同义词索引
 77             //终止继续读取，避免后续内容覆盖当前语汇
 78             return true;
 79         }
 80         
 81         //读取一个语汇
 82         if(!input.incrementToken())
 83             return false;
 84         
 85         //读取同义词列表
 86         if(getSynonyms(charAttr.toString()))
 87             //变更状态
 88             current = captureState();
 89         
 90         return true;
 91     }
 92     
 93     /**
 94      * 获取语汇的同义词
 95      * @param key    ：语汇内容
 96      * @return    ：true--含有同义词；false--没有同义词
 97      */
 98     private boolean getSynonyms(String key){
 99         //定义Map存储同义词
100         Map<String,String[]> map = new HashMap<String,String[]>();
101         //向集合中添加同义词
102         map.put("我", new String[]{"咱","俺"});
103         
104         //获取key的同义词列表
105         String[] array = map.get(key);
106         //遍历同义词列表
107         if(array != null && array.length>0){
108             for(String str : array){
109                 //向栈对象synmonyms中添加同义词
110                 this.synmonyms.push(str);
111             }
112             
113             return true;
114         }
115         
116         return false;
117     }
118     
119     /*
120     private String[] getSynonyms(String key){
121         //定义Map存储同义词
122         Map<String,String[]> map = new HashMap<String,String[]>();
123         //向集合中添加同义词
124         map.put("我", new String[]{"咱","俺"});
125         
126         return map.get(key);
127     }
128     */
129 
130 }

3) 创建同义词索引

 1     /**
 2      * 创建同义词索引
 3      * @throws IOException 
 4      * @throws LockObtainFailedException 
 5      * @throws CorruptIndexException 
 6      * @throws ParseException 
 7      */
 8     @Test
 9     public void test2() throws CorruptIndexException, LockObtainFailedException, IOException, ParseException{
10         Version version = Version.LUCENE_35;
11         String str = "我来自翰林，我爱翰林";
12         Analyzer a = new SynonymsAnalyzer();
13         Directory dir = new RAMDirectory();
14         
15         //1、创建同义词索引
16         //创建IndexWriter
17         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(version,a));
18         
19         //创建Document 
20         Document doc = new Document();
21         doc.add(new Field("content",str,Field.Store.YES,Field.Index.ANALYZED));
22         writer.addDocument(doc);
23         writer.close();
24         
25         //检索内容
26         //创建IndexReader
27         IndexReader reader = IndexReader.open(dir);
28         
29         //创建Searcher
30         IndexSearcher searcher = new IndexSearcher(reader);
31         
32         //创建Query
33         QueryParser parser = new QueryParser(version,"content",a);
34         Query query = parser.parse("we");
35             
36         //获取TopDocs
37         TopDocs td = searcher.search(query, 100);
38         
39         //获取ScoreDoc
40         for(ScoreDoc sd : td.scoreDocs){
41             Document d= searcher.doc(sd.doc);
42             System.out.println(d.get("content"));
43         }
44         
45         searcher.close();
46         reader.close();
47     }

4) 自定义词库

 1 /**
 2  * 同义词库接口
 3  * 同义词(引擎)
 4  * @author Terry
 5  *
 6  */
 7 public interface SynonymsContext {
 8     /**
 9      * 获取同义词
10      * @param key
11      * @return
12      */
13     public String[] getSynonyms(String key);
14 }
15 public class SynonymsFilter  extends TokenFilter{
16     //定义Attribute对象
17     private CharTermAttribute charAttr = null;
18     //存储位置信息
19     private PositionIncrementAttribute positionAttr =null;
20     //存储当前的状态
21     private AttributeSource.State current;
22         
23     //定义栈存储同义词列表
24     Stack<String> synmonyms = null;
25     //创建分词库(引擎)对象
26     SynonymsContext context = null;
27 
28     /**
29      * 构造函数
30      * @param input
31      */
32     protected SynonymsFilter(TokenStream input,SynonymsContext context) {
33         super(input);
34         //向流中添加了CharTermAttribu属性
35         charAttr = this.addAttribute(CharTermAttribute.class);
36         positionAttr = this.addAttribute(PositionIncrementAttribute.class);
37         this.synmonyms = new Stack<String>();
38         
39         this.context = context;
40     }
41 
42     @Override
43     public boolean incrementToken() throws IOException {
44         while(this.synmonyms.size()>0){
45             //还原状态
46             restoreState(current);
47             
48             String word = this.synmonyms.pop();
49             
50             //清空当前charAttr对象
51             charAttr.setEmpty();
52             charAttr.append(word);
53             
54             //设置位置信息为0（0表示同义词）
55             positionAttr.setPositionIncrement(0);
56             
57             //同义词需要加入到TokenStream中，以便生成同义词索引
58             //终止继续读取，避免后续内容覆盖当前语汇
59             return true;
60         }
61         
62         //读取一个语汇
63         if(!input.incrementToken())
64             return false;
65         
66         //读取同义词列表
67         if(getSynonyms(charAttr.toString()))
68             //变更状态
69             current = captureState();
70         
71         return true;
72     }
73     
74     /**
75      * 获取同义词列表
76      * @param key
77      * @return
78      */
79     private boolean getSynonyms(String key){
80         String[] array = context.getSynonyms(key);
81         
82         //遍历同义词列表
83         if (array != null && array.length > 0) {
84             for (String str : array) {
85                 // 向栈对象synmonyms中添加同义词
86                 this.synmonyms.push(str);
87             }
88             return true;
89         }
90         return false;
91     }
92 }

 1 /**
 2  * 同义词库接口
 3  * 同义词(引擎)
 4  * @author Terry
 5  *
 6  */
 7 public interface SynonymsContext {
 8     /**
 9      * 获取同义词
10      * @param key
11      * @return
12      */
13     public String[] getSynonyms(String key);
14 }
15 public class SynonymsFilter  extends TokenFilter{
16     //定义Attribute对象
17     private CharTermAttribute charAttr = null;
18     //存储位置信息
19     private PositionIncrementAttribute positionAttr =null;
20     //存储当前的状态
21     private AttributeSource.State current;
22         
23     //定义栈存储同义词列表
24     Stack<String> synmonyms = null;
25     //创建分词库(引擎)对象
26     SynonymsContext context = null;
27 
28     /**
29      * 构造函数
30      * @param input
31      */
32     protected SynonymsFilter(TokenStream input,SynonymsContext context) {
33         super(input);
34         //向流中添加了CharTermAttribu属性
35         charAttr = this.addAttribute(CharTermAttribute.class);
36         positionAttr = this.addAttribute(PositionIncrementAttribute.class);
37         this.synmonyms = new Stack<String>();
38         
39         this.context = context;
40     }
41 
42     @Override
43     public boolean incrementToken() throws IOException {
44         while(this.synmonyms.size()>0){
45             //还原状态
46             restoreState(current);
47             
48             String word = this.synmonyms.pop();
49             
50             //清空当前charAttr对象
51             charAttr.setEmpty();
52             charAttr.append(word);
53             
54             //设置位置信息为0（0表示同义词）
55             positionAttr.setPositionIncrement(0);
56             
57             //同义词需要加入到TokenStream中，以便生成同义词索引
58             //终止继续读取，避免后续内容覆盖当前语汇
59             return true;
60         }
61         
62         //读取一个语汇
63         if(!input.incrementToken())
64             return false;
65         
66         //读取同义词列表
67         if(getSynonyms(charAttr.toString()))
68             //变更状态
69             current = captureState();
70         
71         return true;
72     }
73     
74     /**
75      * 获取同义词列表
76      * @param key
77      * @return
78      */
79     private boolean getSynonyms(String key){
80         String[] array = context.getSynonyms(key);
81         
82         //遍历同义词列表
83         if (array != null && array.length > 0) {
84             for (String str : array) {
85                 // 向栈对象synmonyms中添加同义词
86                 this.synmonyms.push(str);
87             }
88             return true;
89         }
90         return false;
91     }
92 }

 1 public class SynonymsAnalyzer extends Analyzer{
 2     private SynonymsContext context =null;
 3     
 4     public SynonymsAnalyzer(SynonymsContext context) {
 5         this.context = context;
 6     }
 7 
 8     /**
 9      * 将输入流对象转为TokenStream
10      */
11     @Override
12     public TokenStream tokenStream(String fieldName, Reader reader) {
13         //创建字典对象（词库）
14         Dictionary dic = Dictionary.getInstance("e:\\lucene\\dic2");
15         
16         //返回处理后的TokenStream
17         return new SynonymsFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader),context);
18     }
19 
20 }

 1 public class MoreSynonymsContext implements SynonymsContext{
 2     Map<String,String[]> map = new HashMap<String,String[]>();
 3     
 4     public MoreSynonymsContext() {
 5         map.put("我", new String[]{"俺","咱"});
 6         map.put("翰林", new String[]{"汉","汗","旱"});
 7     }
 8     
 9     @Override
10     public String[] getSynonyms(String key) {
11         return map.get(key);
12     }
13 
14 }

面向接口的方式,封装了变化点.

优势:便于扩展

转载于:https://www.cnblogs.com/zhzcode/p/9838435.html

weixin_34184561

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
自定义分词器

【停用词分词器】 1 /** 2 * 自定义停用词分词器 3 * @author Terry 4 * 5 */ 6 public class EnStopAnalyzer extends Analyzer{ 7 private Version version = Version.LUCENE_35; 8 //存储停用词集合 9 ...
复制链接

扫一扫