Lucene使用IKAnalyzer动态扩展词库

最新推荐文章于 2023-10-31 10:02:42 发布

kkndgto

最新推荐文章于 2023-10-31 10:02:42 发布

阅读量1.2k

点赞数 2

文章标签： lucene 扩展 IKAnalyzer

本文链接：https://blog.csdn.net/kkndgto/article/details/48706599

版权

Lucene 版本 3.6.0，IKAnalyzer 版本 2012_u6

最近在使用 Lucene + IKAnalyzer 做全文检索时，遇到了 IKAnalyzer 分词不能满足要求的问题。通过查找资料发现有多种方式可以扩展字典，下面介绍其中的三种方式：

 
  第一种方式：
 通过配置文件来扩展字典。
 在 src 下面建立 IKAnalyzer.cfg.xml：
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 扩展配置</comment>
    <!-- 用户可以在这里配置自己的扩展字典 -->
    <entry key="ext_dict">com/dh/lucene/ext_begin.dic</entry>
    <!-- 用户可以在这里配置自己的扩展停用词字典 -->
    <entry key="ext_stopwords">com/dh/lucene/ext_stopword.dic</entry>
</properties>
 

建立字典文件 ext_begin.dic 和 ext_stopword.dic。这里需要注意：字典文件必须是无 BOM 的 UTF-8 编码的中文文本文件，可以通过 EditPlus 进行修改。

具体操作：Tools -> Configure User Tools -> File，选择 UTF-8 无 BOM 格式。

在 .dic 文件中维护词条即可（每行一个词条）。

第二种方式：

建立MyConfiguration类，这个类也是在网上搜集到的。

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.InvalidPropertiesFormatException;
import java.util.List;
import java.util.Properties;

/**
* Configuration 默认实现
*
*/
public class MyConfiguration implements Configuration{
   //懒汉单例
   private static final Configuration CFG = new MyConfiguration();
   /*
   * 分词器默认字典路径
   */
   private String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
   private static final String //    MyConfiguration mycfg = new MyConfiguration();
   /*
   * 分词器配置文件路径
   */
   private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
   //配置属性——扩展字典
   private static final String EXT_DICT = "ext_dict";
   //配置属性——扩展停止词典
   private static final String EXT_STOP = "ext_stopwords";

   private Properties props;
   /*
   * 是否使用smart方式分词
   */
   private boolean useSmart;
   /**
   * 返回单例
   * @return Configuration单例
   */
   public static Configuration getInstance(){
       return CFG;
   }
   /*
   * 初始化配置文件
   */
   public MyConfiguration(){
       props = new Properties();

       InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME);
       if(input != null){
           try {
               props.loadFromXML(input);
           } catch (InvalidPropertiesFormatException e) {
               e.printStackTrace();
           } catch (IOException e) {
               e.printStackTrace();
           }
       }
   }
   /**
   * 返回useSmart标志位
   * useSmart =true ，分词器使用智能切分策略， =false则使用细粒度切分
   * @return useSmart
   */
   public boolean useSmart() {
       return useSmart;
   }
   /**
   * 设置useSmart标志位
   * useSmart =true ，分词器使用智能切分策略， =false则使用细粒度切分
   * @param useSmart
   */
   public void setUseSmart(boolean useSmart) {
       this.useSmart = useSmart;
   }


   /**
   新加函数：设置主词典路径
   * @return String
   主词典路径
   */
   public void setMainDictionary(String path) {
       this.PATH_DIC_MAIN = path;
   }

   /**
   * 获取主词典路径
   *
   * @return String 主词典路径
   */
   public String getMainDictionary(){
       return PATH_DIC_MAIN;
   }

   /**
   * 获取量词词典路径
   * @return String 量词词典路径
   */
   public String getQuantifierDicionary(){
       return PATH_DIC_QUANTIFIER;
   }

   /**
   * 获取扩展字典配置路径
   * @return List<String> 相对类加载器的路径
   */
   public List<String> getExtDictionarys(){
       List<String> extDictFiles = new ArrayList<String>(2);
       String extDictCfg = props.getProperty(EXT_DICT);
       if(extDictCfg != null){
           //使用;分割多个扩展字典配置
           String[] filePaths = extDictCfg.split(";");
           if(filePaths != null){
               for(String filePath : filePaths){
                   if(filePath != null && !"".equals(filePath.trim())){
                       extDictFiles.add(filePath.trim());
                   }
               }
           }
       }
       return extDictFiles;
   }

   /**
   * 获取扩展停止词典配置路径
   * @return List<String> 相对类加载器的路径
   */
   public List<String> getExtStopWordDictionarys(){
       List<String> extStopWordDictFiles = new ArrayList<String>(2);
       String extStopWordDictCfg = props.getProperty(EXT_STOP);
       if(extStopWordDictCfg != null){
           //使用;分割多个扩展字典配置
           String[] filePaths = extStopWordDictCfg.split(";");
           if(filePaths != null){
               for(String filePath : filePaths){
                   if(filePath != null && !"".equals(filePath.trim())){
                       extStopWordDictFiles.add(filePath.trim());
                   }
               }
           }
       }
       return extStopWordDictFiles;
   }
}

使用方式：

// Build the configuration in code instead of relying on the ext_dict entry of IKAnalyzer.cfg.xml.
MyConfiguration mycfg = new MyConfiguration();
// Request smart segmentation on this configuration.
mycfg.setUseSmart(true);
// Use the custom dictionary as the main dictionary. The file created above is
// ext_begin.dic; the original "ext_begin.dic.dic" path had a doubled extension.
mycfg.setMainDictionary("com/dh/lucene/ext_begin.dic");

// Initialise the IK dictionary with this configuration before creating the analyzer.
Dictionary.initial(mycfg);

Analyzer mAnalyzer = new IKAnalyzer();

这样不需要走配置文件的方式，就可以直接使用自定义字典。

第三种方式为通过使用addWords方法，批量加载新词条，根据业务需求可以在生成索引前把需要的词添加进去：

// Register additional words with the shared IK dictionary before indexing.
// NOTE(review): getSingleton() presumably requires the dictionary to have been
// initialised already (e.g. via Dictionary.initial(cfg)) — confirm.
Dictionary ikDictionary = Dictionary.getSingleton();
List<String> extraWords = new ArrayList<String>();
extraWords.add("需要动态添加的词");
ikDictionary.addWords(extraWords);

Analyzer mAnalyzer = new IKAnalyzer();

建立索引文件

public static void createIndex(String lupath,Word word){
            try {
                File path = new File(lupath+"LuceneEx"); //索引文件存放路径
                Directory mdDirectory = FSDirectory.open(path);

      //－－－－－－－－－－－－－第二种方式－－－－－－－－－－－－－－－－－－
            //    MyConfiguration mycfg = new MyConfiguration();
              // mycfg.setUseSmart(true);
                // 设置为智能分词
             //   mycfg.setMainDictionary("com/dh/lucene/ext_stopword.dic");
                //动态设置自定义的词库
            //    IKSegmenter seg = new IKSegmenter(new StringReader(text) ,mycfg);
                //Dictionary.initial(mycfg);

//－－－－－－－－－－－－－第二种方式－－－－－－－－－－－－－－－－－－

                Dictionary dg = Dictionary.getSingleton();
                List<String> strList=new ArrayList<String>();
                strList.add(word.getT_model_title());
                dg.addWords(strList);
             // 使用Lucene提供的分词器
//              Analyzer mAnalyzer = new StandardAnalyzer(Version.LUCENE_36);
                // 使用商业分词器
            /*
             * Analyzer analyzer = new IKAnalyzer();//细粒度切分算法
                Analyzer analyzer = new IKAnalyzer(true);//智能切分
                */
                Analyzer mAnalyzer = new IKAnalyzer();

                LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();
                //设置segment添加文档(Document)时的合并频率
               mergePolicy.setMergeFactor(20);
               //设置segment最大合并文档(Document)数
               mergePolicy.setMaxMergeDocs(500);
               //启用复合式索引文件格式,合并多个segment
               mergePolicy.setUseCompoundFile(true);

               IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, mAnalyzer);

               config.setMergePolicy(mergePolicy);
                 //设置索引的打开模式
                config.setOpenMode(OpenMode.CREATE_OR_APPEND);
                IndexWriter writer = new IndexWriter(mdDirectory, config);
                long start = System.currentTimeMillis();
                    Document doc = new Document();
                    /**
                       *   Field.Store.YES:存储字段值（未分词前的字段值）
                           Field.Store.NO:不存储,存储与索引没有关系
                           Field.Store.COMPRESS:压缩存储,用于长文本或二进制，但性能受损

                           Field.Index.ANALYZED:分词建索引
                           Field.Index.ANALYZED_NO_NORMS:分词建索引，但是Field的值不像通常那样被保存，而是只取一个byte，这样节约存储空间
                           Field.Index.NOT_ANALYZED:不分词且索引
                           Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引，Field的值去一个byte保存

                           TermVector表示文档的条目（由一个Document和Field定位）和它们在当前文档中所出现的次数
                           Field.TermVector.YES:为每个文档（Document）存储该字段的TermVector
                           Field.TermVector.NO:不存储TermVector
                           Field.TermVector.WITH_POSITIONS:存储位置
                           Field.TermVector.WITH_OFFSETS:存储偏移量
                           Field.TermVector.WITH_POSITIONS_OFFSETS:存储位置和偏移量
                       */

　　　　　//－－－－－－－－－－业务需要－－－－－－－－－－－－－－－　　　　　
                    String c = "";
                    JSONArray array = JSONArray.fromObject(word.getT_model_concent());
                   for(int i = 0; i < array.size(); i++){
                       JSONObject jsonObject1 = array.getJSONObject(i);
                       if(jsonObject1.getString("type").equals("basicInfoModule")){
                          if(jsonObject1.getString("text")!=null&&!jsonObject1.getString("text").equals("")&&jsonObject1.getString("text").length()>0){
                          c = jsonObject1.getString("text");
                          }
                        }
                   }
                   //－－－－－－－－－－业务需要－－－－－－－－－－－－－－－

                    Field id = new Field("t_modelid", word.getT_modelid(),Store.YES, Index.NOT_ANALYZED);
                    Field title = new Field("title", word.getT_model_title(), Store.YES, Index.NOT_ANALYZED);
                    Field concent = new Field("concent", c, Store.YES, Index.NOT_ANALYZED);

                    doc.add(id);
                    doc.add(title);
                    doc.add(concent);

                    writer.addDocument(doc);
                writer.commit();
                //   writer.forceMerge(1);// 优化
                writer.close();
                long end = System.currentTimeMillis();
                System.out.println("createIndex耗时"+(end-start)+"ms");

            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (LockObtainFailedException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }