IK动态词库及禁用内置主词库

版权声明:【分享也是一种提高】个人转载请在正文开头明显位置注明出处,未经作者同意禁止企业/组织转载,禁止私自更改原文,禁止用于商业目的。 https://blog.csdn.net/u010887744/article/details/78230017

        ik-analyzer新增词库后,需要重启solr,而线上环境肯定是需要支持热更新的,需要修改词库后“实时”更新词库。个人将先前修改后的IK(支持solr6.6+版本),再做修改,使之能实现以下功能:

  1. 支持IK词库热更新,服务定期扫描词库,发现词库变化则重新导入相应词库;
  2. 记录详细更新日志,新增时间+新增词语,方便定位问题;
  3. 支持禁用内置主词典main2012.dic。
1、DefaultConfig主要修改代码:
/**
     * Reads the dictionary hot-update schedule from the {@code DIC_UPDATEMIN} property.
     *
     * <p>The expected format is {@code "initialDelay,interval"}: two integers separated by
     * a comma, both in minutes. The raw property value is always logged through
     * {@code Dictionary.print} so that misconfiguration is visible in the console.
     *
     * @return a two-element array {@code [initialDelay, interval]}, or {@code null} when
     *         the property is missing, blank, does not have exactly two comma-separated
     *         parts, contains a non-numeric part, or the interval is not positive
     */
    public Integer[] getDicUpdateMin() {
        String extUpdateMin = props.getProperty(DIC_UPDATEMIN);
        Integer[] timeInterval = null;
        if (null != extUpdateMin && !Objects.equals("", extUpdateMin.trim())) {
            String[] split = extUpdateMin.split(",");
            if (split.length == 2) {
                try {
                    Integer initialDelay = Integer.valueOf(split[0].trim());
                    Integer interval = Integer.valueOf(split[1].trim());
                    // A non-positive interval cannot be scheduled; treat it as "disabled".
                    if (interval > 0) {
                        timeInterval = new Integer[] { initialDelay, interval };
                    }
                } catch (NumberFormatException e) {
                    // A malformed value disables hot update instead of aborting the caller.
                    Dictionary.print("dic_updateMin_Param_invalid", extUpdateMin);
                    timeInterval = null;
                }
            }
        }
        Dictionary.print("dic_updateMin_Param", extUpdateMin);
        return timeInterval;
    }

    /**
     * Tells whether the built-in main dictionary (main2012.dic) is disabled.
     *
     * <p>The flag is read from the {@code DICINNER_DISABLE} property. The value is trimmed
     * and compared case-insensitively, so {@code "true "} or {@code "TRUE"} also disable
     * the dictionary. The raw property value is logged through {@code Dictionary.print}.
     *
     * @return {@code true} when the property is set to "true"; {@code false} otherwise,
     *         including when the property is absent
     */
    public boolean isDicDisable() {
        String dicInnerDisable = props.getProperty(DICINNER_DISABLE);
        Dictionary.print("isDicDisable", dicInnerDisable);
        // Boolean.parseBoolean is case-insensitive; trim() drops stray whitespace around the value.
        return dicInnerDisable != null && Boolean.parseBoolean(dicInnerDisable.trim());
    }
2、Dictionary词典管理类
重构部分代码,主要修改代码如下:
/**
 * Dictionary manager (singleton).
 *
 * <p>Holds the main, stop-word and quantifier dictionaries. When the configuration
 * supplies an update interval, a single-threaded scheduler periodically re-reads the
 * extension and stop-word dictionary files whose last-modified time has advanced.
 */
public class Dictionary {

    /**
     * The singleton instance. Declared volatile because {@link #initial(Configuration)}
     * reads it outside the lock (double-checked initialization).
     */
    private static volatile Dictionary singleton;

    /** Main dictionary (built-in main dictionary plus extension dictionaries). */
    private static DictSegment _MainDict = null;

    /** Stop-word dictionary. */
    private static DictSegment _StopWordDict = null;

    /** Quantifier dictionary. */
    private DictSegment _QuantifierDict;

    /** Last-modified timestamp recorded for each dictionary file path at its latest load. */
    private static Map<String, Long> dicLastModified = new HashMap<String, Long>();

    /** Raw lines already seen in the main/extension dictionaries (used to log only new words). */
    private static Set<String> dicExtSet = new HashSet<String>(10000);

    /** Raw lines already seen in the stop-word dictionaries (used to log only new words). */
    private static Set<String> dicStopSet = new HashSet<String>(2000);

    /** Configuration the dictionaries are loaded from. */
    private static Configuration cfg;

    /**
     * Scheduler that periodically reloads the dictionaries.
     * NOTE(review): the default thread factory is used, so the worker is presumably a
     * non-daemon thread and nothing in this class shuts the pool down - confirm this is
     * acceptable for the hosting application.
     */
    private static ScheduledExecutorService scheduledThreadPool = Executors.newScheduledThreadPool(1);

    /** Becomes true once the constructor has finished the initial load of all dictionaries. */
    private static boolean hasAdd = false;

    /**
     * Timestamp format used by the print helpers.
     * NOTE(review): SimpleDateFormat is not thread-safe; the original author assumed the
     * print helpers are never called concurrently - verify against callers.
     */
    private final static java.text.SimpleDateFormat DATE_FORMAT = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");

    /**
     * Initializes the dictionaries.
     *
     * <p>Dictionaries are otherwise only loaded when this class is first used, which
     * lengthens the first tokenization; calling this method during application start-up
     * loads them eagerly. On the first call, if the configuration returns an update
     * schedule, the periodic reload task is also started.
     *
     * @param cfg configuration describing which dictionaries to load
     * @return the singleton instance
     */
    public static Dictionary initial(Configuration cfg) {
        if (singleton == null) {
            synchronized (Dictionary.class) {
                if (singleton == null) {
                    singleton = new Dictionary(cfg);
                    Integer[] dicUpdateMin = cfg.getDicUpdateMin();
                    if (null != dicUpdateMin) {
                        print("loadDicFixedTime", "start");
                        loadDicFixedTime(dicUpdateMin);
                    }
                }
            }
        }
        return singleton;
    }

    /**
     * Schedules the periodic reload of the main and stop-word dictionaries.
     *
     * @param dicUpdateMin
     *            two-element array: [initial delay, delay between runs], in minutes
     */
    private static void loadDicFixedTime(Integer[] dicUpdateMin) {
        scheduledThreadPool.scheduleWithFixedDelay(new Runnable() {

            public void run() {
                try {
                    loadMainDict();
                    loadStopWordDict();
                } catch (Exception e) {
                    // Log and keep going: an exception escaping run() would cancel all future runs.
                    print(e);
                }
            }
        }, dicUpdateMin[0], dicUpdateMin[1], TimeUnit.MINUTES);
    }

    /**
     * Loads all dictionaries once and marks the initial load as done.
     *
     * @param cfg configuration describing which dictionaries to load
     */
    private Dictionary(Configuration cfg) {
        Dictionary.cfg = cfg;
        loadMainDict();
        loadStopWordDict();
        this.loadQuantifierDict();
        hasAdd = true;
    }

    /**
     * Returns the singleton instance.
     *
     * @return Dictionary the singleton
     * @throws IllegalStateException if {@link #initial(Configuration)} has not been called yet
     */
    public static Dictionary getSingleton() {
        if (singleton == null) {
            throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
        }
        return singleton;
    }

    /**
     * Adds a batch of words to the in-memory main dictionary.
     *
     * @param words
     *            words to add; a null collection and null elements are ignored
     */
    public void addWords(Collection<String> words) {
        if (words != null) {
            for (String word : words) {
                if (word != null) {
                    _MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
                }
            }
        }
    }

    /**
     * Disables (masks) a batch of words in the main dictionary.
     *
     * @param words
     *            words to disable; a null collection and null elements are ignored
     */
    public void disableWords(Collection<String> words) {
        if (words != null) {
            for (String word : words) {
                if (word != null) {
                    _MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
                }
            }
        }
    }

    /**
     * Matches the given characters against the main dictionary.
     *
     * @param charArray
     *            characters to match
     * @return Hit describing the match result
     */
    public Hit matchInMainDict(char[] charArray) {
        return _MainDict.match(charArray);
    }

    /**
     * Matches a slice of the given characters against the main dictionary.
     *
     * @param charArray
     *            characters to match
     * @param begin
     *            start offset of the slice
     * @param length
     *            length of the slice
     * @return Hit describing the match result
     */
    public Hit matchInMainDict(char[] charArray, int begin, int length) {
        return _MainDict.match(charArray, begin, length);
    }

    /**
     * Matches a slice of the given characters against the quantifier dictionary.
     *
     * @param charArray
     *            characters to match
     * @param begin
     *            start offset of the slice
     * @param length
     *            length of the slice
     * @return Hit describing the match result
     */
    public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
        return singleton._QuantifierDict.match(charArray, begin, length);
    }

    /**
     * Continues matching from the DictSegment stored in an earlier Hit.
     *
     * @param charArray
     *            characters to match
     * @param currentIndex
     *            index of the next character to match
     * @param matchedHit
     *            the previous match result to continue from
     * @return Hit describing the match result
     */
    public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
        DictSegment ds = matchedHit.getMatchedDictSegment();
        return ds.match(charArray, currentIndex, 1, matchedHit);
    }

    /**
     * Tells whether a slice of the given characters is a stop word.
     *
     * @param charArray
     *            characters to test
     * @param begin
     *            start offset of the slice
     * @param length
     *            length of the slice
     * @return boolean true when the slice matches the stop-word dictionary
     */
    public boolean isStopWord(char[] charArray, int begin, int length) {
        return _StopWordDict.match(charArray, begin, length).isMatch();
    }

    /**
     * Loads the main dictionary and the extension dictionaries.
     *
     * <p>The built-in main dictionary is read only on the first call, and only when the
     * configuration does not disable it. Extension dictionaries are visited on every call.
     */
    private static void loadMainDict() {
        if (_MainDict == null) { // first load
            _MainDict = new DictSegment((char) 0);
            String mainDictionary = cfg.getMainDictionary();
            if (!cfg.isDicDisable()) {
                loadToMain(mainDictionary, 1);
            }
        }
        List<String> extDictFiles = cfg.getExtDictionarys();
        if (null != extDictFiles && !extDictFiles.isEmpty()) {
            for (String extFile : extDictFiles) {
                loadToMain(extFile, null);
            }
        }
    }

    /**
     * Loads one dictionary file into the main dictionary.
     *
     * <p>A dictionary that cannot be opened is logged and skipped. After the initial load,
     * a file whose last-modified time has not advanced since its previous load is skipped.
     *
     * @param mainDictionary
     *            classpath name of the dictionary
     * @param innerDic
     *            1 when this is the built-in dictionary (read as a classpath stream);
     *            any other value reads it as a file resolved through the class loader
     */
    private static void loadToMain(String mainDictionary, Integer innerDic) {
        String path = null;
        InputStream is = null;
        File file = new File("");
        if (Objects.equals(1, innerDic)) {
            is = Dictionary.class.getClassLoader().getResourceAsStream(mainDictionary);
        } else {
            path = getFilePath(mainDictionary);
            file = new File(path);
            try {
                is = new FileInputStream(file);
            } catch (FileNotFoundException e) {
                print(e);
            }
        }
        if (is == null) {
            print("loadToMain:FileNotFoundException", path);
            return;
        }
        if (isUnchanged(path, file)) {
            // Close the stream before skipping; otherwise every scan leaks a file handle.
            close(is);
            return;
        }
        print("loadToMain_START", mainDictionary);
        StringBuilder updateDic = new StringBuilder();
        try {
            readWords(is, _MainDict, dicExtSet, updateDic);
        } catch (IOException ioe) {
            print("loadToMain exception.");
            print(ioe);
        } finally {
            dicLastModified.put(path, file.lastModified());
            if (updateDic.length() != 0) {
                print("loadToMain_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString());
            }
            close(is);
        }
    }

    /**
     * Tells whether a dictionary file can be skipped because it was already loaded and
     * its last-modified time has not advanced since then.
     *
     * @param path
     *            key under which the file's timestamp is recorded
     * @param file
     *            the dictionary file
     * @return true when the initial load is done and the recorded timestamp is not older
     *         than the file's current one
     */
    private static boolean isUnchanged(String path, File file) {
        return hasAdd && dicLastModified.containsKey(path) && file.lastModified() <= dicLastModified.get(path);
    }

    /**
     * Reads a UTF-8 stream line by line and adds every non-blank line, trimmed and
     * lower-cased, to the given dictionary. The stream is closed when reading ends.
     *
     * @param is
     *            stream to read
     * @param dict
     *            dictionary that receives the words
     * @param knownWords
     *            set of raw lines seen so far, updated in place; null to skip tracking
     * @param newWords
     *            receives each previously unseen line followed by ";" once the initial
     *            load is done; only used when knownWords is not null
     * @throws IOException
     *             if reading or closing the stream fails
     */
    private static void readWords(InputStream is, DictSegment dict, Set<String> knownWords, StringBuilder newWords)
            throws IOException {
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512)) {
            String theWord;
            while ((theWord = br.readLine()) != null) {
                if ("".equals(theWord.trim())) {
                    continue;
                }
                if (knownWords != null && knownWords.add(theWord) && hasAdd) {
                    newWords.append(theWord).append(";");
                }
                dict.fillSegment(theWord.trim().toLowerCase().toCharArray());
            }
        }
    }

    /**
     * Resolves a dictionary name to a file path through the class loader.
     *
     * @param dictionary
     *            classpath name of the dictionary
     * @return path component of the resolved resource URL
     * @throws NullPointerException
     *             if the resource cannot be found; the failure is logged first so the
     *             misconfigured dictionary name is easy to locate
     */
    private static String getFilePath(String dictionary) {
        URL resource = Dictionary.class.getClassLoader().getResource(dictionary);
        if (null == resource) {
            print("NullPointerException", "getFilePath", dictionary);
            // Same exception type as before, but with a message naming the missing resource.
            throw new NullPointerException("Dictionary resource not found: " + dictionary);
        }
        return resource.getPath();
    }

    /**
     * Loads the user-supplied extension stop-word dictionaries.
     *
     * <p>A file that cannot be opened is logged and skipped. After the initial load, a
     * file whose last-modified time has not advanced since its previous load is skipped.
     */
    private static void loadStopWordDict() {
        if (_StopWordDict == null) {
            _StopWordDict = new DictSegment((char) 0);
        }
        List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys();
        if (extStopWordDictFiles == null) {
            return;
        }
        for (String extStopWordDictName : extStopWordDictFiles) {
            String path = getFilePath(extStopWordDictName);
            File file = new File(path);
            InputStream is = null;
            try {
                // The stream must stay open here; it is closed only after it has been read.
                is = new FileInputStream(file);
            } catch (FileNotFoundException e) {
                print("loadStopWordDict:FileNotFoundException", path);
                print(e);
            }
            if (is == null) {
                continue; // dictionary not found: ignore it
            }
            if (isUnchanged(path, file)) {
                close(is);
                continue;
            }
            print("loadStopWordDict_START", extStopWordDictName);
            StringBuilder updateDic = new StringBuilder();
            try {
                readWords(is, _StopWordDict, dicStopSet, updateDic);
            } catch (IOException ioe) {
                print("loadStopWordDict exception.");
                print(ioe);
            } finally {
                dicLastModified.put(path, file.lastModified());
                if (updateDic.length() != 0) {
                    print("loadStopWordDict_END", "FileLastModified:" + DATE_FORMAT.format(new Date(file.lastModified())), updateDic.toString());
                }
                close(is);
            }
        }
    }

    /**
     * Loads the quantifier dictionary from the classpath.
     *
     * @throws RuntimeException
     *             if the quantifier dictionary resource cannot be found
     */
    private void loadQuantifierDict() {
        _QuantifierDict = new DictSegment((char) 0);
        InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDicionary());
        if (is == null) {
            throw new RuntimeException("Quantifier Dictionary not found!!!");
        }
        try {
            readWords(is, _QuantifierDict, null, null);
        } catch (IOException ioe) {
            print("Quantifier Dictionary loading exception.");
            print(ioe);
        } finally {
            close(is);
        }
    }

    /**
     * Closes the given resources, logging (not propagating) any failure.
     *
     * @param closeables
     *            resources to close; null entries are skipped
     */
    private static void close(AutoCloseable... closeables) {
        if (null != closeables && closeables.length > 0) {
            for (AutoCloseable autoCloseable : closeables) {
                if (null != autoCloseable) {
                    try {
                        autoCloseable.close();
                    } catch (Exception e) {
                        print(e);
                    }
                }
            }
        }
    }

    /**
     * Prints a timestamped line to the console, each argument wrapped in square brackets.
     *
     * @param param
     *            values to print
     */
    public static void print(String... param) {
        StringBuilder builder = new StringBuilder();
        builder.append("[").append(DATE_FORMAT.format(new Date())).append("]");
        for (String str : param) {
            builder.append("[").append(str).append("]");
        }
        System.out.println(builder.toString());
    }

    /**
     * Prints a timestamped exception message to the console, followed by the stack trace.
     *
     * @param e
     *            the exception to report
     */
    public static void print(Exception e) {
        StringBuilder builder = new StringBuilder();
        builder.append("[").append(DATE_FORMAT.format(new Date())).append("]").append(e.getMessage());
        System.out.println(builder.toString());
        e.printStackTrace();
    }
}

项目完整源码:https://github.com/zxiaofan/ik-analyzer-solr6
可直接从https://github.com/zxiaofan/ik-analyzer-solr6/releases 下载 solr6.6.1版本的jar。

欢迎个人转载,但须在文章页面明显位置给出原文链接;
未经作者同意必须保留此段声明、不得随意修改原文、不得用于商业用途,否则保留追究法律责任的权利。

【 CSDN 】:csdn.zxiaofan.com
【GitHub】:github.zxiaofan.com

如有任何问题,欢迎留言。祝君好运!
Life is all about choices! 
将来的你一定会感激现在拼命的自己!

没有更多推荐了,返回首页