Elasticsearch 修改IK分词器源码实现基于MySql的词库热更新


首先下载 IK 分词器的源码, github地址,下载之后根据自己的 ES 版本切换到对应的分支,我的 ES 版本是 6.7.2,所以使用的是分支 6.x。

官方提供的热更新方式

热更新

上图是官方提供的一种热更新词库的方式,是基于远程文件的,不太实用,但我们可以模仿这种方式自己实现一个基于 MySQL 的,官方提供的实现在org.wltea.analyzer.dic.Monitor类中,以下是其完整代码。

  1. 向词库服务器发送Head请求
  2. 从响应中获取Last-Modify、ETags字段值,判断是否变化
  3. 如果未变化,休眠1min,返回第①步
  4. 如果有变化,调用 Dictionary#reLoadMainDict()方法重新加载词典
  5. 休眠1min,返回第①步
package org.wltea.analyzer.dic;

import java.io.IOException;
import java.security.AccessController;
import java.security.PrivilegedAction;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.SpecialPermission;
import org.wltea.analyzer.help.ESPluginLoggerFactory;

public class Monitor implements Runnable {

	private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());

	private static CloseableHttpClient httpclient = HttpClients.createDefault();
	/*
	 * 上次更改时间
	 */
	private String last_modified;
	/*
	 * 资源属性
	 */
	private String eTags;

	/*
	 * 请求地址
	 */
	private String location;

	public Monitor(String location) {
		this.location = location;
		this.last_modified = null;
		this.eTags = null;
	}

	public void run() {
		SpecialPermission.check();
		AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
			this.runUnprivileged();
			return null;
		});
	}

	/**
	 * 监控流程:
	 *  ①向词库服务器发送Head请求
	 *  ②从响应中获取Last-Modify、ETags字段值,判断是否变化
	 *  ③如果未变化,休眠1min,返回第①步
	 * 	④如果有变化,重新加载词典
	 *  ⑤休眠1min,返回第①步
	 */

	public void runUnprivileged() {

		//超时设置
		RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
				.setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();

		HttpHead head = new HttpHead(location);
		head.setConfig(rc);

		//设置请求头
		if (last_modified != null) {
			head.setHeader("If-Modified-Since", last_modified);
		}
		if (eTags != null) {
			head.setHeader("If-None-Match", eTags);
		}

		CloseableHttpResponse response = null;
		try {

			response = httpclient.execute(head);

			//返回200 才做操作
			if(response.getStatusLine().getStatusCode()==200){

				if (((response.getLastHeader("Last-Modified")!=null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified))
						||((response.getLastHeader("ETag")!=null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags))) {

					// 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
					Dictionary.getSingleton().reLoadMainDict();
					last_modified = response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue();
					eTags = response.getLastHeader("ETag")==null?null:response.getLastHeader("ETag").getValue();
				}
			}else if (response.getStatusLine().getStatusCode()==304) {
				//没有修改,不做操作
				//noop
			}else{
				logger.info("remote_ext_dict {} return bad code {}" , location , response.getStatusLine().getStatusCode() );
			}

		} catch (Exception e) {
			logger.error("remote_ext_dict {} error!",e , location);
		}finally{
			try {
				if (response != null) {
					response.close();
				}
			} catch (IOException e) {
				logger.error(e.getMessage(), e);
			}
		}
	}

}

reLoadMainDict()会调用loadMainDict(),进而调用loadRemoteExtDict()加载了远程自定义词库,同样的调用loadStopWordDict()也会同时加载远程停用词库。
reLoadMainDict()方法新创建了一个词典实例来重新加载词典,然后替换原来的词典,是一个全量替换。

void reLoadMainDict() {
	logger.info("重新加载词典...");
	// 新开一个实例加载词典,减少加载过程对当前词典使用的影响
	Dictionary tmpDict = new Dictionary(configuration);
	tmpDict.configuration = getSingleton().configuration;
	tmpDict.loadMainDict();
	tmpDict.loadStopWordDict();
	_MainDict = tmpDict._MainDict;
	_StopWords = tmpDict._StopWords;
	logger.info("重新加载词典完毕...");
}

/**
 * 加载主词典及扩展词典
 */
private void () {
	// 建立一个主词典实例
	_MainDict = new DictSegment((char) 0);

	// 读取主词典文件
	Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
	loadDictFile(_MainDict, file, false, "Main Dict");
	// 加载扩展词典
	this.loadExtDict();
	// 加载远程自定义词库
	this.loadRemoteExtDict();
}

loadRemoteExtDict()方法的逻辑也很清晰:

  1. 获取远程词典的 URL,可能有多个
  2. 循环请求每个 URL,取回远程词典
  3. 将远程词典添加到主词典中 _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());

这里需要重点关注的是 fillSegment()方法,它的作用是将一个词加入词典,与之相反的方法是disableSegment(),屏蔽词典中的一个词。

/**
 * 加载远程扩展词典到主词库表
 */
private void loadRemoteExtDict() {
	List<String> remoteExtDictFiles = getRemoteExtDictionarys();
	for (String location : remoteExtDictFiles) {
		logger.info("[Dict Loading] " + location);
		List<String> lists = getRemoteWords(location);
		// 如果找不到扩展的字典,则忽略
		if (lists == null) {
			logger.error("[Dict Loading] " + location + "加载失败");
			continue;
		}
		for (String theWord : lists) {
			if (theWord != null && !"".equals(theWord.trim())) {
				// 加载扩展词典数据到主内存词典中
				logger.info(theWord);
				_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
			}
		}
	}
}

/**
 * 加载填充词典片段
 * @param charArray
 */
void fillSegment(char[] charArray){
	this.fillSegment(charArray, 0 , charArray.length , 1); 
}

/**
 * 屏蔽词典中的一个词
 * @param charArray
 */
void disableSegment(char[] charArray){
	this.fillSegment(charArray, 0 , charArray.length , 0); 
}

Monitor类只是一个监控程序,它是在org.wltea.analyzer.dic.Dictionary类的initial()方法被启动的,以下代码的 29~35 行。

...
...
// 线程池
private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
...
...

/**
 * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
 * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
 * 
 * @return Dictionary
 */
public static synchronized void initial(Configuration cfg) {
	if (singleton == null) {
		synchronized (Dictionary.class) {
			if (singleton == null) {

				singleton = new Dictionary(cfg);
				singleton.loadMainDict();
				singleton.loadSurnameDict();
				singleton.loadQuantifierDict();
				singleton.loadSuffixDict();
				singleton.loadPrepDict();
				singleton.loadStopWordDict();

				if(cfg.isEnableRemoteDict()){
					// 建立监控线程
					for (String location : singleton.getRemoteExtDictionarys()) {
						// 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
					}
					for (String location : singleton.getRemoteExtStopWordDictionarys()) {
						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
					}
				}

			}
		}
	}
}

自己实现基于MySQL的词库热更新

基于MySQL的词库热更新-Gitee地址

数据库

es_extra_maines_extra_stopword分别为主词典和停用词典。

CREATE TABLE `es_extra_main` (
    `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
    `word` varchar(255) CHARACTER SET utf8mb4 NOT NULL COMMENT '词',
    `is_deleted` tinyint(1) NOT NULL DEFAULT '0' COMMENT '是否已删除',
    `update_time` timestamp(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) ON UPDATE CURRENT_TIMESTAMP(6) COMMENT '更新时间',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;


CREATE TABLE `es_extra_stopword` (
    `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
    `word` varchar(255) CHARACTER SET utf8mb4 NOT NULL COMMENT '词',
    `is_deleted` tinyint(1) NOT NULL DEFAULT '0' COMMENT '是否已删除',
    `update_time` timestamp(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) ON UPDATE CURRENT_TIMESTAMP(6) COMMENT '更新时间',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

JDBC 配置

config/jdbc.properties 文件中记录 MySQL 的 url、driver、username、password,和查询主词典、停用词典的 SQL,以及热更新的间隔秒数。从两个 SQL 可以看出我的设计是增量更新,而不是官方的全量替换

jdbc.url=jdbc:mysql://localhost:3306/test?useAffectedRows=true&characterEncoding=UTF-8&autoReconnect=true&zeroDateTimeBehavior=convertToNull&useUnicode=true&serverTimezone=GMT%2B8&allowMultiQueries=true
jdbc.username=root
jdbc.password=root
jdbc.driver=com.mysql.cj.jdbc.Driver
jdbc.update.main.dic.sql=SELECT * FROM `es_extra_main` WHERE update_time > ? order by update_time asc
jdbc.update.stopword.sql=SELECT * FROM `es_extra_stopword` WHERE update_time > ? order by update_time asc
jdbc.update.interval=10

pom

elasticsearch.version 改为自己的 ES 的版本,添加 MySQL 驱动的依赖。

    <properties>
        <elasticsearch.version>6.7.2</elasticsearch.version>
    </properties>
	
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.19</version>
        </dependency>

src/main/assemblies/plugin.xml

将 MySQL 驱动的依赖写入,否则打成 zip 后会没有 MySQL 驱动的 jar 包。

    <dependencySets>
        <dependencySet>
            <outputDirectory/>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory/>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <includes>
                <include>org.apache.httpcomponents:httpclient</include>
                <!--这里 看我看我-->
                <include>mysql:mysql-connector-java</include>
            </includes>
        </dependencySet>
    </dependencySets>

src/main/resources/plugin-security.policy

添加permission java.lang.RuntimePermission "setContextClassLoader";,否则会因为权限问题抛出以下异常。

grant {
  // needed because of the hot reload functionality
  permission java.net.SocketPermission "*", "connect,resolve";
  permission java.lang.RuntimePermission "setContextClassLoader";
};
java.lang.ExceptionInInitializerError: null
	at java.lang.Class.forName0(Native Method) ~[?:1.8.0_261]
	at java.lang.Class.forName(Unknown Source) ~[?:1.8.0_261]
	at com.mysql.cj.jdbc.NonRegisteringDriver.<clinit>(NonRegisteringDriver.java:97) ~[?:?]
	at java.lang.Class.forName0(Native Method) ~[?:1.8.0_261]
	at java.lang.Class.forName(Unknown Source) ~[?:1.8.0_261]
	at org.wltea.analyzer.dic.DatabaseMonitor.lambda$new$0(DatabaseMonitor.java:72) ~[?:?]
	at java.security.AccessController.doPrivileged(Native Method) ~[?:1.8.0_261]
	at org.wltea.analyzer.dic.DatabaseMonitor.<init>(DatabaseMonitor.java:70) ~[?:?]
	at org.wltea.analyzer.dic.Dictionary.initial(Dictionary.java:172) ~[?:?]
	at org.wltea.analyzer.cfg.Configuration.<init>(Configuration.java:40) ~[?:?]
	at org.elasticsearch.index.analysis.IkTokenizerFactory.<init>(IkTokenizerFactory.java:15) ~[?:?]
	at org.elasticsearch.index.analysis.IkTokenizerFactory.getIkSmartTokenizerFactory(IkTokenizerFactory.java:23) ~[?:?]
	at org.elasticsearch.index.analysis.AnalysisRegistry.buildMapping(AnalysisRegistry.java:379) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.index.analysis.AnalysisRegistry.buildTokenizerFactories(AnalysisRegistry.java:189) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.index.analysis.AnalysisRegistry.build(AnalysisRegistry.java:163) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.index.IndexService.<init>(IndexService.java:164) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.index.IndexModule.newIndexService(IndexModule.java:402) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.indices.IndicesService.createIndexService(IndicesService.java:526) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.indices.IndicesService.verifyIndexMetadata(IndicesService.java:599) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.gateway.Gateway.performStateRecovery(Gateway.java:129) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.gateway.GatewayService$1.doRun(GatewayService.java:227) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:751) ~[elasticsearch-6.7.2.jar:6.7.2]
	at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) ~[elasticsearch-6.7.2.jar:6.7.2]
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:1.8.0_261]
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:1.8.0_261]
	at java.lang.Thread.run(Unknown Source) [?:1.8.0_261]
Caused by: java.security.AccessControlException: access denied ("java.lang.RuntimePermission" "setContextClassLoader")
	at java.security.AccessControlContext.checkPermission(Unknown Source) ~[?:1.8.0_261]
	at java.security.AccessController.checkPermission(Unknown Source) ~[?:1.8.0_261]
	at java.lang.SecurityManager.checkPermission(Unknown Source) ~[?:1.8.0_261]
	at java.lang.Thread.setContextClassLoader(Unknown Source) ~[?:1.8.0_261]
	at com.mysql.cj.jdbc.AbandonedConnectionCleanupThread.lambda$static$0(AbandonedConnectionCleanupThread.java:72) ~[?:?]
	at java.util.concurrent.ThreadPoolExecutor$Worker.<init>(Unknown Source) ~[?:1.8.0_261]
	at java.util.concurrent.ThreadPoolExecutor.addWorker(Unknown Source) ~[?:1.8.0_261]
	at java.util.concurrent.ThreadPoolExecutor.execute(Unknown Source) ~[?:1.8.0_261]
	at java.util.concurrent.Executors$DelegatedExecutorService.execute(Unknown Source) ~[?:1.8.0_261]
	at com.mysql.cj.jdbc.AbandonedConnectionCleanupThread.<clinit>(AbandonedConnectionCleanupThread.java:75) ~[?:?]
	... 26 more

修改 Dictionary

  • 在构造方法中加载 jdbc.properties 文件
  • getProperty()改为 public
  • 添加了几个方法,用于增删词条
  • initial()启动自己实现的数据库监控线程
	private Dictionary(Configuration cfg) {
		this.configuration = cfg;
		this.props = new Properties();
		this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
		Path configFile = conf_dir.resolve(FILE_NAME);

		InputStream input = null;
		try {
			logger.info("try load config from {}", configFile);
			input = new FileInputStream(configFile.toFile());
		} catch (FileNotFoundException e) {
			conf_dir = cfg.getConfigInPluginDir();
			configFile = conf_dir.resolve(FILE_NAME);
			try {
				logger.info("try load config from {}", configFile);
				input = new FileInputStream(configFile.toFile());
			} catch (FileNotFoundException ex) {
				// We should report origin exception
				logger.error("ik-analyzer", e);
			}
		}
		if (input != null) {
			try {
				props.loadFromXML(input);
			} catch (IOException e) {
				logger.error("ik-analyzer", e);
			}
		}

		// 加载 jdbc.properties 文件
		loadJdbcProperties();
	}
	
	public String getProperty(String key){
		if(props!=null){
			return props.getProperty(key);
		}
		return null;
	}

	/**
	 * 加载新词条
	 */
	public static void addWord(String word) {
		singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 移除(屏蔽)词条
	 */
	public static void disableWord(String word) {
		singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 加载新停用词
	 */
	public static void addStopword(String word) {
		singleton._StopWords.fillSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 移除(屏蔽)停用词
	 */
	public static void disableStopword(String word) {
		singleton._StopWords.disableSegment(word.trim().toLowerCase().toCharArray());
	}

	/**
	 * 加载 jdbc.properties
	 */
	public void loadJdbcProperties() {
		Path file = PathUtils.get(getDictRoot(), DatabaseMonitor.PATH_JDBC_PROPERTIES);
		try {
			props.load(new FileInputStream(file.toFile()));
			logger.info("====================================properties====================================");
			for (Map.Entry<Object, Object> entry : props.entrySet()) {
				logger.info("{}: {}", entry.getKey(), entry.getValue());
			}
			logger.info("====================================properties====================================");
		} catch (IOException e) {
			logger.error("failed to read file: " + DatabaseMonitor.PATH_JDBC_PROPERTIES, e);
		}
	}
	
	public static synchronized void initial(Configuration cfg) {
		if (singleton == null) {
			synchronized (Dictionary.class) {
				if (singleton == null) {

					singleton = new Dictionary(cfg);
					singleton.loadMainDict();
					singleton.loadSurnameDict();
					singleton.loadQuantifierDict();
					singleton.loadSuffixDict();
					singleton.loadPrepDict();
					singleton.loadStopWordDict();

					if(cfg.isEnableRemoteDict()){
						for (String location : singleton.getRemoteExtDictionarys()) {
							pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
						}
						for (String location : singleton.getRemoteExtStopWordDictionarys()) {
							pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
						}
					}
					
					// 建立数据库监控线程
					pool.scheduleAtFixedRate(new DatabaseMonitor(), 10, Long.parseLong(getSingleton().getProperty(DatabaseMonitor.JDBC_UPDATE_INTERVAL)), TimeUnit.SECONDS);
				}
			}
		}
	}

MySQL 热更新的实现类 DatabaseMonitor

  • lastUpdateTimeOfMainDic、lastUpdateTimeOfStopword 记录上次处理的最后一条的updateTime
  • 查出上次处理之后新增或删除的记录
  • 循环判断 is_deleted 字段,为true则添加词条,false则删除词条
package org.wltea.analyzer.dic;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.SpecialPermission;
import org.wltea.analyzer.help.ESPluginLoggerFactory;

import java.security.AccessController;
import java.security.PrivilegedAction;
import java.sql.*;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;

/**
 * 通过 mysql 更新词典
 *
 * @author 赵丙双
 */
public class DatabaseMonitor implements Runnable {

    private static final Logger logger = ESPluginLoggerFactory.getLogger(DatabaseMonitor.class.getName());
    public static final String PATH_JDBC_PROPERTIES = "jdbc.properties";

    private static final String JDBC_URL = "jdbc.url";
    private static final String JDBC_USERNAME = "jdbc.username";
    private static final String JDBC_PASSWORD = "jdbc.password";
    private static final String JDBC_DRIVER = "jdbc.driver";
    private static final String SQL_UPDATE_MAIN_DIC = "jdbc.update.main.dic.sql";
    private static final String SQL_UPDATE_STOPWORD = "jdbc.update.stopword.sql";
    /**
     * 更新间隔
     */
    public final static  String JDBC_UPDATE_INTERVAL = "jdbc.update.interval";

    private static final Timestamp DEFAULT_LAST_UPDATE = Timestamp.valueOf(LocalDateTime.of(LocalDate.of(2020, 1, 1), LocalTime.MIN));

    private static Timestamp lastUpdateTimeOfMainDic = null;

    private static Timestamp lastUpdateTimeOfStopword = null;

    public String getUrl() {
        return Dictionary.getSingleton().getProperty(JDBC_URL);
    }

    public String getUsername() {
        return Dictionary.getSingleton().getProperty(JDBC_USERNAME);
    }

    public String getPassword() {
        return Dictionary.getSingleton().getProperty(JDBC_PASSWORD);
    }

    public String getDriver() {
        return Dictionary.getSingleton().getProperty(JDBC_DRIVER);
    }

    public String getUpdateMainDicSql() {
        return Dictionary.getSingleton().getProperty(SQL_UPDATE_MAIN_DIC);
    }

    public String getUpdateStopwordSql() {
        return Dictionary.getSingleton().getProperty(SQL_UPDATE_STOPWORD);
    }

    /**
     * 加载MySQL驱动
     */
    public DatabaseMonitor() {
        SpecialPermission.check();
        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
            try {
                Class.forName(getDriver());
            } catch (ClassNotFoundException e) {
                logger.error("mysql jdbc driver not found", e);
            }
            return null;
        });


    }

    @Override
    public void run() {
        SpecialPermission.check();
        AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
            Connection conn = getConnection();

            // 更新主词典
            updateMainDic(conn);
            // 更新停用词
            updateStopword(conn);
            closeConnection(conn);
            
            return null;
        });

    }

    public Connection getConnection() {
        Connection connection = null;
        try {
            connection = DriverManager.getConnection(getUrl(), getUsername(), getPassword());
        } catch (SQLException e) {
            logger.error("failed to get connection", e);
        }
        return connection;
    }

    public void closeConnection(Connection conn) {
        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                logger.error("failed to close Connection", e);
            }
        }
    }

    public void closeRsAndPs(ResultSet rs, PreparedStatement ps) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                logger.error("failed to close ResultSet", e);
            }
        }

        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException e) {
                logger.error("failed to close PreparedStatement", e);
            }
        }

    }

    /**
     * 主词典
     */
    public synchronized void updateMainDic(Connection conn) {

        logger.info("start update main dic");
        int numberOfAddWords = 0;
        int numberOfDisableWords = 0;
        PreparedStatement ps = null;
        ResultSet rs = null;

        try {
            String sql = getUpdateMainDicSql();

            Timestamp param = lastUpdateTimeOfMainDic == null ? DEFAULT_LAST_UPDATE : lastUpdateTimeOfMainDic;

            logger.info("param: " + param);

            ps = conn.prepareStatement(sql);
            ps.setTimestamp(1, param);

            rs = ps.executeQuery();

            while (rs.next()) {
                String word = rs.getString("word");
                word = word.trim();

                if (word.isEmpty()) {
                    continue;
                }

                lastUpdateTimeOfMainDic = rs.getTimestamp("update_time");

                if (rs.getBoolean("is_deleted")) {
                    logger.info("[main dic] disable word: {}", word);
                    // 删除
                    Dictionary.disableWord(word);
                    numberOfDisableWords++;
                } else {
                    logger.info("[main dic] add word: {}", word);
                    // 添加
                    Dictionary.addWord(word);
                    numberOfAddWords++;
                }
            }

            logger.info("end update main dic -> addWord: {}, disableWord: {}", numberOfAddWords, numberOfDisableWords);

        } catch (SQLException e) {
            logger.error("failed to update main_dic", e);
            // 关闭 ResultSet、PreparedStatement
            closeRsAndPs(rs, ps);
        }
    }

    /**
     * 停用词
     */
    public synchronized void updateStopword(Connection conn) {

        logger.info("start update stopword");

        int numberOfAddWords = 0;
        int numberOfDisableWords = 0;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try {
            String sql = getUpdateStopwordSql();

            Timestamp param = lastUpdateTimeOfStopword == null ? DEFAULT_LAST_UPDATE : lastUpdateTimeOfStopword;

            logger.info("param: " + param);

            ps = conn.prepareStatement(sql);
            ps.setTimestamp(1, param);

            rs = ps.executeQuery();

            while (rs.next()) {
                String word = rs.getString("word");
                word = word.trim();


                if (word.isEmpty()) {
                    continue;
                }

                lastUpdateTimeOfStopword = rs.getTimestamp("update_time");

                if (rs.getBoolean("is_deleted")) {
                    logger.info("[stopword] disable word: {}", word);

                    // 删除
                    Dictionary.disableStopword(word);
                    numberOfDisableWords++;
                } else {
                    logger.info("[stopword] add word: {}", word);
                    // 添加
                    Dictionary.addStopword(word);
                    numberOfAddWords++;
                }
            }

            logger.info("end update stopword -> addWord: {}, disableWord: {}", numberOfAddWords, numberOfDisableWords);

        } catch (SQLException e) {
            logger.error("failed to update main_dic", e);
        } finally {
            // 关闭 ResultSet、PreparedStatement
            closeRsAndPs(rs, ps);
        }
    }
}

打包测试

直接mvn package,然后在 elasticsearch-analysis-ik/target/releases目录中找到 elasticsearch-analysis-ik-6.7.2.zip 压缩包,直接解压到 ES 自己的 plugins 目录即可。

参考

  1. 修改ES IK插件源码,配合MySQL实现词库热更新
  2. ElasticSearch 重写IK分词器源码设置mysql热词更新词库
  3. Elasticsearch7.8.0集成IK分词器改源码实现MySql5.7.2实现动态词库实时更新
  4. Elasticsearch5.5.1插件开发指南
  • 3
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值