HanLP无法动态加载、重载停用词的自定义处理

在使用HanLP框架的过程中,发现其内置的CoreStopWordDictionary类只有删除、添加方法,没有动态重载的方法。而项目的需求却是要能够动态加载停用词;用它内置的方法先全部删掉,再一个个添加,直观上感觉不够优雅……所以准备重新写一个服务来做停用词的加载。
新建服务类:

CoreStopwordService

首先,HanLP的停用词是从其资源包中的stopword.txt读取并加载的,加载逻辑位于CoreStopWordDictionary的static静态块中:

// Static initializer quoted from HanLP's CoreStopWordDictionary: it fills the shared
// `dictionary` once at class-load time and installs the default term FILTER.
static {
        // Try the binary cache "<stop-word path>.bin" first.
        ByteArray byteArray = ByteArray.createByteArray(Config.CoreStopWordDictionaryPath + ".bin");
        if (byteArray == null) {
            // No usable cache was returned (presumably missing or unreadable — confirm against
            // ByteArray.createByteArray): build the dictionary from the text file and write the cache.
            try {
                dictionary = new StopWordDictionary(Config.CoreStopWordDictionaryPath);
                DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(Config.CoreStopWordDictionaryPath + ".bin"));
                dictionary.save(out);
                // NOTE(review): `out` is not closed if save() throws.
                out.close();
            } catch (Exception var2) {
                // Any failure is logged and rethrown as a RuntimeException (the original cause is not
                // attached), which aborts the static initialization.
                Predefine.logger.severe("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败" + TextUtility.exceptionToString(var2));
                throw new RuntimeException("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败");
            }
        } else {
            // Cache available: load the dictionary straight from the binary data.
            dictionary = new StopWordDictionary();
            dictionary.load(byteArray);
        }

        // Default filter: decides from the first character of the term's nature (part-of-speech tag)
        // and from the stop-word dictionary whether a term is kept.
        FILTER = new Filter() {
            public boolean shouldInclude(Term term) {
                // A term without a nature gets the placeholder "空", which ends up in the default branch.
                String nature = term.nature != null ? term.nature.toString() : "空";
                char firstChar = nature.charAt(0);
                switch(firstChar) {
                // Natures starting with these letters are always excluded.
                case 'b':
                case 'c':
                case 'e':
                case 'm':
                case 'o':
                case 'p':
                case 'q':
                case 'r':
                case 'u':
                case 'w':
                case 'y':
                case 'z':
                    return false;
                // These labels fall through to `default`; they behave the same as any unlisted letter.
                case 'd':
                case 'f':
                case 'g':
                case 'h':
                case 'i':
                case 'j':
                case 'k':
                case 'l':
                case 'n':
                case 's':
                case 't':
                case 'v':
                case 'x':
                default:
                    // Everything else is kept unless the word itself is in the stop-word dictionary.
                    return !CoreStopWordDictionary.contains(term.word);
                }
            }
        };
    }

将其改成在新服务的PostConstruct中加载:

/**
 * Loads the stop-word dictionary into the static {@code dictionary} field.
 *
 * <p>The binary cache {@code <stop-word path>.bin} is preferred. When no cache is
 * available the text file is parsed and the cache is regenerated from it. An empty
 * text file yields an empty dictionary.
 *
 * @throws RuntimeException if the text file cannot be read or the cache cannot be
 *     written; the underlying exception is attached as the cause
 */
@PostConstruct
public void load() {
  String textPath = Config.CoreStopWordDictionaryPath;
  String cachePath = textPath + ".bin";

  ByteArray byteArray = ByteArray.createByteArray(cachePath);
  if (byteArray != null) {
    // Fill a local instance first so the shared field never exposes a half-loaded dictionary.
    StopWordDictionary cached = new StopWordDictionary();
    cached.load(byteArray);
    dictionary = cached;
    return;
  }

  try {
    if (Files.size(Paths.get(textPath)) == 0) {
      // Nothing to parse: publish an empty dictionary instead of leaving the field unset,
      // which would make every later contains/add/remove call fail with an NPE.
      dictionary = new StopWordDictionary();
      return;
    }
    StopWordDictionary parsed = new StopWordDictionary(textPath);
    // try-with-resources closes the cache stream even when save() throws.
    try (DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(cachePath))) {
      parsed.save(out);
    }
    dictionary = parsed;
  } catch (Exception e) {
    Predefine.logger.severe("载入停用词词典" + textPath + "失败" + TextUtility.exceptionToString(e));
    // Keep the cause so the real failure is visible in the stack trace.
    throw new RuntimeException("载入停用词词典" + textPath + "失败", e);
  }
}

其他方法不变,再添加一个reload方法:

/**
 * Rebuilds the stop-word dictionary from the stop words stored for the effective project.
 *
 * <p>The words are written to the stop-word text file (one per line), the stale binary
 * cache is removed and {@code load()} is called to rebuild the dictionary. If no stop
 * words are found, or the files cannot be updated, the reload is aborted and whatever
 * is currently loaded stays in effect.
 */
@Transactional(readOnly = true)
public void reload() {
  log.info("开始重新加载停用词库...");

  // Drain the repository stream first so it is closed before any file I/O starts.
  String text;
  try (Stream<Stopword> stream = stopwordRepository.findByProject_Id(customDictionaryService.getEffectiveProjectId())) {
    text = stream.map(Stopword::getWord).collect(Collectors.joining("\n"));
  }
  if (StringUtils.isEmpty(text)) {
    log.info("停用词库为空,停止加载");
    return;
  }

  String textPath = Config.CoreStopWordDictionaryPath;
  try {
    // Write UTF-8 explicitly: HanLP is believed to read its dictionary files as UTF-8
    // (TODO confirm for the HanLP version in use), whereas getBytes() without a charset
    // uses the platform default.
    Files.write(Paths.get(textPath), text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    // The cache may legitimately be absent; deleteIfExists does not treat that as an error.
    Files.deleteIfExists(Paths.get(textPath + ".bin"));
  } catch (IOException e) {
    // Do not reload from a half-updated file or a stale cache.
    log.error("更新停用词词典文件{}失败,保留当前停用词库", textPath, e);
    return;
  }

  load();
  log.info("停用词库加载完成...");
}

stopwordRepository.findByProject_Id(customDictionaryService.getEffectiveProjectId())是我用JPA加载数据的方法,这里可以替换成你项目中的数据查询方法。
这样就可以在项目中任何位置重载停用词了。
CoreStopwordService的完整代码如下:

/**
 * @description: 自定义停用词服务
 * @author: chenyang
 * @create: 2018-12-14
 **/
@Slf4j
@Service
public class CoreStopwordService {

  static StopWordDictionary dictionary;
  public static Filter FILTER;
  public final StopwordRepository stopwordRepository;
  private final CustomDictionaryService customDictionaryService;

  public CoreStopwordService(StopwordRepository stopwordRepository, CustomDictionaryService customDictionaryService) {
    this.stopwordRepository = stopwordRepository;
    this.customDictionaryService = customDictionaryService;
  }

  static {
    FILTER = new Filter() {
      public boolean shouldInclude(Term term) {
        String nature = term.nature != null ? term.nature.toString() : "空";
        char firstChar = nature.charAt(0);
        switch(firstChar) {
          case 'b':
          case 'c':
          case 'e':
          case 'm':
          case 'o':
          case 'p':
          case 'q':
          case 'r':
          case 'u':
          case 'w':
          case 'y':
          case 'z':
            return false;
          case 'd':
          case 'f':
          case 'g':
          case 'h':
          case 'i':
          case 'j':
          case 'k':
          case 'l':
          case 'n':
          case 's':
          case 't':
          case 'v':
          case 'x':
          default:
            return !CoreStopWordDictionary.contains(term.word);
        }
      }
    };
  }

  @Transactional(readOnly = true)
  public void reload() {
    log.info("开始重新加载停用词库...");
    try (Stream<Stopword> stream = stopwordRepository.findByProject_Id(customDictionaryService.getEffectiveProjectId())){
      String text = stream.map(Stopword::getWord).collect(Collectors.joining("\n"));
      if (StringUtils.isEmpty(text)){
        log.info("停用词库为空,停止加载");
        return ;
      }
      try {
        Files.write(Paths.get(Config.CoreStopWordDictionaryPath),text.getBytes());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    try {
      Files.delete(Paths.get(Config.CoreStopWordDictionaryPath + ".bin"));
    } catch (IOException e) {
      e.printStackTrace();
    }
    load();
    log.info("停用词库加载完成...");
  }

  public static boolean contains(String key) {
      return dictionary.contains(key);
  }

  public static boolean shouldInclude(Term term) {
      return FILTER.shouldInclude(term);
  }

  public static boolean shouldRemove(Term term) {
      return !shouldInclude(term);
  }

  public static boolean add(String stopWord) {
      return dictionary.add(stopWord);
  }

  public static boolean remove(String stopWord) {
      return dictionary.remove(stopWord);
  }

  public static void apply(List<Term> termList) {
      ListIterator listIterator = termList.listIterator();

      while(listIterator.hasNext()) {
          if (shouldRemove((Term)listIterator.next())) {
              listIterator.remove();
          }
      }

  }

  @PostConstruct
  public void load() {

    ByteArray byteArray = ByteArray.createByteArray(Config.CoreStopWordDictionaryPath + ".bin");
    if (byteArray == null) {
      try {
        if (Files.size(Paths.get(Config.CoreStopWordDictionaryPath)) == 0){
          return ;
        }
        dictionary = new StopWordDictionary(Config.CoreStopWordDictionaryPath);
        DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(Config.CoreStopWordDictionaryPath + ".bin"));
        dictionary.save(out);
        out.close();
      } catch (Exception var2) {
        Predefine.logger.severe("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败" + TextUtility.exceptionToString(var2));
        throw new RuntimeException("载入停用词词典" + Config.CoreStopWordDictionaryPath + "失败");
      }
    } else {
      dictionary = new StopWordDictionary();
      dictionary.load(byteArray);
    }
  }
}

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值