【基于Python版本的连续英文分词,实现Java版本的英文分词器】

连续英文分词器java版本


在搜索领域,用户的输入千奇百怪,有时用户输入的是没有空格的连续英文(例如 helloworld)。如果不能有效地对其进行切分,搜索召回的效果可能会比较差,所以需要针对连续英文进行分词,主要有以下几个步骤:
1:定义词典
2:构建英文词典
3:切词

定义词典,

这里的词典就是一个文本文件,每行一个单词。构建词典时会根据单词所在的行号计算代价(cost),越靠前的单词代价越小、越优先被切出,因此应把越常用的单词放在越前面。格式如下:
leagues
fossil
microsoft
property
depending
overall
universities
appearance

构建词典

代码如下:
package org.wltea.analyzer.custom;

import org.wltea.analyzer.cfg.Configuration;//这里用了IK里的一个配置文件,主要是获取词典路径,也可以不用,自己定义个路径就行

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**

  • 自定义的连续英文分词

  • */
    public class ENDictionary {

    /**

    • 英文词典单例
    • */
      private static ENDictionary singleton;

    /**

    • 英文词典对象
    • */
      private Dictionary _ENMainDict;

    /**

    • 词的最大长度
    • */
      private Integer maxLength = 0;

    private Configuration cfg;

    private ENDictionary(Configuration cfg){
    this.cfg = cfg;
    this.loadMainDict();
    }

    /**

    • 词典初始化
    • 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
    • 只有当Dictionary类被实际调用时,才会开始载入词典,
    • 这将延长首次分词操作的时间
    • 该方法提供了一个在应用加载阶段就初始化字典的手段
      */
      public static void initial(Configuration cfg){
      if(singleton == null){
      synchronized (ENDictionary.class){
      if(singleton == null){
      singleton = new ENDictionary(cfg);
      }
      }
      }
      }

    /**

    • 获取词典单子实例
    • @return ENDictionary 单例对象
      */
      public static ENDictionary getSingleton(){
      if(singleton == null){
      throw new IllegalStateException(“英文词典尚未初始化,请先调用initial方法”);
      }
      return singleton;
      }

    /**

    • 批量增加英文词典
    • */

    private void loadMainDict(){
    _ENMainDict = new Hashtable<String,Double>();
    List diclist = new ArrayList<>();
    InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getENDictionary());
    if(is == null){
    throw new RuntimeException(“英文词典没有发现!”);
    }
    try {
    BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
    String theword;
    //收集数据
    do{
    theword = br.readLine();
    if(theword!=null && !“”.equals(theword.trim())){
    diclist.add(theword);
    Integer length = theword.length();
    if(maxLength<length){ //获取最大词的长度
    maxLength = length;
    }
    }
    }while (theword!=null);
    //迭代数据,计算每个词cost,形成字典
    fillDictinary(_ENMainDict,diclist);
    }catch (IOException ioe){
    System.err.println(“英文词典加载异常”);
    ioe.printStackTrace();
    }finally {
    try {
    is.close();
    } catch (IOException e) {
    e.printStackTrace();
    }
    }
    }

    /**

    • 字典填充方法
    • */

    private void fillDictinary(Dictionary dic,List diclist){
    Integer wordCount = diclist.size();
    for(int i=0;i<wordCount;i++){
    String word = diclist.get(i);
    Double cost =cost(i, wordCount);
    dic.put(word, cost);
    }
    }

    /**

    • 计算每个词的损失
    • @param i:单词在字典里位置,从0开始
    • @param wordCount:词典里总单词数
    • @return :返回单词的损失值
    • */
      private double cost(Integer i,Integer wordCount){
      return Math.log((i+1)*Math.log(wordCount));
      }

    public Dictionary get_ENMainDict() {
    return _ENMainDict;
    }

    public Integer getMaxLength() {
    return maxLength;
    }

}

切词实现

package org.wltea.analyzer.custom;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Splits runs of unspaced English text into dictionary words.
 *
 * <p>The input is cut into "English" runs (ASCII letters, digits and
 * apostrophes) and the separators between them. Each English run is segmented
 * by choosing, for every prefix, the split with the lowest total word cost;
 * separators are passed through unchanged. The entry point is
 * {@link #transFormSegmenter(String)}.
 *
 * <p>{@code ENDictionary.initial(...)} must have been called before an
 * instance is created, because the dictionary singleton is read in the field
 * initializers.
 */
public class ENWordSegmenter {

    /** Matches every run of characters that is not an ASCII letter, digit or apostrophe. */
    private static final Pattern NON_ENGLISH = Pattern.compile("[^a-zA-Z0-9']+");

    /** Cost charged for a candidate token that is not in the dictionary. */
    private static final double UNKNOWN_WORD_COST = 9e99;

    private final ENDictionary dic = ENDictionary.getSingleton();

    /** Word-to-cost table taken from the dictionary. */
    private final Dictionary<String, Double> _MainDic = dic.get_ENMainDict();

    /** Length of the longest dictionary word; bounds how far back a candidate token may start. */
    private final int maxLength = dic.getMaxLength();

    /**
     * Collects every separator run, i.e. every maximal run of characters that
     * are not ASCII letters, digits or apostrophes (Chinese text, spaces,
     * punctuation, ...), in order of appearance.
     */
    private List<String> getChinese(String word) {
        Matcher matcher = NON_ENGLISH.matcher(word);
        List<String> list = new ArrayList<>();
        while (matcher.find()) {
            list.add(matcher.group(0));
        }
        return list;
    }

    /**
     * Returns the English runs found between the separator runs. Empty leading
     * and trailing pieces are kept (limit -1), so the result always has exactly
     * one more element than {@link #getChinese(String)}.
     */
    private String[] getEnglish(String word) {
        return NON_ENGLISH.split(word, -1);
    }

    /**
     * Finds the cheapest way to end a token at position {@code i}.
     *
     * @param i    exclusive end index of the token, at least 1
     * @param cost cost[j] is the best cost of segmenting the first j characters; entries below i must be filled
     * @param word the English run being segmented
     * @return the lowest total cost and the length of the last token that achieves it
     */
    private CostTuple best_macth(int i, double[] cost, String word) {
        // Look back at least one character: with an empty dictionary maxLength is 0,
        // and without this floor there would be no candidate and no valid token length.
        int start = Math.max(0, i - Math.max(1, maxLength));

        double bestCost = Double.MAX_VALUE;
        int bestLength = 1;
        // k + 1 is the candidate token length; shorter tokens are tried first and win ties.
        for (int k = 0; k < i - start; k++) {
            // Locale.ROOT keeps the lookup independent of the JVM default locale.
            String candidate = word.substring(i - k - 1, i).toLowerCase(Locale.ROOT);
            Double wordCost = _MainDic.get(candidate);
            double total = cost[i - k - 1] + (wordCost == null ? UNKNOWN_WORD_COST : wordCost);
            if (total < bestCost) {
                bestCost = total;
                bestLength = k + 1;
            }
        }
        return new CostTuple(bestCost, bestLength);
    }

    /**
     * Segments one English run.
     *
     * @param word a run of ASCII letters, digits and apostrophes
     * @return the tokens in reverse order (last token first)
     */
    private List<String> split(String word) {
        int length = word.length();
        // Local scratch table, so that concurrent or nested calls cannot interfere.
        double[] cost = new double[length + 1];
        cost[0] = 0D;
        for (int i = 1; i <= length; i++) {
            cost[i] = best_macth(i, cost, word).cost;
        }

        // Walk back from the end, recovering the token chosen at each position.
        List<String> outList = new ArrayList<>();
        while (length > 0) {
            CostTuple back = best_macth(length, cost, word);
            assert back.cost == cost[length];
            String subword = word.substring(length - back.wordlength, length);
            boolean newToken = true;
            // A lone apostrophe is never merged into the following token.
            if (!subword.equals("'") && !outList.isEmpty()) {
                int last = outList.size() - 1;
                String lastString = outList.get(last);
                // Re-attach a split-off "'s" and keep adjacent digits together.
                boolean digitRun = Character.isDigit(word.charAt(length - 1))
                        && Character.isDigit(lastString.charAt(0));
                if (lastString.equals("'s") || digitRun) {
                    outList.set(last, subword + lastString);
                    newToken = false;
                }
            }
            if (newToken) {
                outList.add(subword);
            }
            length -= back.wordlength;
        }
        return outList;
    }

    /**
     * Segments {@code words} and returns the tokens joined by spaces.
     *
     * <p>English runs are split into dictionary words; separator runs are kept
     * as single tokens in their original position, except that whitespace-only
     * separators are dropped. Every emitted piece is followed by a space.
     *
     * @param words the text to segment
     * @return the space-separated tokens
     * @throws IOException never thrown by this implementation; kept for source compatibility with callers
     */
    public String transFormSegmenter(String words) throws IOException {
        List<String> cnWords = this.getChinese(words);
        String[] enwords = this.getEnglish(words);
        assert cnWords.size() + 1 == enwords.length;

        // Interleave: english[0], separator[0], english[1], separator[1], ..., english[n].
        List<String> resultList = new ArrayList<>(enwords.length + cnWords.size());
        for (int i = 0; i < enwords.length; i++) {
            resultList.add(enlistToString(split(enwords[i])));
            if (i < cnWords.size()) {
                resultList.add(cnWords.get(i));
            }
        }
        return listToString(resultList);
    }

    /**
     * Joins the tokens of one English run. The list is in reverse order, so it
     * is read back to front; blank tokens are skipped and every token is
     * followed by a space.
     */
    private static String enlistToString(List<String> list) {
        StringBuilder joined = new StringBuilder();
        for (int i = list.size() - 1; i >= 0; i--) {
            String str = list.get(i);
            if (!"".equals(str.trim())) {
                joined.append(str).append(' ');
            }
        }
        return joined.toString();
    }

    /** Joins the pieces in order, skipping blank ones; every piece is followed by a space. */
    private static String listToString(List<String> list) {
        StringBuilder joined = new StringBuilder();
        for (String str : list) {
            if (!"".equals(str.trim())) {
                joined.append(str).append(' ');
            }
        }
        return joined.toString();
    }

    /** Result of {@link #best_macth}: the lowest total cost and the length of the last token. */
    private static final class CostTuple {
        private final double cost;
        private final int wordlength;

        CostTuple(double cost, int wordlength) {
            this.cost = cost;
            this.wordlength = wordlength;
        }
    }

}

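整个类的调用入口函数是 transFormSegmenter;调用前需要先执行 ENDictionary.initial(cfg) 完成词典初始化,否则 ENDictionary.getSingleton() 会抛出 IllegalStateException。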

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值