【基于python版本的连续英文分词实现java版本的英文分词器】

最新推荐文章于 2023-10-13 15:41:08 发布

乌云风

最新推荐文章于 2023-10-13 15:41:08 发布

阅读量462

点赞数 1

分类专栏：英文分词、自然语言　文章标签：java、中文分词、搜索引擎

本文链接：https://blog.csdn.net/itbigpig/article/details/127026801

版权

英文分词同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

自然语言

1 篇文章 0 订阅

订阅专栏

连续英文分词器java版本

定义词典
构建词典
切词实现

在搜索领域，用户的输入是千奇百怪的，有时候用户输入的是连续的英文（单词之间没有空格），如果不能有效地进行切分，那么搜索召回的效果可能会比较差，所以我们需要针对连续英文进行分词，主要有以下几个步骤：
1：定义词典
2：构建英文词典
3：切词

定义词典

这里的词典就是一个文本文件，每行一个单词；单词在文件中的位置越靠前，后面 cost 方法算出的损失值越小，切词时也就越优先被选中。格式如下：
leagues
fossil
microsoft
property
depending
overall
universities
appearance

构建词典

代码如下：
package org.wltea.analyzer.custom;

import org.wltea.analyzer.cfg.Configuration;//这里用了IK里的一个配置文件，主要是获取词典路径，也可以不用，自己定义个路径就行

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**

自定义的连续英文分词
*/
public class ENDictionary {

/**
- 英文词典单例
- */
  private static ENDictionary singleton;
/**
- 英文词典对象
- */
  private Dictionary _ENMainDict;
/**
- 词的最大长度
- */
  private Integer maxLength = 0;
private Configuration cfg;

private ENDictionary(Configuration cfg){
this.cfg = cfg;
this.loadMainDict();
}

/**
- 词典初始化
- 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
- 只有当Dictionary类被实际调用时，才会开始载入词典，
- 这将延长首次分词操作的时间
- 该方法提供了一个在应用加载阶段就初始化字典的手段
  */
  public static void initial(Configuration cfg){
  if(singleton == null){
  synchronized (ENDictionary.class){
  if(singleton == null){
  singleton = new ENDictionary(cfg);
  }
  }
  }
  }
/**
- 获取词典单子实例
- @return ENDictionary 单例对象
  */
  public static ENDictionary getSingleton(){
  if(singleton == null){
  throw new IllegalStateException(“英文词典尚未初始化，请先调用initial方法”);
  }
  return singleton;
  }
/**
- 批量增加英文词典
- */
private void loadMainDict(){
_ENMainDict = new Hashtable<String,Double>();
List diclist = new ArrayList<>();
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getENDictionary());
if(is == null){
throw new RuntimeException(“英文词典没有发现!”);
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theword;
//收集数据
do{
theword = br.readLine();
if(theword!=null && !“”.equals(theword.trim())){
diclist.add(theword);
Integer length = theword.length();
if(maxLength<length){ //获取最大词的长度
maxLength = length;
}
}
}while (theword!=null);
//迭代数据，计算每个词cost,形成字典
fillDictinary(_ENMainDict,diclist);
}catch (IOException ioe){
System.err.println(“英文词典加载异常”);
ioe.printStackTrace();
}finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}

/**
- 字典填充方法
- */
private void fillDictinary(Dictionary dic,List diclist){
Integer wordCount = diclist.size();
for(int i=0;i<wordCount;i++){
String word = diclist.get(i);
Double cost =cost(i, wordCount);
dic.put(word, cost);
}
}

/**
- 计算每个词的损失
- @param i:单词在字典里位置，从0开始
- @param wordCount:词典里总单词数
- @return :返回单词的损失值
- */
  private double cost(Integer i,Integer wordCount){
  return Math.log((i+1)*Math.log(wordCount));
  }
public Dictionary get_ENMainDict() {
return _ENMainDict;
}

public Integer getMaxLength() {
return maxLength;
}

}

切词实现

package org.wltea.analyzer.custom;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Splits runs of unspaced English text into dictionary words.
 *
 * <p>The input is cut into chunks of ASCII letters, digits and apostrophes. Each chunk is
 * segmented by dynamic programming so that the summed cost of its words, as stored in
 * {@link ENDictionary}, is minimal. Everything between the chunks is passed through unchanged.
 * The entry point is {@link #transFormSegmenter(String)}.
 */
public class ENWordSegmenter {

    /** Matches every run of characters that is not an ASCII letter, digit or apostrophe. */
    private static final Pattern NON_ENGLISH = Pattern.compile("[^a-zA-Z0-9']+");

    /** Cost charged to a candidate substring that is not in the dictionary. */
    private static final double UNKNOWN_WORD_COST = 9e99;

    private final ENDictionary dic = ENDictionary.getSingleton();

    /** Word-to-cost table taken from the dictionary. */
    private final Dictionary<String, Double> _MainDic = dic.get_ENMainDict();

    /** Length of the longest dictionary word; bounds how far back a candidate word may start. */
    private final int maxLength = dic.getMaxLength();

    /**
     * Collects every run of characters that is not an ASCII letter, digit or apostrophe.
     *
     * <p>Despite the method name, the runs are not limited to Chinese characters; whitespace and
     * punctuation are collected as well.
     *
     * @param word text to scan
     * @return the runs in order of appearance
     */
    private List<String> getChinese(String word) {
        Matcher matcher = NON_ENGLISH.matcher(word);
        List<String> list = new ArrayList<>();
        while (matcher.find()) {
            list.add(matcher.group(0));
        }
        return list;
    }

    /**
     * Returns the chunks of ASCII letters, digits and apostrophes in {@code word}.
     *
     * <p>Empty leading and trailing chunks are kept (split limit -1), so the result always has
     * exactly one more element than {@link #getChinese(String)} returns for the same text.
     *
     * @param word text to split
     * @return the chunks in order of appearance
     */
    private String[] getEnglish(String word) {
        return NON_ENGLISH.split(word, -1);
    }

    /**
     * Finds the cheapest last word for the prefix {@code word.substring(0, i)}.
     *
     * <p>Candidate lengths are tried from 1 upwards and the first minimum wins. Candidates are
     * lower-cased with {@link Locale#ROOT} so the lookup does not depend on the default locale.
     *
     * @param i    exclusive end index of the prefix; must be at least 1
     * @param cost minimal costs of all shorter prefixes; {@code cost[j]} is for length {@code j}
     * @param word the chunk being segmented
     * @return the minimal total cost of the prefix and the length of its last word
     */
    private CostTuple<Double, Integer> best_macth(int i, double[] cost, String word) {
        // Always look back at least one character, otherwise an empty dictionary
        // (maxLength == 0) would yield no candidate and backtracking could not advance.
        int window = Math.max(1, this.maxLength);
        int start = Math.max(0, i - window);

        double bestCost = Double.MAX_VALUE;
        int bestLength = 1;
        for (int k = 0; k < i - start; k++) {
            String candidate = word.substring(i - k - 1, i).toLowerCase(Locale.ROOT);
            Double wordCost = _MainDic.get(candidate);
            double total = cost[i - k - 1] + (wordCost == null ? UNKNOWN_WORD_COST : wordCost);
            if (total < bestCost) {
                bestCost = total;
                bestLength = k + 1;
            }
        }
        return new CostTuple<>(bestCost, bestLength);
    }

    /**
     * Segments one chunk into words.
     *
     * <p>The words are produced while walking backwards from the end of the chunk, so the
     * returned list is in reverse order. A token is glued onto the previously produced one when
     * that one is {@code "'s"}, or when the token ends in a digit and the previous one starts
     * with a digit. A lone apostrophe is never glued.
     *
     * @param word chunk of ASCII letters, digits and apostrophes; may be empty
     * @return the words of the chunk, last word first
     */
    private List<String> split(String word) {
        int n = word.length();
        // Local rather than a field, so one instance can serve overlapping calls.
        double[] cost = new double[n + 1];
        int[] lastWordLength = new int[n + 1];
        for (int i = 1; i <= n; i++) {
            CostTuple<Double, Integer> best = best_macth(i, cost, word);
            cost[i] = best.cost;
            lastWordLength[i] = best.wordlength;
        }

        // Backtrack with the lengths recorded above to recover the minimal-cost split.
        List<String> outList = new ArrayList<>();
        int end = n;
        while (end > 0) {
            int len = lastWordLength[end];
            String subword = word.substring(end - len, end);
            boolean newToken = true;
            if (!subword.equals("'") && !outList.isEmpty()) {
                int lastIndex = outList.size() - 1;
                String previous = outList.get(lastIndex);
                boolean digitRun = Character.isDigit(word.charAt(end - 1))
                        && Character.isDigit(previous.charAt(0));
                if (previous.equals("'s") || digitRun) {
                    outList.set(lastIndex, subword + previous);
                    newToken = false;
                }
            }
            if (newToken) {
                outList.add(subword);
            }
            end -= len;
        }
        return outList;
    }

    /**
     * Segments every English chunk of {@code words} and rebuilds the text.
     *
     * <p>Inside a chunk every word is followed by one space. Every non-blank piece (a segmented
     * chunk or a non-English run) is then followed by one more space. The result therefore has
     * two spaces after each chunk and ends with whitespace unless it is empty. Runs that consist
     * of whitespace only are dropped.
     *
     * @param words text to segment
     * @return the text with its English chunks split into words
     * @throws IOException never thrown by this implementation; kept for existing callers
     */
    public String transFormSegmenter(String words) throws IOException {
        List<String> separators = this.getChinese(words);
        String[] enwords = this.getEnglish(words);
        assert separators.size() + 1 == enwords.length;

        // Interleave: chunk 0, separator 0, chunk 1, separator 1, ...
        List<String> resultList = new ArrayList<>(enwords.length + separators.size());
        for (int i = 0; i < enwords.length; i++) {
            resultList.add(enlistToString(split(enwords[i])));
            if (i < separators.size()) {
                resultList.add(separators.get(i));
            }
        }
        return listToString(resultList);
    }

    /**
     * Joins a reversed word list in forward order, each word followed by a space.
     *
     * @param list words, last word first
     * @return the words in reading order, or an empty string for an empty list
     */
    private static String enlistToString(List<String> list) {
        StringBuilder joined = new StringBuilder();
        for (int i = list.size() - 1; i >= 0; i--) {
            String str = list.get(i);
            if (!"".equals(str.trim())) {
                joined.append(str).append(' ');
            }
        }
        return joined.toString();
    }

    /**
     * Concatenates the non-blank elements of {@code list}, each followed by a space.
     *
     * @param list pieces in output order
     * @return the concatenation, or an empty string when every piece is blank
     */
    private static String listToString(List<String> list) {
        StringBuilder joined = new StringBuilder();
        for (String str : list) {
            if (!"".equals(str.trim())) {
                joined.append(str).append(' ');
            }
        }
        return joined.toString();
    }

    /**
     * Pair of a total cost and the length of the last word that achieves it.
     *
     * @param <K> type of the cost
     * @param <V> type of the word length
     */
    static class CostTuple<K, V> {
        private final K cost;
        private final V wordlength;

        public CostTuple(K cost, V wordlength) {
            this.cost = cost;
            this.wordlength = wordlength;
        }
    }

}

整个类的调用入口函数是：transFormSegmenter

乌云风

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
【基于python版本的连续英文分词实现java版本的英文分词器】

在搜索领域，用户的输入是千奇百怪的，有时候会输入连续的英文，导致无召回、没有搜索结果。这个 java 版本的连续英文分词器主要用于解决这类问题，增加搜索召回效果，提升用户体验。
复制链接

扫一扫