在搜索领域,用户的输入千奇百怪。有时用户输入的是中间没有空格的连续英文(例如 microsoftproperty),如果不能有效地进行切分,搜索召回的效果可能会比较差。因此我们需要针对连续英文进行分词,主要有以下几个步骤:
1:定义词典
2:构建英文词典
3:切词
定义词典
这里的词典就是一个文本文件,每行一个单词。单词在文件中的位置越靠前,后面计算出的 cost 越小,切词时越容易被选中。格式如下:
leagues
fossil
microsoft
property
depending
overall
universities
appearance
构建词典
代码如下:
package org.wltea.analyzer.custom;
import org.wltea.analyzer.cfg.Configuration;//这里用了IK里的一个配置文件,主要是获取词典路径,也可以不用,自己定义个路径就行
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
-
自定义的连续英文分词
-
*/
public class ENDictionary {/**
- 英文词典单例
- */
private static ENDictionary singleton;
/**
- 英文词典对象
- */
private Dictionary _ENMainDict;
/**
- 词的最大长度
- */
private Integer maxLength = 0;
private Configuration cfg;
private ENDictionary(Configuration cfg){
this.cfg = cfg;
this.loadMainDict();
}/**
- 词典初始化
- 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
- 只有当Dictionary类被实际调用时,才会开始载入词典,
- 这将延长首次分词操作的时间
- 该方法提供了一个在应用加载阶段就初始化字典的手段
*/
public static void initial(Configuration cfg){
if(singleton == null){
synchronized (ENDictionary.class){
if(singleton == null){
singleton = new ENDictionary(cfg);
}
}
}
}
/**
- 获取词典单子实例
- @return ENDictionary 单例对象
*/
public static ENDictionary getSingleton(){
if(singleton == null){
throw new IllegalStateException(“英文词典尚未初始化,请先调用initial方法”);
}
return singleton;
}
/**
- 批量增加英文词典
- */
private void loadMainDict(){
_ENMainDict = new Hashtable<String,Double>();
List diclist = new ArrayList<>();
InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getENDictionary());
if(is == null){
throw new RuntimeException(“英文词典没有发现!”);
}
try {
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
String theword;
//收集数据
do{
theword = br.readLine();
if(theword!=null && !“”.equals(theword.trim())){
diclist.add(theword);
Integer length = theword.length();
if(maxLength<length){ //获取最大词的长度
maxLength = length;
}
}
}while (theword!=null);
//迭代数据,计算每个词cost,形成字典
fillDictinary(_ENMainDict,diclist);
}catch (IOException ioe){
System.err.println(“英文词典加载异常”);
ioe.printStackTrace();
}finally {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}/**
- 字典填充方法
- */
private void fillDictinary(Dictionary dic,List diclist){
Integer wordCount = diclist.size();
for(int i=0;i<wordCount;i++){
String word = diclist.get(i);
Double cost =cost(i, wordCount);
dic.put(word, cost);
}
}/**
- 计算每个词的损失
- @param i:单词在字典里位置,从0开始
- @param wordCount:词典里总单词数
- @return :返回单词的损失值
- */
private double cost(Integer i,Integer wordCount){
return Math.log((i+1)*Math.log(wordCount));
}
public Dictionary get_ENMainDict() {
return _ENMainDict;
}public Integer getMaxLength() {
return maxLength;
}
}
切词实现
package org.wltea.analyzer.custom;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Splits runs of unspaced English text into dictionary words.
 *
 * <p>The input is cut into alternating "English" chunks (ASCII letters, digits and
 * apostrophes) and "other" chunks (everything else). Each English chunk is segmented by
 * choosing the split with the lowest total word cost from {@link ENDictionary}; the other
 * chunks are passed through unchanged.
 *
 * <p>{@code ENDictionary.initial(...)} must have been called before an instance is created.
 */
public class ENWordSegmenter {

    /** Matches a run of characters that are not ASCII letters, digits or apostrophes. */
    private static final Pattern NON_ENGLISH = Pattern.compile("[^a-zA-Z0-9']+");

    /** Cost charged for a candidate that is not in the dictionary. */
    private static final double UNKNOWN_WORD_COST = 9e99;

    private final ENDictionary dic = ENDictionary.getSingleton();

    /** Word-to-cost table. */
    private final Dictionary<String, Double> _MainDic = dic.get_ENMainDict();

    /** Length of the longest dictionary word; bounds how far back a candidate may reach. */
    private final int maxLength = dic.getMaxLength();

    /**
     * Collects every run of non-English characters (anything other than ASCII letters,
     * digits and apostrophes), in input order.
     */
    private List<String> getChinese(String word) {
        Matcher matcher = NON_ENGLISH.matcher(word);
        List<String> list = new ArrayList<>();
        while (matcher.find()) {
            list.add(matcher.group(0));
        }
        return list;
    }

    /**
     * Returns the English chunks that lie between the non-English runs. Leading and trailing
     * empty chunks are kept (limit -1), so the result always has exactly one more element
     * than {@link #getChinese} returns for the same input.
     */
    private String[] getEnglish(String word) {
        return NON_ENGLISH.split(word, -1);
    }

    /**
     * Finds the cheapest way to end a segmentation at position {@code i}.
     *
     * @param i         end position (exclusive) within the chunk, at least 1
     * @param cost      best total cost for every prefix shorter than {@code i}
     * @param lowerWord the chunk, already lower-cased
     * @return the minimal total cost for the prefix of length {@code i} and the length of
     *         its last word
     */
    private CostTuple bestMatch(int i, double[] cost, String lowerWord) {
        // Always consider at least one character so that an empty dictionary
        // (maxLength == 0) still yields a usable word length.
        int window = Math.min(i, Math.max(1, maxLength));
        double bestCost = Double.MAX_VALUE;
        int bestLength = 1;
        for (int len = 1; len <= window; len++) {
            Double wordCost = _MainDic.get(lowerWord.substring(i - len, i));
            double candidate = cost[i - len] + (wordCost == null ? UNKNOWN_WORD_COST : wordCost);
            if (candidate < bestCost) {
                bestCost = candidate;
                bestLength = len;
            }
        }
        return new CostTuple(bestCost, bestLength);
    }

    /**
     * Segments one English chunk.
     *
     * @param word chunk made only of ASCII letters, digits and apostrophes
     * @return the tokens in REVERSE order (last token first); {@link #enlistToString}
     *         restores the reading order
     */
    private List<String> split(String word) {
        int length = word.length();
        // The chunk is pure ASCII, so lower-casing keeps every index aligned with `word`.
        String lowerWord = word.toLowerCase(Locale.ROOT);

        // Local working array: nothing mutable is shared between calls.
        double[] cost = new double[length + 1];
        for (int i = 1; i <= length; i++) {
            cost[i] = bestMatch(i, cost, lowerWord).cost;
        }

        // Walk back from the end, peeling off the last word of the best segmentation.
        List<String> outList = new ArrayList<>();
        while (length > 0) {
            CostTuple back = bestMatch(length, cost, lowerWord);
            assert back.cost == cost[length];
            String subword = word.substring(length - back.wordLength, length);
            boolean newToken = true;
            // A lone apostrophe is never merged into the following token.
            if (!subword.equals("'") && !outList.isEmpty()) {
                int lastIndex = outList.size() - 1;
                String lastString = outList.get(lastIndex);
                // Re-attach a split-off "'s" and keep adjacent digits together.
                boolean digitRun = Character.isDigit(word.charAt(length - 1))
                        && Character.isDigit(lastString.charAt(0));
                if (lastString.equals("'s") || digitRun) {
                    outList.set(lastIndex, subword + lastString);
                    newToken = false;
                }
            }
            if (newToken) {
                outList.add(subword);
            }
            length -= back.wordLength;
        }
        return outList;
    }

    /**
     * Entry point: segments every English chunk of {@code words} and re-inserts the
     * non-English chunks at their original positions.
     *
     * @param words text to segment; {@code null} is treated as empty
     * @return the pieces in input order, each followed by a space; pieces that are empty or
     *         whitespace-only are dropped
     * @throws IOException never thrown by this implementation; declared for compatibility
     *         with existing callers
     */
    public String transFormSegmenter(String words) throws IOException {
        if (words == null) {
            return "";
        }
        List<String> cnWords = this.getChinese(words);
        String[] enwords = this.getEnglish(words);
        assert cnWords.size() + 1 == enwords.length;

        // Interleave: english[0], other[0], english[1], other[1], ..., english[n].
        List<String> resultList = new ArrayList<>(enwords.length + cnWords.size());
        for (int i = 0; i < enwords.length; i++) {
            resultList.add(enlistToString(split(enwords[i])));
            if (i < cnWords.size()) {
                resultList.add(cnWords.get(i));
            }
        }
        return listToString(resultList);
    }

    /**
     * Joins a reversed token list back into reading order; every non-blank token is followed
     * by one space.
     */
    private static String enlistToString(List<String> list) {
        StringBuilder sb = new StringBuilder();
        for (int i = list.size() - 1; i >= 0; i--) {
            String str = list.get(i);
            if (!"".equals(str.trim())) {
                sb.append(str).append(' ');
            }
        }
        return sb.toString();
    }

    /** Joins the pieces in order; every non-blank piece is followed by one space. */
    private static String listToString(List<String> list) {
        StringBuilder sb = new StringBuilder();
        for (String str : list) {
            if (!"".equals(str.trim())) {
                sb.append(str).append(' ');
            }
        }
        return sb.toString();
    }

    /** Result of one DP step: the minimal total cost and the length of the last word. */
    private static final class CostTuple {
        private final double cost;
        private final int wordLength;

        CostTuple(double cost, int wordLength) {
            this.cost = cost;
            this.wordLength = wordLength;
        }
    }
}
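整个类的调用入口是 transFormSegmenter 方法:传入待切分的字符串,返回以空格分隔的切分结果。创建 ENWordSegmenter 之前需要先调用 ENDictionary.initial(cfg) 完成词典初始化,否则 getSingleton 会抛出 IllegalStateException。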