package com.qfang.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.Queue;
import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Sentence;
import com.chenlb.mmseg4j.Word;
/**
 * Tokenizer adapted from {@link com.chenlb.mmseg4j.MMSeg}.<br>
 * NOTE:<ul>
 * <li>1. <b>Not thread-safe.</b></li>
 * <li>2. Changes how {@link com.chenlb.mmseg4j.MMSeg} handles keywords that mix English letters,
 * digits and Chinese characters: such mixed text is collected into one sentence, so English
 * letters and digits no longer act as break characters.</li>
 * <li>3. Do not use this class where mixed Chinese/English keywords are not needed; use the
 * original {@link com.chenlb.mmseg4j.MMSeg} instead.</li>
 * <li>4. Per the original author: because of the mixed handling, the {@link com.chenlb.mmseg4j.Word}
 * instances produced for digits, English and Chinese all carry the type TYPE_WORD = "word".
 * This has not been tried with MMSegAnalyser in Lucene index queries; test it yourself.</li>
 * </ul>
 *
 * @author yoara
 */
public class CnEnMMSeg {

    /** Pushback capacity handed to the {@link PushbackReader}. */
    private static final int PUSHBACK_SIZE = 20;

    // The ReadChar strategies are stateless, so one shared instance of each is enough.
    /** Accepts ASCII letters, decimal digits and OTHER_LETTER (e.g. CJK) characters. */
    private static final ReadChar MIXED_READER = new ReadCharByAsciiOrDigitOrOther();
    /** Accepts Russian letters. */
    private static final ReadChar RUSSIA_READER = new ReadCharByRussia();
    /** Accepts Greek letters. */
    private static final ReadChar GREECE_READER = new ReadCharByGreece();
    /** Accepts LETTER_NUMBER characters (e.g. Roman numerals). */
    private static final ReadChar LETTER_NUMBER_READER =
            new ReadCharByType(new int[] {Character.LETTER_NUMBER});
    /** Accepts OTHER_NUMBER characters (e.g. circled digits). */
    private static final ReadChar OTHER_NUMBER_READER =
            new ReadCharByType(new int[] {Character.OTHER_NUMBER});

    private PushbackReader reader;
    private Seg seg;
    /** Scratch buffer collecting the characters of the run currently being read. */
    private StringBuilder bufSentence = new StringBuilder(256);
    /** Sentence waiting to be segmented by {@link #seg}; null when there is none. */
    private Sentence currentSentence;
    /** Word buffer; needed because one pass over the input can yield several words. */
    private Queue<Word> bufWord;
    /** Index of the most recently read char in the input; -1 before anything has been read. */
    private int readedIdx = -1;

    /**
     * Creates a tokenizer over {@code input} using {@code seg} as the segmentation algorithm.
     *
     * @param input text source
     * @param seg   segmentation algorithm applied to every collected sentence
     */
    public CnEnMMSeg(Reader input, Seg seg) {
        this.seg = seg;
        reset(input);
    }

    /**
     * Points this tokenizer at a new input and clears all buffered state.
     *
     * @param input new text source
     */
    public void reset(Reader input) {
        this.reader = new PushbackReader(new BufferedReader(input), PUSHBACK_SIZE);
        currentSentence = null;
        bufWord = new LinkedList<Word>();
        bufSentence.setLength(0);
        readedIdx = -1;
    }

    /** Reads one char and advances {@link #readedIdx} unless the end of input was reached. */
    private int readNext() throws IOException {
        int d = reader.read();
        if (d > -1) {
            readedIdx++;
        }
        return d;
    }

    /** Un-reads {@code data} and moves {@link #readedIdx} back by one. */
    private void pushBack(int data) throws IOException {
        readedIdx--;
        reader.unread(data);
    }

    /**
     * Returns the next word of the input.
     *
     * @return the next word, or {@code null} when the input is exhausted
     * @throws IOException if reading from the underlying reader fails
     */
    public Word next() throws IOException {
        // Serve buffered words first.
        Word word = bufWord.poll();
        if (word != null) {
            return word;
        }

        bufSentence.setLength(0);
        int data;
        boolean read = true;
        while (read && (data = readNext()) != -1) {
            // By default one pass reads a whole run of same-class characters,
            // which is enough content to segment; skipped characters set this back to true.
            read = false;
            switch (Character.getType(data)) {
            case Character.UPPERCASE_LETTER:
            case Character.LOWERCASE_LETTER:
            case Character.TITLECASE_LETTER:
            case Character.MODIFIER_LETTER: {
                /*
                 * 1. 0x410-0x44f -> Russian letters
                 * 2. 0x391-0x3a9 -> Greek upper case
                 * 3. 0x3b1-0x3c9 -> Greek lower case
                 */
                data = toAscii(data);
                NationLetter nation = getNation(data);
                if (nation == NationLetter.UNKNOW) {
                    // Letter of an unsupported alphabet: skip it and keep reading.
                    read = true;
                    break;
                }
                bufSentence.appendCodePoint(data);
                switch (nation) {
                case EN:
                    // Letters may be followed by digits (e.g. VH049PA) or CJK text;
                    // the whole mixed run goes to the segmenter as one sentence.
                    readChars(bufSentence, MIXED_READER);
                    currentSentence = createSentence(bufSentence);
                    break;
                case RA:
                    readChars(bufSentence, RUSSIA_READER);
                    bufWord.add(createWord(bufSentence, Word.TYPE_LETTER));
                    break;
                case GE:
                    readChars(bufSentence, GREECE_READER);
                    bufWord.add(createWord(bufSentence, Word.TYPE_LETTER));
                    break;
                default:
                    break;
                }
                bufSentence.setLength(0);
                break;
            }
            case Character.OTHER_LETTER:
            case Character.DECIMAL_DIGIT_NUMBER:
                /*
                 * 1. 0x3041-0x30f6 -> Japanese hiragana / katakana
                 * 2. 0x3105-0x3129 -> Bopomofo (Zhuyin) symbols
                 * 3. digits
                 */
                // Normalize the first char as well: readChars() converts the following
                // full-width digits via transform(), so the leading one must match.
                bufSentence.appendCodePoint(toAscii(data));
                readChars(bufSentence, MIXED_READER);
                currentSentence = createSentence(bufSentence);
                bufSentence.setLength(0);
                break;
            case Character.LETTER_NUMBER: {
                // Roman-numeral style characters: every character becomes its own word.
                bufSentence.appendCodePoint(data);
                readChars(bufSentence, LETTER_NUMBER_READER);
                int offset = startIdx(bufSentence);
                for (int i = 0; i < bufSentence.length(); i++) {
                    bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, offset++,
                            Word.TYPE_LETTER_NUMBER));
                }
                bufSentence.setLength(0); // clear the buffered characters
                break;
            }
            case Character.OTHER_NUMBER:
                // Circled / parenthesized numbers: a consecutive run forms a single word.
                bufSentence.appendCodePoint(data);
                readChars(bufSentence, OTHER_NUMBER_READER);
                bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));
                bufSentence.setLength(0);
                break;
            default:
                // Everything else is treated as an invalid (separator) character.
                read = true;
            }
        }

        // Segment the collected sentence, buffering every word of every chunk.
        if (currentSentence != null) {
            do {
                Chunk chunk = seg.seg(currentSentence);
                for (int i = 0; i < chunk.getCount(); i++) {
                    bufWord.add(chunk.getWords()[i]);
                }
            } while (!currentSentence.isFinish());
            currentSentence = null;
        }
        return bufWord.poll();
    }

    /**
     * Strategy deciding which characters belong to the run being read.
     *
     * @author chenlb 2009-8-15 21:09:50
     */
    private static abstract class ReadChar {
        /**
         * Whether this character is consumed; when it is not, reading of the run stops.
         */
        abstract boolean isRead(int codePoint);

        /** Maps a raw character before it is tested and stored; identity by default. */
        int transform(int codePoint) {
            return codePoint;
        }
    }

    /**
     * Reads the following run of accepted characters into {@code bufSentence}.
     *
     * @param bufSentence buffer receiving the (transformed) characters
     * @param readChar    strategy deciding which characters are accepted
     * @return number of characters read
     * @throws IOException thrown by {@link #readNext()} or {@link #pushBack(int)}
     */
    private int readChars(StringBuilder bufSentence, ReadChar readChar) throws IOException {
        int num = 0;
        int data;
        while ((data = readNext()) != -1) {
            int d = readChar.transform(data);
            if (!readChar.isRead(d)) {
                // Not part of this run: push the raw char back for the next step.
                pushBack(data);
                break;
            }
            bufSentence.appendCodePoint(d);
            num++;
        }
        return num;
    }

    /** Reads decimal digits; full-width characters are converted to ASCII first. */
    private static class ReadCharDigit extends ReadChar {
        @Override
        boolean isRead(int codePoint) {
            return isDigit(Character.getType(codePoint));
        }

        @Override
        int transform(int codePoint) {
            return toAscii(codePoint);
        }
    }

    /** Reads ASCII letters or decimal digits. */
    private static class ReadCharByAsciiOrDigit extends ReadCharDigit {
        @Override
        boolean isRead(int codePoint) {
            boolean digit = super.isRead(codePoint);
            return isAsciiLetter(codePoint) || digit;
        }
    }

    /** Reads ASCII letters, decimal digits or OTHER_LETTER characters. */
    private static class ReadCharByAsciiOrDigitOrOther extends ReadCharByAsciiOrDigit {
        @Override
        boolean isRead(int codePoint) {
            return super.isRead(codePoint) || isCJK(Character.getType(codePoint));
        }
    }

    /** Reads ASCII letters. */
    @SuppressWarnings("unused")
    private static class ReadCharByAscii extends ReadCharDigit {
        @Override
        boolean isRead(int codePoint) {
            return isAsciiLetter(codePoint);
        }
    }

    /** Reads Russian letters. */
    private static class ReadCharByRussia extends ReadCharDigit {
        @Override
        boolean isRead(int codePoint) {
            return isRussiaLetter(codePoint);
        }
    }

    /** Reads Greek letters. */
    private static class ReadCharByGreece extends ReadCharDigit {
        @Override
        boolean isRead(int codePoint) {
            return isGreeceLetter(codePoint);
        }
    }

    /** Reads characters whose {@link Character#getType(int)} is one of a given set of types. */
    private static class ReadCharByType extends ReadChar {
        int[] charType;

        public ReadCharByType(int[] charType) {
            this.charType = charType;
        }

        @Override
        boolean isRead(int codePoint) {
            int type = Character.getType(codePoint);
            // Accept as soon as ANY configured type matches; the result must not be
            // overwritten by later entries of the array.
            for (int cType : charType) {
                if (type == cType) {
                    return true;
                }
            }
            return false;
        }
    }

    /** Creates a word from the buffer, deriving its start offset from the read position. */
    private Word createWord(StringBuilder bufSentence, String type) {
        return new Word(toChars(bufSentence), startIdx(bufSentence), type);
    }

    /** Creates a word from the buffer with an explicit start offset. */
    private Word createWord(StringBuilder bufSentence, int startIdx, String type) {
        return new Word(toChars(bufSentence), startIdx, type);
    }

    /** Creates a sentence from the buffer, deriving its start offset from the read position. */
    private Sentence createSentence(StringBuilder bufSentence) {
        return new Sentence(toChars(bufSentence), startIdx(bufSentence));
    }

    /** Returns the position of the first character of {@code bufSentence} within the whole text. */
    private int startIdx(StringBuilder bufSentence) {
        return readedIdx - bufSentence.length() + 1;
    }

    /** Copies the content of the StringBuilder into a new char[]. */
    private static char[] toChars(StringBuilder bufSentence) {
        char[] chs = new char[bufSentence.length()];
        bufSentence.getChars(0, bufSentence.length(), chs, 0);
        return chs;
    }

    /**
     * Converts full-width digits and Latin letters to their half-width (ASCII) form;
     * every other code point is returned unchanged.
     */
    private static int toAscii(int codePoint) {
        if ((codePoint >= 65296 && codePoint <= 65305)      // full-width 0-9
                || (codePoint >= 65313 && codePoint <= 65338)   // full-width A-Z
                || (codePoint >= 65345 && codePoint <= 65370)   // full-width a-z
        ) {
            codePoint -= 65248;
        }
        return codePoint;
    }

    private static boolean isAsciiLetter(int codePoint) {
        return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
    }

    private static boolean isRussiaLetter(int codePoint) {
        return (codePoint >= 'А' && codePoint <= 'я') || codePoint == 'Ё' || codePoint == 'ё';
    }

    private static boolean isGreeceLetter(int codePoint) {
        return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
    }

    /**
     * EN -> English
     * RA -> Russian
     * GE -> Greek
     * UNKNOW -> none of the above
     */
    private static enum NationLetter {EN, RA, GE, UNKNOW};

    /** Classifies a letter by the alphabet it belongs to. */
    private static NationLetter getNation(int codePoint) {
        if (isAsciiLetter(codePoint)) {
            return NationLetter.EN;
        }
        if (isRussiaLetter(codePoint)) {
            return NationLetter.RA;
        }
        if (isGreeceLetter(codePoint)) {
            return NationLetter.GE;
        }
        return NationLetter.UNKNOW;
    }

    /** True for the OTHER_LETTER category, which this class treats as CJK text. */
    private static boolean isCJK(int type) {
        return type == Character.OTHER_LETTER;
    }

    private static boolean isDigit(int type) {
        return type == Character.DECIMAL_DIGIT_NUMBER;
    }

    @SuppressWarnings("unused")
    private static boolean isLetter(int type) {
        return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
    }
}
// Usage (使用方式): example helper that builds a CnEnMMSeg; it belongs in the calling class.
/**
 * Builds the segmentation processor for a piece of text.
 *
 * @param text    input text to segment
 * @param dicPath file path; per the original note it determines which kind of
 *                dictionary is built (resolved by getDictionary, which is not shown here)
 * @return a new {@code CnEnMMSeg} reading {@code text} with a {@code ComplexSeg}
 *         over that dictionary
 */
private static CnEnMMSeg makeMMSeg(String text, String dicPath) {
    // Arguments are evaluated left to right: the text is wrapped in a reader first,
    // then the dictionary is resolved and handed to ComplexSeg (the original comment
    // describes this choice as forward maximum matching).
    return new CnEnMMSeg(new StringReader(text), new ComplexSeg(getDictionary(dicPath)));
}