MMSeg4J改造，使之支持中英文数字连词

最新推荐文章于 2021-02-19 04:36:48 发布

Yoara

最新推荐文章于 2021-02-19 04:36:48 发布

阅读量2.9k

点赞数

分类专栏：其他　文章标签：mmseg4j lucene 分词 中英文

本文链接：https://blog.csdn.net/Yoara/article/details/41310535

版权

其他专栏收录该内容

11 篇文章 0 订阅

订阅专栏

package com.qfang.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.Queue;

import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Sentence;
import com.chenlb.mmseg4j.Word;

/**
 * Tokenizer adapted from {@link com.chenlb.mmseg4j.MMSeg} so that Chinese, English and digit
 * characters may be mixed inside one keyword.
 *
 * <p>Notes carried over from the original author:
 * <ul>
 *   <li><b>Not thread-safe.</b></li>
 *   <li>The handling of mixed English / digit / Chinese keywords differs from
 *       {@link com.chenlb.mmseg4j.MMSeg}: ASCII letters and digits no longer act as break
 *       characters, they are collected into the same sentence as the surrounding CJK text.</li>
 *   <li>Do not use this class where mixed keywords are not wanted; use the original
 *       {@link com.chenlb.mmseg4j.MMSeg} instead.</li>
 *   <li>Because of the mixed handling, the original author states that the
 *       {@link com.chenlb.mmseg4j.Word} instances produced for digits, English and Chinese all
 *       carry the type {@code TYPE_WORD = "word"}; this was not verified against a Lucene index
 *       through MMSegAnalyser, so test before relying on it.</li>
 * </ul>
 *
 * @author yoara
 */
public class CnEnMMSeg {

	/** Offset between a full-width ASCII variant (e.g. U+FF21) and its half-width form. */
	private static final int FULL_WIDTH_OFFSET = 65248;

	// The readers below hold no mutable state, so one shared instance of each is enough.
	/** Accepts ASCII letters, decimal digits and OTHER_LETTER (CJK etc.) characters. */
	private static final ReadChar MIXED_CHARS = new ReadCharByAsciiOrDigitOrOther();
	/** Accepts Russian letters. */
	private static final ReadChar RUSSIAN_CHARS = new ReadCharByRussia();
	/** Accepts Greek letters. */
	private static final ReadChar GREEK_CHARS = new ReadCharByGreece();
	/** Accepts characters of type LETTER_NUMBER. */
	private static final ReadChar LETTER_NUMBER_CHARS =
			new ReadCharByType(new int[] {Character.LETTER_NUMBER});
	/** Accepts characters of type OTHER_NUMBER. */
	private static final ReadChar OTHER_NUMBER_CHARS =
			new ReadCharByType(new int[] {Character.OTHER_NUMBER});

	private PushbackReader reader;
	private Seg seg;

	/** Scratch buffer holding the run of characters currently being collected. */
	private StringBuilder bufSentence = new StringBuilder(256);
	/** Sentence waiting to be segmented by {@link #seg}; {@code null} when there is none. */
	private Sentence currentSentence;
	/** Word cache: one pass may produce several words, which are handed out one at a time. */
	private Queue<Word> bufWord;

	/**
	 * Creates a tokenizer reading from {@code input} and segmenting with {@code seg}.
	 *
	 * @param input the text source
	 * @param seg the segmentation algorithm applied to each collected sentence
	 */
	public CnEnMMSeg(Reader input, Seg seg) {
		this.seg = seg;

		reset(input);
	}

	/** Index, within the whole input, of the most recently read character; -1 before any read. */
	private int readedIdx = 0;

	/**
	 * Discards all buffered state and starts reading from a new source.
	 *
	 * @param input the new text source
	 */
	public void reset(Reader input) {
		this.reader = new PushbackReader(new BufferedReader(input), 20);
		currentSentence = null;
		bufWord = new LinkedList<Word>();
		bufSentence.setLength(0);
		readedIdx = -1;
	}

	/**
	 * Reads one character and advances {@link #readedIdx}.
	 *
	 * @return the character read, or -1 at end of input
	 * @throws IOException if the underlying reader fails
	 */
	private int readNext() throws IOException {
		int d = reader.read();
		if(d > -1) {
			readedIdx++;
		}
		return d;
	}

	/**
	 * Returns one character to the reader and moves {@link #readedIdx} back accordingly.
	 *
	 * @param data the character to push back
	 * @throws IOException if the underlying reader rejects the push-back
	 */
	private void pushBack(int data) throws IOException {
		readedIdx--;
		reader.unread(data);
	}

	/**
	 * Returns the next word of the input.
	 *
	 * <p>Cached words are served first. When the cache is empty, characters are skipped until a
	 * supported run starts; that run is collected and turned into words, and the first of them
	 * is returned.
	 *
	 * @return the next word, or {@code null} when no further word could be produced
	 * @throws IOException if reading the input fails
	 */
	public Word next() throws IOException {
		// Serve from the cache first.
		Word word = bufWord.poll();
		if(word != null) {
			return word;
		}

		bufSentence.setLength(0);

		int data;
		boolean skipped = true;
		// One handled run is enough to produce words; keep reading only while characters are skipped.
		while(skipped && (data = readNext()) != -1) {
			skipped = !consume(data);
		}

		segmentCurrentSentence();

		return bufWord.poll();
	}

	/**
	 * Handles the run of characters that starts with {@code data}.
	 *
	 * @param data the first character of the run, already read
	 * @return {@code true} if a run was collected, {@code false} if the character was skipped
	 * @throws IOException if reading the input fails
	 */
	private boolean consume(int data) throws IOException {
		switch(Character.getType(data)) {
		case Character.UPPERCASE_LETTER:
		case Character.LOWERCASE_LETTER:
		case Character.TITLECASE_LETTER:
		case Character.MODIFIER_LETTER:
			/*
			 * 1. 0x410-0x44f -> Russian letters
			 * 2. 0x391-0x3a9 -> Greek upper case
			 * 3. 0x3b1-0x3c9 -> Greek lower case
			 */
			return consumeLetters(toAscii(data));
		case Character.OTHER_LETTER:
		case Character.DECIMAL_DIGIT_NUMBER:
			/*
			 * 1. 0x3041-0x30f6 -> Japanese hiragana / katakana
			 * 2. 0x3105-0x3129 -> Bopomofo symbols
			 * 3. digits
			 */
			// toAscii keeps a leading full-width digit consistent with the digits that follow,
			// which readChars normalises; it leaves OTHER_LETTER characters untouched.
			bufSentence.appendCodePoint(toAscii(data));
			readChars(bufSentence, MIXED_CHARS);
			currentSentence = createSentence(bufSentence);
			bufSentence.setLength(0);
			return true;
		case Character.LETTER_NUMBER:
			// Letter numbers (e.g. Roman numerals) become one word per character.
			bufSentence.appendCodePoint(data);
			readChars(bufSentence, LETTER_NUMBER_CHARS);

			int startIdx = startIdx(bufSentence);
			for(int i = 0; i < bufSentence.length(); i++) {
				bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));
			}

			bufSentence.setLength(0);
			return true;
		case Character.OTHER_NUMBER:
			// Other numbers (circled, parenthesised, ...) stay together as one word.
			bufSentence.appendCodePoint(data);
			readChars(bufSentence, OTHER_NUMBER_CHARS);

			bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));
			bufSentence.setLength(0);
			return true;
		default:
			// Everything else is treated as an invalid character and skipped.
			return false;
		}
	}

	/**
	 * Handles a run starting with a cased or modifier letter.
	 *
	 * <p>An English letter starts a mixed sentence (letters, digits and CJK, e.g. VH049PA).
	 * Russian and Greek runs are emitted directly as one word of type {@code TYPE_LETTER}.
	 *
	 * @param codePoint the first letter, already converted with {@link #toAscii(int)}
	 * @return {@code true} if a run was collected, {@code false} if the letter is of an unknown
	 *         alphabet and was skipped
	 * @throws IOException if reading the input fails
	 */
	private boolean consumeLetters(int codePoint) throws IOException {
		switch(getNation(codePoint)) {
		case EN:
			bufSentence.appendCodePoint(codePoint);
			readChars(bufSentence, MIXED_CHARS);
			currentSentence = createSentence(bufSentence);
			break;
		case RA:
			bufSentence.appendCodePoint(codePoint);
			readChars(bufSentence, RUSSIAN_CHARS);
			bufWord.add(createWord(bufSentence, Word.TYPE_LETTER));
			break;
		case GE:
			bufSentence.appendCodePoint(codePoint);
			readChars(bufSentence, GREEK_CHARS);
			bufWord.add(createWord(bufSentence, Word.TYPE_LETTER));
			break;
		default:
			return false;
		}
		bufSentence.setLength(0);
		return true;
	}

	/**
	 * Runs {@link #seg} over the pending sentence until it is finished, caching every word of
	 * every chunk, then clears the pending sentence. Does nothing when no sentence is pending.
	 */
	private void segmentCurrentSentence() {
		if(currentSentence == null) {
			return;
		}
		do {
			Chunk chunk = seg.seg(currentSentence);
			for(int i = 0; i < chunk.getCount(); i++) {
				bufWord.add(chunk.getWords()[i]);
			}
		} while(!currentSentence.isFinish());

		currentSentence = null;
	}


	/**
	 * Decides which characters belong to the run currently being read.
	 *
	 * @author chenlb 2009-8-15 21:09:50
	 */
	private static abstract class ReadChar {
		/**
		 * Whether this character is part of the run. When it is not, reading stops and the
		 * character is pushed back.
		 */
		abstract boolean isRead(int codePoint);

		/** Converts a character before it is tested and stored; identity by default. */
		int transform(int codePoint) {
			return codePoint;
		}
	}

	/**
	 * Reads the following run of accepted characters into {@code bufSentence}.
	 *
	 * @param bufSentence buffer receiving the (transformed) characters
	 * @param readChar decides which characters are accepted
	 * @return the number of characters appended
	 * @throws IOException thrown by {@link #readNext()} or {@link #pushBack(int)}
	 */
	private int readChars(StringBuilder bufSentence, ReadChar readChar) throws IOException {
		int num = 0;
		int data;
		while((data = readNext()) != -1) {
			int d = readChar.transform(data);
			if(!readChar.isRead(d)) {
				// Not part of this run: push the untransformed character back for the next step.
				pushBack(data);
				break;
			}
			bufSentence.appendCodePoint(d);
			num++;
		}
		return num;
	}

	/** Accepts decimal digits; full-width ASCII variants are converted to half-width first. */
	private static class ReadCharDigit extends ReadChar {

		@Override
		boolean isRead(int codePoint) {
			return isDigit(Character.getType(codePoint));
		}

		@Override
		int transform(int codePoint) {
			return toAscii(codePoint);
		}

	}

	/** Accepts ASCII letters or decimal digits. */
	private static class ReadCharByAsciiOrDigit extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			boolean digit = super.isRead(codePoint);
			return isAsciiLetter(codePoint) || digit;
		}
	}

	/** Accepts ASCII letters, decimal digits or OTHER_LETTER characters. */
	private static class ReadCharByAsciiOrDigitOrOther extends ReadCharByAsciiOrDigit {

		@Override
		boolean isRead(int codePoint) {
			boolean letterOrDigit = super.isRead(codePoint);
			return letterOrDigit || isCJK(Character.getType(codePoint));
		}
	}

	/** Accepts ASCII letters only. */
	@SuppressWarnings("unused")
	private static class ReadCharByAscii extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			return isAsciiLetter(codePoint);
		}
	}

	/** Accepts Russian letters. */
	private static class ReadCharByRussia extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			return isRussiaLetter(codePoint);
		}

	}

	/** Accepts Greek letters. */
	private static class ReadCharByGreece extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			return isGreeceLetter(codePoint);
		}

	}

	/** Accepts characters whose {@link Character#getType(int)} is one of a given set. */
	private static class ReadCharByType extends ReadChar {
		private final int[] charType;

		ReadCharByType(int[] charType) {
			this.charType = charType;
		}

		@Override
		boolean isRead(int codePoint) {
			int type = Character.getType(codePoint);
			// Accept on the first match; every listed type counts, not only the last one.
			for(int cType : charType) {
				if(type == cType) {
					return true;
				}
			}
			return false;
		}

	}

	/** Builds a word from the buffer, positioned at the buffer's start index in the input. */
	private Word createWord(StringBuilder bufSentence, String type) {
		return new Word(toChars(bufSentence), startIdx(bufSentence), type);
	}

	/** Builds a sentence from the buffer, positioned at the buffer's start index in the input. */
	private Sentence createSentence(StringBuilder bufSentence) {
		return new Sentence(toChars(bufSentence), startIdx(bufSentence));
	}

	/** Position, within the whole input, of the first character held in {@code bufSentence}. */
	private int startIdx(StringBuilder bufSentence) {
		return readedIdx - bufSentence.length() + 1;
	}

	/** Copies the content of a {@link StringBuilder} into a new {@code char[]}. */
	private static char[] toChars(StringBuilder bufSentence) {
		char[] chs = new char[bufSentence.length()];
		bufSentence.getChars(0, bufSentence.length(), chs, 0);
		return chs;
	}

	/**
	 * Converts full-width digits and Latin letters to their half-width ASCII form; any other
	 * code point is returned unchanged.
	 */
	private static int toAscii(int codePoint) {
		boolean fullWidthDigit = codePoint >= 65296 && codePoint <= 65305;	// full-width 0-9
		boolean fullWidthUpper = codePoint >= 65313 && codePoint <= 65338;	// full-width A-Z
		boolean fullWidthLower = codePoint >= 65345 && codePoint <= 65370;	// full-width a-z
		if(fullWidthDigit || fullWidthUpper || fullWidthLower) {
			return codePoint - FULL_WIDTH_OFFSET;
		}
		return codePoint;
	}

	private static boolean isAsciiLetter(int codePoint) {
		return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
	}

	private static boolean isRussiaLetter(int codePoint) {
		return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё';
	}

	private static boolean isGreeceLetter(int codePoint) {
		return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
	}

	/**
	 * Alphabet of a letter.
	 * EN -> English
	 * RA -> Russian
	 * GE -> Greek
	 */
	private enum NationLetter {EN, RA, GE, UNKNOW}

	/** Classifies a letter by alphabet; anything not recognised is {@code UNKNOW}. */
	private NationLetter getNation(int codePoint) {
		if(isAsciiLetter(codePoint)) {
			return NationLetter.EN;
		}
		if(isRussiaLetter(codePoint)) {
			return NationLetter.RA;
		}
		if(isGreeceLetter(codePoint)) {
			return NationLetter.GE;
		}
		return NationLetter.UNKNOW;
	}

	/** True for the character type OTHER_LETTER, which this class treats as CJK. */
	private static boolean isCJK(int type) {
		return type == Character.OTHER_LETTER;
	}

	private static boolean isDigit(int type) {
		return type == Character.DECIMAL_DIGIT_NUMBER;
	}

	@SuppressWarnings("unused")
	private static boolean isLetter(int type) {
		return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
	}
}

使用方式：

	/**
	 * Builds the segmenter used to process a piece of text.
	 *
	 * @param text the input text to segment
	 * @param dicPath file path handed to {@code getDictionary}; per the original note it is
	 *        used to decide which kind of dictionary gets built
	 * @return a {@link CnEnMMSeg} reading {@code text} and segmenting with a {@code ComplexSeg}
	 */
	private static CnEnMMSeg makeMMSeg(String text, String dicPath) {
		// Wrap the text in a reader first, then resolve the dictionary.
		final StringReader source = new StringReader(text);
		// Original note: forward maximum matching is the chosen algorithm.
		final Seg segmenter = new ComplexSeg(getDictionary(dicPath));
		return new CnEnMMSeg(source, segmenter);
	}