MMSeg4J改造,使之支持中英文数字连词

package com.qfang.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.LinkedList;
import java.util.Queue;

import com.chenlb.mmseg4j.Chunk;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.Sentence;
import com.chenlb.mmseg4j.Word;

/**
 * Tokenizer adapted from {@link com.chenlb.mmseg4j.MMSeg}.<br>
 * NOTE:<ul>
 * 		<li>1. <b>Not thread-safe</b>: the reader, buffers and read index are plain mutable fields.</li>
 * 		<li>2. Changes how {@link com.chenlb.mmseg4j.MMSeg} treats keywords that mix English letters,
 * 			digits and Chinese characters: a run of ASCII letters, decimal digits and
 * 			{@code OTHER_LETTER} characters is collected into one sentence and handed to the
 * 			{@link Seg} as a whole, so letters and digits no longer act as break characters.</li>
 * 		<li>3. Do not use this class where mixed Chinese/English keywords are not wanted; use the
 * 			original {@link com.chenlb.mmseg4j.MMSeg} instead.</li>
 * 		<li>4. Original author's note: because of the mixed handling, the {@link com.chenlb.mmseg4j.Word}
 * 			instances produced for digits, English and Chinese all carry TYPE_WORD = "word". This has
 * 			not been tried with MMSegAnalyser in Lucene indexing/querying; test it yourself.</li>
 * </ul>
 * NOTE(review): input is consumed one {@code char} at a time through {@link Reader#read()}, so a
 * supplementary code point would arrive as two surrogates and be skipped by the default branch of
 * {@link #next()} - confirm whether such text matters for your data.
 *
 * @author yoara
 */
public class CnEnMMSeg {

	/** Capacity of the push-back buffer placed in front of the input. */
	private static final int PUSHBACK_SIZE = 20;

	/** Initial capacity of the sentence buffer. */
	private static final int SENTENCE_BUFFER_SIZE = 256;

	// The matchers below hold no mutable state, so a single shared instance of each is enough.
	private static final ReadChar MIXED_READER = new ReadCharByAsciiOrDigitOrOther();
	private static final ReadChar RUSSIAN_READER = new ReadCharByRussia();
	private static final ReadChar GREEK_READER = new ReadCharByGreece();
	private static final ReadChar LETTER_NUMBER_READER = new ReadCharByType(new int[] {Character.LETTER_NUMBER});
	private static final ReadChar OTHER_NUMBER_READER = new ReadCharByType(new int[] {Character.OTHER_NUMBER});

	private PushbackReader reader;
	private Seg seg;

	/** Characters of the run currently being collected. */
	private StringBuilder bufSentence = new StringBuilder(SENTENCE_BUFFER_SIZE);
	/** Sentence waiting to be segmented, or {@code null} when there is none. */
	private Sentence currentSentence;
	/** Word cache: one pass can produce several words, which are handed out one per {@link #next()} call. */
	private Queue<Word> bufWord;

	/** Index of the last character read from the input; -1 before anything has been read. */
	private int readedIdx = -1;

	/**
	 * Creates a tokenizer over {@code input}.
	 *
	 * @param input the text to tokenize
	 * @param seg the segmentation algorithm applied to every collected sentence
	 */
	public CnEnMMSeg(Reader input, Seg seg) {
		this.seg = seg;

		reset(input);
	}

	/**
	 * Switches to a new input and discards all buffered state.
	 *
	 * @param input the new text to tokenize
	 */
	public void reset(Reader input) {
		this.reader = new PushbackReader(new BufferedReader(input), PUSHBACK_SIZE);
		currentSentence = null;
		bufWord = new LinkedList<Word>();
		bufSentence.setLength(0);
		readedIdx = -1;
	}

	/**
	 * Reads one character and advances the read index.
	 *
	 * @return the character read, or -1 at end of input (the index is not advanced then)
	 */
	private int readNext() throws IOException {
		int d = reader.read();
		if(d > -1) {
			readedIdx++;
		}
		return d;
	}

	/** Returns one character to the input and moves the read index back accordingly. */
	private void pushBack(int data) throws IOException {
		readedIdx--;
		reader.unread(data);
	}

	/**
	 * Returns the next word of the input.
	 *
	 * @return the next word, or {@code null} when the input is exhausted
	 * @throws IOException if reading from the underlying reader fails
	 */
	public Word next() throws IOException {
		// Serve words left over from a previous pass first.
		Word word = bufWord.poll();
		if(word != null) {
			return word;
		}
		bufSentence.setLength(0);

		int data;
		boolean read = true;
		while(read && (data = readNext()) != -1) {
			// By default one pass collects one run of characters, which is enough to produce words.
			read = false;
			switch(Character.getType(data)) {
			case Character.UPPERCASE_LETTER:
			case Character.LOWERCASE_LETTER:
			case Character.TITLECASE_LETTER:
			case Character.MODIFIER_LETTER:
				/*
				 * 1. 0x410-0x44f -> А-я	// Russian
				 * 2. 0x391-0x3a9 -> Α-Ω	// Greek upper case
				 * 3. 0x3b1-0x3c9 -> α-ω	// Greek lower case
				 */
				data = toAscii(data);
				NationLetter nl = getNation(data);
				if(nl == NationLetter.UNKNOW) {
					// A letter of an unsupported alphabet: skip it and keep reading.
					read = true;
					break;
				}
				bufSentence.appendCodePoint(data);
				switch(nl) {
				case EN:
					// Letters may be followed by digits or Chinese, e.g. VH049PA; segment the whole run.
					readChars(bufSentence, MIXED_READER);
					currentSentence = createSentence(bufSentence);
					break;
				case RA:
					readChars(bufSentence, RUSSIAN_READER);
					bufWord.add(createWord(bufSentence, Word.TYPE_LETTER));
					break;
				case GE:
					readChars(bufSentence, GREEK_READER);
					bufWord.add(createWord(bufSentence, Word.TYPE_LETTER));
					break;
				default:
					break;
				}
				bufSentence.setLength(0);
				break;
			case Character.OTHER_LETTER:
			case Character.DECIMAL_DIGIT_NUMBER:
				/*
				 * 1. 0x3041-0x30f6 -> ぁ-ヶ	// Japanese hiragana / katakana
				 * 2. 0x3105-0x3129 -> ㄅ-ㄩ	// Bopomofo symbols
				 * 3. digits
				 */
				// Normalize a leading full-width digit just like readChars does for the following
				// characters; toAscii leaves every other character unchanged.
				bufSentence.appendCodePoint(toAscii(data));

				readChars(bufSentence, MIXED_READER);
				currentSentence = createSentence(bufSentence);

				bufSentence.setLength(0);
				break;
			case Character.LETTER_NUMBER:
				// e.g. ⅠⅡⅢ: every character becomes its own word.
				bufSentence.appendCodePoint(data);
				readChars(bufSentence, LETTER_NUMBER_READER);

				int startIdx = startIdx(bufSentence);
				for(int i=0; i<bufSentence.length(); i++) {
					bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));
				}

				bufSentence.setLength(0);
				break;
			case Character.OTHER_NUMBER:
				// e.g. ①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇: a consecutive run stays one word.
				bufSentence.appendCodePoint(data);
				readChars(bufSentence, OTHER_NUMBER_READER);

				bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));
				bufSentence.setLength(0);
				break;
			default :
				// Everything else is treated as a separator: skip it and keep reading.
				read = true;
			}
		}

		segmentCurrentSentence();

		return bufWord.poll();
	}

	/** Runs the segmenter over the pending sentence, if any, and queues every resulting word. */
	private void segmentCurrentSentence() {
		if(currentSentence == null) {
			return;
		}
		do {
			Chunk chunk = seg.seg(currentSentence);
			for(int i=0; i<chunk.getCount(); i++) {
				bufWord.add(chunk.getWords()[i]);
			}
		} while (!currentSentence.isFinish());

		currentSentence = null;
	}


	/**
	 * Decides which characters belong to the run currently being read.
	 *
	 * @author chenlb 2009-8-15 21:09:50
	 */
	private static abstract class ReadChar {
		/**
		 * Whether this character is consumed. When it is not, reading stops and no further
		 * character is read.
		 */
		abstract boolean isRead(int codePoint);

		/** Maps a raw character to the form that is tested and stored; identity by default. */
		int transform(int codePoint) {
			return codePoint;
		}
	}

	/**
	 * Reads the following run of accepted characters into {@code bufSentence}.
	 *
	 * @param bufSentence buffer receiving the (transformed) characters
	 * @param readChar decides which characters are accepted
	 * @return the number of characters read
	 * @throws IOException thrown by {@link #readNext()} or {@link #pushBack(int)}
	 */
	private int readChars(StringBuilder bufSentence, ReadChar readChar) throws IOException {
		int num = 0;
		int data;
		while((data = readNext()) != -1) {
			int d = readChar.transform(data);
			if(readChar.isRead(d)) {
				bufSentence.appendCodePoint(d);
				num++;
			} else {
				// Not part of this run: push the untransformed character back for the next step.
				pushBack(data);
				break;
			}
		}
		return num;
	}

	/** Accepts decimal digits; full-width characters are converted to ASCII first. */
	private static class ReadCharDigit extends ReadChar {

		@Override
		boolean isRead(int codePoint) {
			return isDigit(Character.getType(codePoint));
		}

		@Override
		int transform(int codePoint) {
			return toAscii(codePoint);
		}

	}

	/** Accepts ASCII letters or decimal digits. */
	private static class ReadCharByAsciiOrDigit extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			boolean digit = super.isRead(codePoint);
			return isAsciiLetter(codePoint) || digit;
		}
	}

	/** Accepts ASCII letters, decimal digits or {@code OTHER_LETTER} characters. */
	private static class ReadCharByAsciiOrDigitOrOther extends ReadCharByAsciiOrDigit {

		@Override
		boolean isRead(int codePoint) {
			boolean letterOrDigit = super.isRead(codePoint);
			return letterOrDigit || isCJK(Character.getType(codePoint));
		}
	}

	/** Accepts ASCII letters. */
	@SuppressWarnings("unused")
	private static class ReadCharByAscii extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			return isAsciiLetter(codePoint);
		}
	}

	/** Accepts Russian letters. */
	private static class ReadCharByRussia extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			return isRussiaLetter(codePoint);
		}

	}

	/** Accepts Greek letters. */
	private static class ReadCharByGreece extends ReadCharDigit {

		@Override
		boolean isRead(int codePoint) {
			return isGreeceLetter(codePoint);
		}

	}

	/** Accepts characters whose {@link Character#getType(int)} is one of the given types. */
	private static class ReadCharByType extends ReadChar {
		int[] charType;

		public ReadCharByType(int[] charType) {
			this.charType = charType;
		}

		@Override
		boolean isRead(int codePoint) {
			int type = Character.getType(codePoint);
			for(int cType : charType) {
				// Stop at the first match so that every listed type counts, not only the last one.
				if(type == cType) {
					return true;
				}
			}
			return false;
		}

	}

	/** Creates a word from the buffer, positioned where the buffer starts in the text. */
	private Word createWord(StringBuilder bufSentence, String type) {
		return new Word(toChars(bufSentence), startIdx(bufSentence), type);
	}

	/** Creates a word from the buffer with an explicit start position. */
	private Word createWord(StringBuilder bufSentence, int startIdx, String type) {
		return new Word(toChars(bufSentence), startIdx, type);
	}

	/** Creates a sentence from the buffer, positioned where the buffer starts in the text. */
	private Sentence createSentence(StringBuilder bufSentence) {
		return new Sentence(toChars(bufSentence), startIdx(bufSentence));
	}

	/**
	 * Position in the whole text of the first character of {@code bufSentence}, assuming the
	 * buffer holds the characters read most recently.
	 */
	private int startIdx(StringBuilder bufSentence) {
		return readedIdx - bufSentence.length() + 1;
	}

	/** Copies the content of a StringBuilder into a new char[]. */
	private static char[] toChars(StringBuilder bufSentence) {
		char[] chs = new char[bufSentence.length()];
		bufSentence.getChars(0, bufSentence.length(), chs, 0);
		return chs;
	}

	/**
	 * Converts full-width digits and Latin letters to their half-width (ASCII) form; any other
	 * code point is returned unchanged.
	 */
	private static int toAscii(int codePoint) {
		if((codePoint>=65296 && codePoint<=65305)	// full-width 0-9
				|| (codePoint>=65313 && codePoint<=65338)	// full-width A-Z
				|| (codePoint>=65345 && codePoint<=65370)	// full-width a-z
				) {
			codePoint -= 65248;
		}
		return codePoint;
	}

	private static boolean isAsciiLetter(int codePoint) {
		return (codePoint >= 'A' && codePoint <= 'Z') || (codePoint >= 'a' && codePoint <= 'z');
	}

	private static boolean isRussiaLetter(int codePoint) {
		return (codePoint >= 'А' && codePoint <= 'я') || codePoint=='Ё' || codePoint=='ё';
	}

	private static boolean isGreeceLetter(int codePoint) {
		return (codePoint >= 'Α' && codePoint <= 'Ω') || (codePoint >= 'α' && codePoint <= 'ω');
	}

	/**
	 * EN -> English
	 * RA -> Russian
	 * GE -> Greek
	 * UNKNOW -> none of the above
	 */
	private static enum NationLetter {EN, RA, GE, UNKNOW};

	/** Classifies a letter by the alphabet it belongs to. */
	private static NationLetter getNation(int codePoint) {
		if(isAsciiLetter(codePoint)) {
			return NationLetter.EN;
		}
		if(isRussiaLetter(codePoint)) {
			return NationLetter.RA;
		}
		if(isGreeceLetter(codePoint)) {
			return NationLetter.GE;
		}
		return NationLetter.UNKNOW;
	}

	/** True for the {@code OTHER_LETTER} category, which is what this class treats as CJK. */
	private static boolean isCJK(int type) {
		return type == Character.OTHER_LETTER;
	}

	private static boolean isDigit(int type) {
		return type == Character.DECIMAL_DIGIT_NUMBER;
	}

	@SuppressWarnings("unused")
	private static boolean isLetter(int type) {
		return type <= Character.MODIFIER_LETTER && type >= Character.UPPERCASE_LETTER;
	}
}


使用方式:

	/**
	 * Builds the segmenter used to process a piece of text.
	 *
	 * @param text the input text
	 * @param dicPath dictionary file path, used to decide which dictionary is built
	 * @return a {@link CnEnMMSeg} that reads {@code text} through a {@link ComplexSeg}
	 */
	private static CnEnMMSeg makeMMSeg(String text, String dicPath) {
		// The text is wrapped in a reader first, then the dictionary is resolved and handed to
		// ComplexSeg (the original author picked it as the forward-maximum-matching algorithm).
		return new CnEnMMSeg(new StringReader(text), new ComplexSeg(getDictionary(dicPath)));
	}



## What is wechat4j?
A WeChat development framework for Java(微信开发框架JAVA版,最简单易用微信开发框架)

## wechat4j可以用来干什么?
wechat4j是一个帮助你开发微信应用的jar包,使用它,你开发微信公众号应用只需要几秒钟的时间,完全不用关注太细节的东西。

## wechat4j快速开始
可以去下载wechat4j示例项目[wechat4jDemo](https://github.com/repoproject/wechat4jDemo),然后在其基础之上修改即可。如果你要自己搭建,那么使用wechat4j只需要三步就可以搭建微信开发环境。

1. 创建一个web工程,导入jdk和相关的web工程jar包。
2. 下载wechat4j.jar包,下载地址[wechat4j下载](https://github.com/sword-org/wechat4j/releases)。
3. 创建wechat4j配置文件,在src目录下(java根目录)创建wechat4j.properties文件,配置你微信公众号的相关信息。内容如下:

```properties
#your wechat token
wechat.token=token
#wechat appid
wechat.appid=appid
#wechat app secret
wechat.appsecret=secret
```

你也可以在jar包的META-INF目录下找到wechat4j.properties.sample文件,复制到src目录下修改名称即可。wechat4j.properties配置文件的详细配置项意义参见[wechat4j配置文件解读](https://github.com/sword-org/wechat4j/wiki/wechat4j%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6%E8%A7%A3%E8%AF%BB)

通过以上步骤,你的微信工程就完全搭好了。

## wechat4j 运行环境
wechat4j要求的最低java运行环境是jdk1.6

wechat4j.jar的依赖jar包

> * commons-codec.jar 1.3以上
> * commons-lang3.jar
> * log4j.jar 1.2以上
> * fastjson-1.2.0.jar
> * sword-lang-1.2 (https://github.com/sword-org/sword-lang/releases)
> * fluent-hc-4.3.6.jar(httpclient依赖)
> * httpclient-4.3.6.jar
> * httpcore-4.3.3.jar (httpclient依赖)
> * servlet-api.jar 如果你是web工程,导入支持web工程的包就会包括,例如tomcat包

你可以去集中下载这些jar包的集合[wechat4j所需jar下载](http://files.cnblogs.com/chengn/wechat4j-lib.rar),也可以去maven库或者对应jar包的项目官网下载.
## 开发自己的微信应用
wechat4j开发环境搭好之后,就可以开始开发自己的微信应用了。比如我有一个微信号的token是lejian,下面就以她为例子来说明。

### 创建自己公众号服务类
创建自己的微信公众号服务类,需要继承wechat4j的WechatSupport类,然后实现其抽象方法即可,下面以文本消息处理为例子

```java
public class Lejian extends WechatSupport{

	public Lejian(HttpServletRequest request) {
		super(request);
	}

	@Override
	protected void onText() {
		this.wechatRequest.getFromUserName();
		String content = "test ok";
		responseText(content);
	}
}
```

上面代码中的``onText()``是WechatSupport的抽象方法,需要你自己的类来实现,表示对文本消息的处理,示例中是接收到用户的消息之后,返回给用户“test ok”文本消息。

### 创建微信服务地址
创建微信服务地址(微信公众平台中配置的自己服务器地址)servlet类。如果是springmvc则创建对应的controller,如果是struts则创建对应的action类。servlet类示例如下:

```java
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
	Lejian lejian = new Lejian(request);
	String result = lejian.execute();
	response.getOutputStream().write(result.getBytes());
}

protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
	Lejian lejian = new Lejian(request);
	String result = lejian.execute();
	response.getOutputStream().write(result.getBytes());
}
```

通过上面两步你的微信服务就可以运行了

## 如何得到微信的请求参数
继承了``WechatSupport``类之后,你可以通过``wechatRequest.getFromUserName()``类似的方法来得到微信服务器请求的参数。详细信息请阅读[微信请求参数](https://github.com/sword-org/wechat4j/wiki/%E5%BE%97%E5%88%B0%E5%BE%AE%E4%BF%A1%E8%AF%B7%E6%B1%82%E5%8F%82%E6%95%B0)

## 如何设置响应参数
继承了``WechatSupport``类之后,你可以通过``wechatResponse.setFromUserName(fromUser)``类似的方法来设置给微信服务器的响应参数。详细信息请阅读[响应微信服务器参数](https://github.com/sword-org/wechat4j/wiki/%E8%AE%BE%E7%BD%AE%E5%93%8D%E5%BA%94%E5%BE%AE%E4%BF%A1%E5%8F%82%E6%95%B0)

## 如何响应用户信息
以文本信息为例,响应文本信息只需要在你的``onText``方法中使用``responseText(content)``即可(参见上面的代码例子)

## wechat4j示例项目
* [wechat4jDemo](https://github.com/repoproject/wechat4jDemo)

如果你有好的demo项目,请邮件或者修改本文件然后pull request给我,我会列在上面。

## 技术支持
* [wechat4j开发者文档中心](http://www.chengn.com/wechat4j/)
* [wechat4j开发文档](https://github.com/sword-org/wechat4j/wiki)
* wechat4j技术交流QQ群 **423134346**
* 支持邮件 sword_org@163.com
* wechat4j暂无论坛,欢迎开通论坛交流版块,如果开通请邮件,我会添加到这里。

## 贡献代码
1. 如果你觉得本项目不错,希望你能够点击一下右上角的star
2. 如果你希望参与改进本项目,那么请点击右上角的fork,修改之后pull request即可。如果你的贡献不错,你就会收到加入[sword](https://github.com/sword-org)开源社区的邀请。
3. 如果你发现了一个bug,请你创建一个issue来报告。

非常非常欢迎你能够参与本项目的建设,每人做出一点点贡献,对整个项目来说就是一个非常大的贡献,希望集合众人的力量,让项目走的更好,能够为更多的人服务。

### 贡献者列表
* [@chengn](https://github.com/chengn)
* [@truecn](https://github.com/truecn)
* [@Zhangys-hh](https://github.com/Zhangys-hh)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值