一个简单最大正向匹配（Maximum Matching）MM中文分词算法的实现

最新推荐文章于 2022-11-20 14:33:51 发布

原创最新推荐文章于 2022-11-20 14:33:51 发布 · 4.4k 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#算法 #string #c #null #hashmap #dictionary

中文分词专栏收录该内容

2 篇文章

订阅专栏

1.构建词典内存树的TrieNode节点类：

package cn.wzb.segmenter.mm.bean;

import java.util.HashMap;

/**
 * A single node of the trie that holds the in-memory dictionary.
 * Each node stands for one character of a dictionary word.
 */
public class TrieNode {
	/** The character stored at this node; the NUL character when none was given. */
	public char key;

	/** Set to {@code true} when this character is the last character of a word. */
	public boolean bound;

	/** Links to the nodes of the characters that may follow this one inside a word. */
	public HashMap<Character, TrieNode> childs;

	/** Creates a node whose key is the NUL character. */
	public TrieNode() {
		this('\0');
	}

	/**
	 * Creates a node for the given character, not marked as a word end
	 * and without any children.
	 *
	 * @param key the character this node represents
	 */
	public TrieNode(char key) {
		this.key = key;
		this.bound = false;
		this.childs = new HashMap<Character, TrieNode>();
	}
}

2. 最大正向匹配（Maximum Matching）算法的实现类 MMSegmenter，核心方法为 segment()：

package cn.wzb.segmenter.mm;

import java.io.IOException;

import cn.wzb.segmenter.AbstractSegmenter;
import cn.wzb.segmenter.mm.bean.TrieDictionary;
import cn.wzb.segmenter.mm.bean.TrieNode;

public class MMSegmenter extends AbstractSegmenter {
	/** Dictionary trie shared by all instances; assigned once in the static initializer. */
	public static TrieDictionary dict = null;
	
	static { // load the dictionary
		String dictionaryName = "/cn/wzb/dictionary/word_dic_utf8.txt";
		dict = TrieDictionary.getInstance(dictionaryName);
	}
	
	public MMSegmenter() {
		super("一个简单的最大的正向匹配器：MMSegmenter");
	}

	/**
	 * Segments a sentence with forward maximum matching (MM) and returns the
	 * segments joined into one string, each followed by the marker {@code '|'}.
	 *
	 * <p>The dictionary is a trie in which every node is a {@link TrieNode} holding:
	 * <ol>
	 *   <li>one character;</li>
	 *   <li>a map from each possible next character to its node;</li>
	 *   <li>a {@code bound} flag telling whether the character ends a word.</li>
	 * </ol>
	 *
	 * <p>Core idea of forward MM: take a word from the sentence, add it to the
	 * output, then add the marker {@code '|'}.
	 *
	 * <p>Characters of the sentence are handled as follows:
	 * <ul>
	 *   <li>separators add no text, but the loop still emits one marker for
	 *       each of them;</li>
	 *   <li>a run of consecutive characters that are neither separators nor
	 *       Chinese becomes one segment;</li>
	 *   <li>Chinese text: a dictionary word becomes one segment; a prefix of a
	 *       dictionary word that is not itself a word, and characters that are
	 *       not in the dictionary, are emitted one character per segment.</li>
	 * </ul>
	 *
	 * @param sentence the text to segment; must not be {@code null}
	 * @return the segmented text with {@code '|'} after every segment
	 */
	public String segment(String sentence) {
		StringBuilder segBuffer = new StringBuilder();

		TrieNode p = dict.getRoot();
		TrieNode pChild = null;

		int length = sentence.length();
		int segBoundIndex = -1; // index in sentence of the last character of the current segment

		for (int i = 0; i < length; ++i) {
			char c = sentence.charAt(i);
			if (CharacterType.isCharSeperator(c)) {
				// separator: contributes no text
			} else if (CharacterType.isCharOther(c)) {
				// run of non-Chinese, non-separator characters forms one segment
				do {
					segBuffer.append(c);
					if (++i == length) {
						break;
					}
					c = sentence.charAt(i);
				} while (CharacterType.isCharOther(c));
				if (i != length) {
					--i; // step back so the for-loop increment lands on the unconsumed character
				}
			} else if (CharacterType.isCharChinese(c)) {
				pChild = p.childs.get(Character.valueOf(c));
				if (pChild == null) { // Chinese character that starts no dictionary word
					segBuffer.append(c);
				} else {
					do { // walk the trie for as long as the text matches
						segBuffer.append(c);
						// Key step: remember the first character and every word end seen,
						// so that a dangling prefix can be cut back to the last good boundary.
						if (p == dict.getRoot() || pChild.bound) {
							segBoundIndex = i;
						}
						if (++i >= length) {
							break;
						}
						c = sentence.charAt(i);
						p = pChild;
						pChild = p.childs.get(Character.valueOf(c));
					} while (pChild != null);
					// drop the characters appended after the last recorded boundary
					if (--i >= segBoundIndex) {
						segBuffer.delete(segBuffer.length() - (i - segBoundIndex), segBuffer.length());
					}
					// resume right after the boundary, starting from the trie root again
					i = segBoundIndex;
					p = dict.getRoot();
				}
			}
			segBuffer.append('|'); // segment marker
		}

		return segBuffer.toString();
	}

	/**
	 * Alternative segmentation that looks every character up in the dictionary
	 * first and only falls back to character classes when no word starts there.
	 * Unlike {@link #segment(String)}, a separator is copied to the output as a
	 * segment of its own.
	 *
	 * <p>Fix: the separator branch used to fall through into the non-Chinese
	 * branch because of a missing {@code else}, so each separator was appended
	 * twice. A separator is now appended exactly once.
	 *
	 * @param sentence the text to segment; must not be {@code null}
	 * @param verison  not read by this method; it only selects this overload
	 * @return the segmented text with {@code '|'} after every segment
	 */
	public String segment(String sentence, String verison) {
		StringBuilder segBuffer = new StringBuilder();

		int segBoundIdx = 0;
		int length = sentence.length();
		TrieNode p = null;
		TrieNode pChild = null;
		
		for (int i = 0; i < length; i++) {
			char c = sentence.charAt(i);
			
			p = dict.getRoot();
			pChild = p.childs.get(Character.valueOf(c));
			
			if (pChild == null) {
				// no dictionary word starts with this character
				if (CharacterType.isCharSeperator(c) || CharacterType.isCharChinese(c)) {
					// a separator or a lone Chinese character is a segment by itself
					segBuffer.append(c);
				} else {
					// run of non-Chinese, non-separator characters forms one segment
					do {
						segBuffer.append(c);
						if (++i == length) {
							break;
						}
						c = sentence.charAt(i);
					} while (CharacterType.isCharOther(c));
					if (i != length) {
						--i; // step back so the for-loop increment lands on the unconsumed character
					}
				}
			} else {
				// walk the trie for as long as the text matches
				while (pChild != null) {
					// remember the first character and every word end seen; a prefix
					// that is not a word is cut back to the last such boundary
					if (p == dict.getRoot() || pChild.bound) {
						segBoundIdx = i;
					}
					segBuffer.append(c);
					if (++i == length) {
						break;
					}
					c = sentence.charAt(i);
					p = pChild;
					pChild = p.childs.get(Character.valueOf(c));
				}
				// drop the characters appended after the last recorded boundary
				if (--i > segBoundIdx) {
					segBuffer.delete(segBuffer.length() - (i - segBoundIdx), segBuffer.length());
				}
				// resume right after the boundary
				i = segBoundIdx;
			}
			segBuffer.append('|');
		}
		
		return segBuffer.toString();
	}

	/** Prints the segmentation of a few sample sentences with both variants. */
	public static void main(String args[]) throws IOException {
		MMSegmenter mmsegger = new MMSegmenter();
		System.out.println(mmsegger.segment("中华人民共和国是一个伟大的国家hello world"));
		System.out.println(mmsegger.segment("小红是个爱学习的好学生!!!!!"));
		System.out.println(mmsegger.segment("中华民de hello world!人民共"));
		System.out.println(mmsegger.segment("中华人民共"));
		System.out.println(mmsegger.segment("中华人民共和国家"));
		System.out.println(mmsegger.segment("爱国"));
		System.out.println(mmsegger.segment("爱我Love你"));
		System.out.println(mmsegger.segment("京华时报２００８年1月23日报道 昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。"));
		
		System.out.println("another version: ");
		System.out.println(mmsegger.segment("中华人民共和国是一个伟大的国家hello world", " "));
		System.out.println(mmsegger.segment("小红是个爱学习的好学生!!!!!", " "));
		System.out.println(mmsegger.segment("中华民de hello world!人民共", " "));
		System.out.println(mmsegger.segment("中华人民共", " "));
		System.out.println(mmsegger.segment("中华人民共和国家", " "));
		System.out.println(mmsegger.segment("爱国", " "));
		System.out.println(mmsegger.segment("爱我Love你", " "));
		System.out.println(mmsegger.segment("京华时报2008年1月23日报道 昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。", ""));
	}
}

3. 字符类型判断辅助类 CharacterType：

package cn.wzb.segmenter.mm;

class CharacterType {
	/**
	 * Every character treated as a separator: Chinese (full-width) and ASCII
	 * punctuation, brackets and quotes, plus newline, carriage return, tab and space.
	 */
	private static final String SEPARATORS =
			"\u3002\uFF01\uFF1F\uFF1A\uFF1B\u3001\uFF0C\uFF08\uFF09\u300A\u300B\u3010\u3011{}\u201C\u201D\u2018\u2019!?:;,()<>[]{}\"'\n\r\t ";

	/** First code unit of the range accepted by {@link #isCharChinese(char)}. */
	private static final char CHINESE_FIRST = '\u4E00';

	/** Last code unit of the range accepted by {@link #isCharChinese(char)}. */
	private static final char CHINESE_LAST = '\u9FBF';

	/**
	 * Tells whether the character is one of the known separators.
	 *
	 * @param c the character to classify
	 * @return {@code true} if {@code c} occurs in the separator list
	 */
	public static boolean isCharSeperator(char c) {
		return SEPARATORS.indexOf(c) >= 0;
	}

	/**
	 * Tells whether the character lies in the range this class treats as Chinese.
	 *
	 * @param c the character to classify
	 * @return {@code true} if {@code c} is within the inclusive range
	 */
	public static boolean isCharChinese(char c) {
		return CHINESE_FIRST <= c && c <= CHINESE_LAST;
	}

	/**
	 * Tells whether the character is neither a separator nor Chinese.
	 *
	 * @param c the character to classify
	 * @return {@code true} if both other classifications reject {@code c}
	 */
	public static boolean isCharOther(char c) {
		return !(isCharSeperator(c) || isCharChinese(c));
	}
}

4. 该算法使用的词典文件：

希望
中华
人民
共和国
中华人民共和国
一个
伟大
国家
西安
北京
家庭
家里
爱国者
我Love你
学习
好学生
学生
爱学
爱学习

5.分词测试结果：

dictionary loading OK!
[oooggooo]一个简单的最大的正向匹配器：MMSegmenter segmenter on
中华人民共和国|是|一个|伟大|的|国家|hello|world|
小|红|是|个|爱学习|的|好学生||||||
中华|民|de|hello|world|人民|共|
中华|人民|共|
中华人民共和国|家|
爱|国|
爱|我Love你|
another version: 
中华人民共和国|是|一个|伟大|的|国家|hello|world|
小|红|是|个|爱学习|的|好学生|!|!|!|!|!|
中华|民|de|hello|world|人民|共|
中华|人民|共|
中华人民共和国|家|
爱|国|
爱|我Love你|