一个简单最大正向匹配（Maximum Matching）MM中文分词算法的实现

最新推荐文章于 2020-07-03 13:51:02 发布

weixin_34266504

最新推荐文章于 2020-07-03 13:51:02 发布

阅读量201

点赞数

文章标签： c/c++ matlab java

转载http://blog.csdn.net/wzb56/article/details/7914954#

1.构建词典内存树的TrieNode节点类：

package cn.wzb.segmenter.mm.bean;

import java.util.HashMap;

/**
 * A single node of the in-memory dictionary Trie.
 */
public class TrieNode {
    /** Key of this node: one character of a dictionary word. */
    public char key;

    /** Set to {@code true} when this character is the last character of a word. */
    public boolean bound;

    /** Links to the following nodes, keyed by the next character of the word. */
    public HashMap<Character, TrieNode> childs;

    /** Creates a node whose key is the NUL character. */
    public TrieNode() {
        this('\0');
    }

    /**
     * Creates a node for the given character, with no children and {@code bound} unset.
     *
     * @param key the character stored in this node
     */
    public TrieNode(char key) {
        this.key = key;
        this.bound = false;
        this.childs = new HashMap<Character, TrieNode>();
    }
}

2. 正向最大匹配（Maximum Matching）算法的实现类：MMSegmenter类，核心方法为segment()：

package cn.wzb.segmenter.mm;

import java.io.IOException;

import cn.wzb.segmenter.AbstractSegmenter;
import cn.wzb.segmenter.mm.bean.TrieDictionary;
import cn.wzb.segmenter.mm.bean.TrieNode;

/**
 * A simple forward maximum-matching (MM) segmenter backed by a Trie dictionary.
 *
 * <p>Both {@code segment} overloads return the input with a {@code '|'} appended after
 * every token.
 */
public class MMSegmenter extends AbstractSegmenter {
    /** Dictionary Trie shared by all instances; assigned by the static initializer. */
    public static TrieDictionary dict = null;

    static { // load the dictionary
        String dictionaryName = "/cn/wzb/dictionary/word_dic_utf8.txt";
        dict = TrieDictionary.getInstance(dictionaryName);
    }

    public MMSegmenter() {
        super("一个简单的最大的正向匹配器：MMSegmenter");
    }

    /**
     * Segments {@code sentence} with forward maximum matching.
     *
     * <p>Dictionary: a Trie in which every node is a {@link TrieNode} holding
     * <ol>
     *   <li>one character,</li>
     *   <li>a map from each possible next character to its node,</li>
     *   <li>the {@code bound} flag telling whether the character ends a word.</li>
     * </ol>
     *
     * <p>Core idea: take a token from the sentence, append it to the output, then append
     * the token mark {@code '|'}.
     *
     * <p>Kinds of input handled:
     * <ul>
     *   <li>Separators: no character is copied to the output. The {@code '|'} is still
     *       appended, so each separator yields an empty token.</li>
     *   <li>Non-Chinese text: a run of consecutive non-Chinese, non-separator characters
     *       is one token.</li>
     *   <li>Chinese text: the longest dictionary word starting at the current position is
     *       one token. A character that only starts a dictionary prefix, or is not in the
     *       dictionary at all, is a single-character token.</li>
     * </ul>
     *
     * @param sentence the text to segment; {@code null} causes a NullPointerException
     * @return the tokens in order, each followed by {@code '|'}
     */
    public String segment(String sentence) {
        // Only used locally, so the unsynchronized builder is sufficient.
        StringBuilder segBuffer = new StringBuilder();
        final TrieNode root = dict.getRoot();

        TrieNode p = root; // always back at the root when a new token starts
        TrieNode pChild;

        int length = sentence.length();
        int segBoundIndex = -1; // index in sentence of the last character of the latest token

        for (int i = 0; i < length; ++i) {
            char c = sentence.charAt(i);
            if (CharacterType.isCharSeperator(c)) {
                // Separator: nothing is copied; only the token mark below is emitted.
            } else if (CharacterType.isCharOther(c)) {
                // Copy the whole run of non-Chinese characters as one token.
                do {
                    segBuffer.append(c);
                    if (++i == length) {
                        break;
                    }
                    c = sentence.charAt(i);
                } while (CharacterType.isCharOther(c));
                if (i != length) {
                    --i; // step back so the loop increment revisits the character that ended the run
                }
            } else if (CharacterType.isCharChinese(c)) {
                pChild = p.childs.get(Character.valueOf(c));
                if (pChild == null) {
                    // No dictionary word starts with this character.
                    segBuffer.append(c);
                } else {
                    // Walk the Trie as far as the sentence allows.
                    do {
                        segBuffer.append(c);
                        // Key step: remember the first character (so a bare prefix still
                        // splits off one character) and every position that ends a word.
                        if (p == root || pChild.bound) {
                            segBoundIndex = i;
                        }
                        if (++i >= length) {
                            break;
                        }
                        c = sentence.charAt(i);
                        p = pChild;
                        pChild = p.childs.get(Character.valueOf(c));
                    } while (pChild != null);
                    // Drop the characters copied past the last recorded boundary.
                    if (--i >= segBoundIndex) {
                        segBuffer.delete(segBuffer.length() - (i - segBoundIndex), segBuffer.length());
                    }
                    // Resume scanning right after the boundary, from the Trie root.
                    i = segBoundIndex;
                    p = root;
                }
            }
            segBuffer.append('|'); // token mark
        }

        return segBuffer.toString();
    }

    /**
     * Alternative segmentation that keeps separators in the output: each separator is
     * emitted as its own single-character token.
     *
     * <p>The Trie is tried first for every character, whatever its type. Characters that
     * start no dictionary word are then handled by type: a separator or a Chinese
     * character becomes a single-character token, anything else starts a run of
     * non-Chinese characters that forms one token.
     *
     * @param sentence the text to segment; {@code null} causes a NullPointerException
     * @param verison  not read by this method; it only selects this overload
     * @return the tokens in order, each followed by {@code '|'}
     */
    public String segment(String sentence, String verison) {
        StringBuilder segBuffer = new StringBuilder();
        final TrieNode root = dict.getRoot();

        int segBoundIdx = 0;
        int length = sentence.length();

        for (int i = 0; i < length; i++) {
            char c = sentence.charAt(i);

            TrieNode p = root;
            TrieNode pChild = p.childs.get(Character.valueOf(c));

            if (pChild == null) {
                // Character that starts no dictionary word.
                if (CharacterType.isCharSeperator(c) || CharacterType.isCharChinese(c)) {
                    // Separators and Chinese characters are single-character tokens. They
                    // must not reach the run loop below, which would append the character
                    // a second time and merge it with the following non-Chinese text.
                    segBuffer.append(c);
                } else {
                    // Run of non-Chinese characters.
                    do {
                        segBuffer.append(c);
                        if (++i == length) {
                            break;
                        }
                        c = sentence.charAt(i);
                    } while (CharacterType.isCharOther(c));
                    if (i != length) {
                        --i; // step back so the loop increment revisits the character that ended the run
                    }
                }
            } else {
                // Dictionary word, or the prefix of one; a bare prefix is split per character.
                while (pChild != null) {
                    if (p == root || pChild.bound) {
                        segBoundIdx = i;
                    }
                    segBuffer.append(c);
                    if (++i == length) {
                        break;
                    }
                    c = sentence.charAt(i);
                    p = pChild;
                    pChild = p.childs.get(Character.valueOf(c));
                }
                // Drop the characters copied past the last recorded boundary.
                if (--i > segBoundIdx) {
                    segBuffer.delete(segBuffer.length() - (i - segBoundIdx), segBuffer.length());
                }
                // Resume scanning right after the boundary.
                i = segBoundIdx;
            }
            segBuffer.append('|');
        }

        return segBuffer.toString();
    }

    /** Prints the segmentation of a few sample sentences using both overloads. */
    public static void main(String[] args) throws IOException {
        MMSegmenter mmsegger = new MMSegmenter();
        System.out.println(mmsegger.segment("中华人民共和国是一个伟大的国家hello world"));
        System.out.println(mmsegger.segment("小红是个爱学习的好学生!!!!!"));
        System.out.println(mmsegger.segment("中华民de hello world!人民共"));
        System.out.println(mmsegger.segment("中华人民共"));
        System.out.println(mmsegger.segment("中华人民共和国家"));
        System.out.println(mmsegger.segment("爱国"));
        System.out.println(mmsegger.segment("爱我Love你"));
        System.out.println(mmsegger.segment("京华时报２００８年1月23日报道昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。"));

        System.out.println("another version: ");
        System.out.println(mmsegger.segment("中华人民共和国是一个伟大的国家hello world", " "));
        System.out.println(mmsegger.segment("小红是个爱学习的好学生!!!!!", " "));
        System.out.println(mmsegger.segment("中华民de hello world!人民共", " "));
        System.out.println(mmsegger.segment("中华人民共", " "));
        System.out.println(mmsegger.segment("中华人民共和国家", " "));
        System.out.println(mmsegger.segment("爱国", " "));
        System.out.println(mmsegger.segment("爱我Love你", " "));
        System.out.println(mmsegger.segment("京华时报2008年1月23日报道昨天，受一股来自中西伯利亚的强冷空气影响，本市出现大风降温天气，白天最高气温只有零下7摄氏度，同时伴有6到7级的偏北风。", ""));
    }
}

3. 字符类型判断辅助类：CharacterType类：

package cn.wzb.segmenter.mm;

/**
 * Character classification helpers used by the segmenter. Every {@code char} falls into
 * exactly one of three classes: separator, Chinese, or other.
 */
class CharacterType {
    /**
     * All separator characters: full-width and ASCII punctuation, brackets and quotes,
     * plus newline, carriage return, tab and space.
     */
    private static final String SEPARATORS =
            "\u3002\uFF01\uFF1F\uFF1A\uFF1B\u3001\uFF0C\uFF08\uFF09\u300A\u300B\u3010\u3011{}\u201C\u201D\u2018\u2019!?:;,()<>[]{}\"'\n\r\t ";

    /**
     * Tells whether {@code c} is one of the separator characters.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} occurs in the separator list
     */
    public static boolean isCharSeperator(char c) {
        return SEPARATORS.indexOf(c) != -1;
    }

    /**
     * Tells whether {@code c} lies in the CJK Unified Ideographs block, U+4E00 through
     * U+9FFF inclusive.
     *
     * @param c the character to classify
     * @return {@code true} if {@code c} is inside that block
     */
    public static boolean isCharChinese(char c) {
        // The block ends at U+9FFF; stopping earlier would misclassify its last code points.
        return c >= '\u4E00' && c <= '\u9FFF';
    }

    /**
     * Tells whether {@code c} is neither a separator nor a Chinese character.
     *
     * @param c the character to classify
     * @return {@code true} if both other classifications reject {@code c}
     */
    public static boolean isCharOther(char c) {
        return !isCharSeperator(c) && !isCharChinese(c);
    }
}

4. 该算法使用的词典文件：

[java] view plain copy

希望
中华
人民
共和国
中华人民共和国
一个
伟大
国家
西安
北京
家庭
家里
爱国者
我Love你
学习
好学生
学生
爱学
爱学习

5.分词测试结果：

[java] view plain copy

dictionary loading OK!
[oooggooo]一个简单的最大的正向匹配器：MMSegmenter segmenter on
中华人民共和国|是|一个|伟大|的|国家|hello|world|
小|红|是|个|爱学习|的|好学生||||||
中华|民|de|hello|world|人民|共|
中华|人民|共|
中华人民共和国|家|
爱|国|
爱|我Love你|
another version:
中华人民共和国|是|一个|伟大|的|国家|hello|world|
小|红|是|个|爱学习|的|好学生|!|!|!|!|!|
中华|民|de|hello|world|人民|共|
中华|人民|共|
中华人民共和国|家|
爱|国|
爱|我Love你|

weixin_34266504

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
一个简单最大正向匹配（Maximum Matching）MM中文分词算法的实现

转载http://blog.csdn.net/wzb56/article/details/7914954#1.构建词典内存树的TrieNode节点类： package cn.wzb.segmenter.mm.bean;import java.util.HashMap;/** * 构建内存词典的Trie树结点 * */public class TrieNode { /...
复制链接

扫一扫