NLP实验四：汉语分词

please叫我滚去学习

已于 2022-10-01 10:05:59 修改

阅读量413

点赞数

分类专栏：自然语言处理文章标签：自然语言处理 java 算法 python

于 2022-06-03 16:10:52 首次发布

本文链接：https://blog.csdn.net/zhiliao_aaa/article/details/125113150

版权

自然语言处理专栏收录该内容

5 篇文章 2 订阅

订阅专栏

最大匹配算法（Max Matching）

最大匹配算法

前向最大匹配算法（Forward Max Matching）

FMM

Java实现：

package NLP.NLP04;

import java.util.*;

public class NLP04_FMM {
    private final Set<String> Dictionary = new HashSet<>();

    /**
     * 添加词库
     */
    public void setDictionary(String[] words) {
        Collections.addAll(this.Dictionary, words);
    }

    /**
     * 正向最大匹配Forward Maximum Matching
     *
     * @param sentence 要分词的句子
     * @param maxSize  词的最大长度
     */
    public String[] positiveSplit(String sentence, int maxSize) {
        List<String> result = new ArrayList<>();
        while (sentence.length() > 0) {
            String forward_maxSize = sentence.substring(0, Math.min(maxSize, sentence.length()));//先取这句话的前maxSize个字
            while (!Dictionary.contains(forward_maxSize)) {//如果这些截取的字组成的词不在词典中
                if (forward_maxSize.length() > 1) {
                    forward_maxSize = forward_maxSize.substring(0, forward_maxSize.length() - 1);//把这个前缀部分从尾部减去一个字
                } else {
                    break;//到最后都没在词典中找到的话，退出循环，直接按照一个新词加入就行
                }
            }
            result.add(forward_maxSize);
            sentence = sentence.substring(forward_maxSize.length());//当前的句子已经匹配上一个词，要把这个词从头部减去
        }
        return result.toArray(new String[0]);
    }

    public static void main(String[] args) {
        String[] words = new String[]{"他", "我", "你", "她", "呵呵", "哈哈", "中文", "字母", "数学", "吃饭", "二", "三", "读研", "究极", "去", "他们", "老人",
                "走", "念书", "睡觉", "看苏", "看看", "走走", "喝水", "看电视", "手指", "脚趾", "手脚", "刚好", "好的", "并且", "短发", "滑的", "头发", "在", "吃饭",
                "句子", "是", "研究", "研究生", "生物", "物化", "化学", "学", "的", "一", "位", "科学", "家", "科学家", "。", "？", "！", "吃饭饭", "睡觉觉", "恶心心"
                , "边", "过年", "计算机", "计算机网络", "网络", "操作系统", "自然语言处理", "汉语", "分词", "自然", "语言", "处理"};
        String[] sentences = {"他是研究生物化学的一个科学家。",
                "你去研究自然语言处理。",
                "她是计算机网络科学家。",
                "我的头发是短发并且是滑的。",
                "他边吃饭边看电视。"};
        String[][] answers = {{"他", "是", "研究", "生物化学", "的", "一", "个", "科学家", "。"},
                {"你", "去", "研究", "自然语言处理", "。"},
                {"她", "是", "计算机", "网络", "科学家", "。"},
                {"我", "的", "头发", "是", "短发", "并且", "是", "滑的", "。"},
                {"他", "边", "吃饭", "边", "看", "电视", "。"}};
        NLP04_FMM nlp04_fmm = new NLP04_FMM();
        /*添加词到词库*/
        nlp04_fmm.setDictionary(words);
        int num_of_outWords = 0;
        int num_of_rights = 0;
        int num_of_answers = 0;
        for (String sentence : sentences) {
            System.out.println("sentence：" + sentence);
        }
        System.out.println();
        for (String[] answer : answers) {
            num_of_answers += answer.length;
            System.out.println("answer：" + Arrays.toString(answer));
        }
        System.out.println();
        System.out.println("FMM(正向最大匹配)分词算法：");
        /*下面我指定最大词长度为7*/
        for (int i = 0; i < sentences.length; i++) {
            String[] FMM = nlp04_fmm.positiveSplit(sentences[i], 7);
            num_of_outWords += FMM.length;
            List<String> answer_list = Arrays.asList(answers[i]);
            for (String result_words : FMM) {
                if (answer_list.contains(result_words))
                    num_of_rights++;
            }
            System.out.println("Result：" + Arrays.toString(FMM)
                    .substring(1, Arrays.toString(FMM).length() - 1)
                    .replaceAll(", ", "/"));
        }
        System.out.println();
        System.out.println("测试集共输出：" + num_of_outWords + "个词");
        System.out.println("标准答案是：" + num_of_answers + "个词");
        System.out.println("系统切分出来的结果中有" + num_of_rights + "个词是正确的");
        double P = (double)num_of_rights / num_of_outWords;
        double R = (double)num_of_rights / num_of_answers;
        double F1 = 2 * P * R / (P + R);
        System.out.println();
        System.out.println("P=" + P);
        System.out.println("R=" + R);
        System.out.println("F1=" + F1);
    }

}

测试结果

在这里插入图片描述

N-最短路径算法(Python实现)

算法思想

N最短路径算法是一种基于词典的分词算法. 每个句子将生成一个有向无环图DAG, 每个字作为图的一个顶点, 边代表可能的分词.
在这里插入图片描述
在上图中，边的起点为词的第一个字，边的终点为词尾的下一个字。边1表示“我”字单字成词，边2表示 “只是”可以作为一个单词。
每个边拥有一个权值，表示该词出现的概率。最简单的做法是采用词频作为权值。
N最短路径分词即在上述有向无环图中寻找N条权值和最大的路径，路径上的边标志了最可能的分词结果。通常我们只寻找权值和最大的那一条路径。
在这里插入图片描述
根据上图的权值最大路径得到的分词结果为：“我/只是/做/了/一些/微小/的/工作”

举例

N-最短路径方法

评价指标

在这里插入图片描述

实现词典

# 词典将根据输入的文本, 统计各单词的词频:
# 得到的词频存储在Words.txt文件下
class WordDictModel:
    """Word-frequency dictionary built from whitespace-segmented text.

    Typical use: ``load_data(path)`` followed by ``update()`` to count the
    words of a corpus, then ``save()`` / ``load()`` to persist the counts as
    one ``"<word> <count>"`` pair per line.
    """

    def __init__(self):
        self.word_dict = {}  # word -> number of occurrences counted so far
        self.data = None  # corpus handle between load_data() and update()
        self.stop_words = {}  # words update() skips; only membership is tested

    def load_data(self, filename):
        """Open the UTF-8 corpus *filename* for the next ``update()`` call.

        A handle left over from an earlier ``load_data()`` is closed first so
        that repeated calls do not leak file descriptors.
        """
        self._release_data()
        self.data = open(filename, "r", encoding="utf-8")

    def _release_data(self):
        """Close the pending corpus handle, if any, and forget it."""
        pending, self.data = self.data, None
        # ``data`` may have been set to any iterable of lines, so only close
        # objects that actually offer close().
        close = getattr(pending, "close", None)
        if close is not None:
            close()

    def update(self):
        """Add the word counts of the loaded corpus to ``word_dict``.

        Each line is split on runs of whitespace, so the trailing newline and
        repeated separators never end up as (part of) a word. The corpus is
        consumed and closed; calling ``update()`` again without a new
        ``load_data()`` does nothing.
        """
        if self.data is None:
            return
        try:
            for line in self.data:
                for word in line.split():
                    if word in self.stop_words:
                        continue
                    self.word_dict[word] = self.word_dict.get(word, 0) + 1
        finally:
            self._release_data()

    def save(self, filename="Words.txt", code="txt"):
        """Write ``word_dict`` to *filename*, one ``"<word> <count>"`` per line.

        *code* is accepted for backward compatibility and is not used.
        """
        with open(filename, "w", encoding="utf-8") as fw:
            fw.writelines(
                "%s %d\n" % (word, count)
                for word, count in self.word_dict.items()
            )

    def load(self, filename="Words.txt", code="txt"):
        """Merge the counts stored in *filename* into ``word_dict``.

        Lines with fewer than two fields are skipped. If a word occurs on
        several lines of the file, its last line wins before the merge. A file
        without any usable line leaves ``word_dict`` unchanged.
        *code* is accepted for backward compatibility and is not used.

        Raises:
            ValueError: if the count field of a line is not an integer.
        """
        loaded = {}
        with open(filename, "r", encoding="utf-8") as fr:
            for line in fr:
                fields = line.split()
                if len(fields) < 2:
                    continue
                loaded[fields[0]] = int(fields[1])

        # Counts are added to whatever is already in the dictionary.
        for word, count in loaded.items():
            self.word_dict[word] = self.word_dict.get(word, 0) + count

最短路径分词：

# encoding=utf-8
from Word_Dictionary import WordDictModel


# 最短路径分词
# 分词器需要继承WordDictModel, 并利用其词典. 遍历输入语句中所有子串, 并查询其词频构造有向无环图:
class DAGSegment(WordDictModel):
    # 产生的dag格式:
    # {
    #     0: [(1, 1)],
    #     1: [(2, 1), (3, 738)],
    #     2: [(3, 1)],
    #     3: [(4, 1)],
    #     4: [(5, 1)],
    #     5: [(6, 1), (7, 2309)],
    #     6: [(7, 1)],
    #     7: [(8, 1), (9, 304)],
    #     8: [(9, 1)],
    #     9: [(10, 1)],
    #     10: [(11, 1), (12, 2605)],
    #     11: [(12, 1)],
    # }
    # 字典中的值代表其键的后继顶点, 如1: [(2, 1), (3, 738)]表示顶点1到顶点2的权值为1, 到顶点3的权值为738.
    def build_dag(self, sentence):
        dag = {}
        for start in range(len(sentence)):
            unique = [start + 1]
            tmp = [(start + 1, 1)]
            for stop in range(start+1, len(sentence)+1):
                fragment = sentence[start:stop]
                num = self.word_dict.get(fragment, 0)
                if num > 0 and (stop not in unique):
                    tmp.append((stop, num))
                    unique.append(stop)
            dag[start] = tmp
            # print(dag[start])
        return dag

    # 最短路径最好使用Floyd或Dijkstra算法.但其耗时太久, 大多数情况下使用贪心算法求得次优解就可以达到所需精度:
    def predict(self, sentence):
        Len = len(sentence)
        route = [0] * Len
        dag = self.build_dag(sentence)  # {i: (stop, num)}

        for i in range(0, Len):
            route[i] = max(dag[i], key=lambda x: x[1])[0]
        return route