def add_endings(text):
    """Split *text* into sentences and wrap each in sentence markers.

    Each sentence is surrounded by "<s> " and " </s> " (note the trailing
    space after </s>, which the downstream regex tokenizer relies on).

    Args:
        text: The raw article text.

    Returns:
        A single string: all marked sentences concatenated in order.
    """
    sentences = nltk.sent_tokenize(text)
    # "".join over a generator avoids the quadratic cost of += in a loop
    # and replaces the confusing empty '""""""' accumulator literal.
    return "".join("<s> " + sentence + " </s> " for sentence in sentences)
def preprocess(text):
    """Lowercase the article, add sentence markers, and tokenize it.

    Args:
        text: The raw article text.

    Returns:
        A list of token strings produced by splitting on punctuation and
        whitespace (may contain empty strings at the split boundaries).
    """
    marked = add_endings(text.lower())  # lowercase, then mark sentence boundaries
    # Split on runs of punctuation/whitespace; <, >, and / are deliberately
    # absent from the class so the <s> / </s> markers survive as tokens.
    splitter = re.compile(r"[-\[\]()\t\n.,;!?“”‘'\\`~\s]+")
    return splitter.split(marked)
def unigrams(tokens):
    """Build a unigram frequency model from a token sequence.

    Args:
        tokens: The normalized, tokenized corpus as a sequence of strings.

    Returns:
        A dict mapping each distinct token to the number of times it
        occurs in the corpus.
    """
    from collections import Counter  # stdlib frequency counting beats a hand-rolled loop
    # Counter is a dict subclass; convert back to a plain dict so the
    # return type matches the original contract exactly.
    return dict(Counter(tokens))
def bigrams(tokens):
    """Build a bigram frequency model from a token sequence.

    Pairs are formed from adjacent tokens; a pair is skipped when its
    first token is the sentence-end marker "</s>" or an empty string.

    Args:
        tokens: The normalized, tokenized corpus as a sequence of strings.

    Returns:
        A dict mapping each "word1 word2" bigram string to its count.
    """
    bigram = {}
    # zip(tokens, tokens[1:]) pairs each token with its successor and
    # naturally stops before the last token, fixing the IndexError the
    # original tokens[index + 1] raised when the sequence ended with a
    # normal word (it only survived because preprocess appends a "" token).
    for first, second in zip(tokens, tokens[1:]):
        if first != "</s>" and first != "":
            key = first + " " + second
            bigram[key] = bigram.get(key, 0) + 1
    return bigram
# NLP: Python implementation of unigram and bigram language models
# (blog post, first published 2022-07-05 10:47:04)