NLP:Python 实现unigram(一元组)和bigram(二元组)

def add_endings(text):
    """Split text into sentences and wrap each one in sentence markers.

    Every sentence returned by ``nltk.sent_tokenize`` is prefixed with
    ``"<s> "`` and suffixed with ``" </s> "``; the wrapped sentences are
    concatenated in order and returned as one string.
    """
    wrapped = ["<s> " + piece + " </s> " for piece in nltk.sent_tokenize(text)]
    return "".join(wrapped)


def preprocess(text):
    """Lowercase the raw text, mark sentence boundaries, and split into tokens.

    The text is lowercased, passed through ``add_endings`` and then split on
    runs of punctuation/whitespace. The characters of ``<s>`` and ``</s>``
    are not in the separator class, so the markers survive as tokens.
    Because ``re.split`` is used, the returned list can contain an empty
    string at either end when the text starts or ends with a separator.
    """
    marked = add_endings(text.lower())
    separators = re.compile(r"[-\[\]()\t\n.,;!?“”‘'\\`~\s]+")
    return separators.split(marked)


def unigrams(tokens):
    """Build a unigram model from a normalized, tokenized sequence.

    Returns a dict whose keys are the distinct tokens and whose values are
    the number of times each token occurs in ``tokens``.
    """
    counts = {}
    for token in tokens:
        # Tokens not seen before start from zero.
        counts[token] = counts.get(token, 0) + 1
    return counts


def bigrams(tokens):
    """Build a bigram model from a normalized, tokenized sequence.

    Each key is two adjacent tokens joined by a single space, and each value
    is the number of times that pair occurs. A pair is skipped when its first
    token is the sentence end marker ``"</s>"`` or an empty string, so no
    bigram starts at a sentence end.

    Args:
        tokens: List (or other sliceable sequence) of token strings.

    Returns:
        dict mapping each bigram string to its occurrence count.
    """
    bigram = {}
    # Pair every token with its successor. zip stops at the shorter input,
    # so the final token is never used as the start of a pair; indexing
    # tokens[index + 1] directly raised IndexError when the sequence did
    # not end with "</s>" or "".
    for first, second in zip(tokens, tokens[1:]):
        if first == "</s>" or first == "":
            continue
        pair = first + " " + second
        bigram[pair] = bigram.get(pair, 0) + 1
    return bigram
  • 3
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

海绵大星星

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值