Code:
# 1. Build the experimental corpus
corpus = [
    "我喜欢吃苹果",    # I like to eat apples
    "我喜欢吃香蕉",    # I like to eat bananas
    "她喜欢吃葡萄",    # She likes to eat grapes
    "他不喜欢吃香蕉",  # He doesn't like to eat bananas
    "他喜欢吃苹果",    # He likes to eat apples
    "她喜欢吃草莓",    # She likes to eat strawberries
]
# 2. Split each sentence into single-character "grams"
def tokenize(text):
    # Character-level tokenization: each Chinese character is one token
    return [char for char in text]
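As a quick sanity check, tokenizing the first corpus entry yields one token per character:

# tokenize(corpus[0]) -> ['我', '喜', '欢', '吃', '苹', '果']
print(tokenize(corpus[0]))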
# 3. Count how often each bigram occurs in the corpus
from collections import defaultdict, Counter

def count_ngrams(corpus, n):
    ngrams_count = defaultdict(Counter)
    for text in corpus:
        tokens = tokenize(text)
        # Slide a window of size n over the token list
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            prefix = ngram[:-1]  # the first n-1 tokens
            token = ngram[-1]    # the token that follows the prefix
            ngrams_count[prefix][token] += 1
    return ngrams_count

bigram_counts = count_ngrams(corpus, 2)
# print("Bigram 词频:")
# for prefix, counts in bigram_counts.items():
# print("{}:{}".format("".join(prefix), dict(counts)))
# 4. Compute the probability of each bigram occurring
def ngram_probabilities(ngrams_count):
    ngram_probs = defaultdict(Counter)
    for prefix, tokens_count in ngrams_count.items():
        total_count = sum(tokens_count.values())  # total occurrences of this prefix
        for token, count in tokens_count.items():
            # P(token | prefix) = count(prefix, token) / count(prefix)
            ngram_probs[prefix][token] = count / total_count
    return ngram_probs
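A minimal usage sketch: feed in the counts from step 3 and look up one conditional probability (the name bigram_probs is illustrative, not from the original). Since '苹' follows the prefix '吃' in 2 of its 6 occurrences, the looked-up value is 2/6 ≈ 0.33:

bigram_probs = ngram_probabilities(bigram_counts)
# P('苹' | '吃') = 2 / 6 ≈ 0.33
print(bigram_probs[('吃',)]['苹'])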