代码:
# 1. Build the toy training corpus: six short Chinese sentences.
corpus = [
    "我喜欢吃苹果",
    "我喜欢吃香蕉",
    "她喜欢吃葡萄",
    "他不喜欢吃香蕉",
    "他喜欢吃苹果",
    "她喜欢吃草莓",
]
# 2. Split a sentence into tokens (character-level granularity).
def tokenize(text):
    """Return the list of single characters that make up *text*."""
    return list(text)
# 3. Count the frequency of every n-gram in the corpus.
from collections import defaultdict, Counter


def count_ngrams(corpus, n):
    """Map each (n-1)-token prefix to a Counter of the tokens that follow it.

    corpus: iterable of strings; n: n-gram order (2 for bigrams).
    Returns a defaultdict(Counter): {prefix_tuple: Counter({token: count})}.
    """
    counts = defaultdict(Counter)
    for sentence in corpus:
        # Character-level tokenization (the one-liner tokenize() inlined).
        chars = list(sentence)
        for start in range(len(chars) - n + 1):
            # Split each n-gram window into its prefix and final token.
            *prefix, last = chars[start:start + n]
            counts[tuple(prefix)][last] += 1
    return counts
# Tally bigram (n=2) frequencies over the corpus.
bigram_counts = count_ngrams(corpus, 2)
# Uncomment to inspect the raw counts:
# print("Bigram 词频:")
# for prefix, counts in bigram_counts.items():
#     print("{}:{}".format("".join(prefix), dict(counts)))
# 4. Turn raw n-gram counts into conditional probabilities.
def ngram_probabilities(ngrams_counts):
    """Normalize each prefix's follower counts into a probability distribution.

    ngrams_counts: mapping {prefix_tuple: Counter({token: count})}.
    Returns a defaultdict(Counter) with counts replaced by count / total.
    """
    probs = defaultdict(Counter)
    for prefix, followers in ngrams_counts.items():
        total = sum(followers.values())
        probs[prefix] = Counter(
            {token: freq / total for token, freq in followers.items()}
        )
    return probs
# Normalize the bigram counts into conditional probabilities.
bigram_probs = ngram_probabilities(bigram_counts)
# Uncomment to inspect the probabilities:
# print("\nbigram 出现的概率:")
# for prefix, probs in bigram_probs.items():
#     print("{}:{}".format(prefix, dict(probs)))
# 5. Given a prefix, pick the most probable next token.
def generate_next_token(prefix, ngram_probs):
    """Return the highest-probability token following *prefix*.

    prefix: tuple of tokens of length n-1.
    ngram_probs: mapping {prefix_tuple: {token: probability}}.
    Returns None when the prefix was never seen in training.
    """
    # Idiomatic membership test (was `if not prefix in ...`).
    if prefix not in ngram_probs:
        return None
    next_token_probs = ngram_probs[prefix]
    # Ties are broken by first-encountered key — same as the original max().
    return max(next_token_probs, key=next_token_probs.get)
# 6. Greedily extend a prefix, one token at a time, using the n-gram model.
def generate_text(prefix, ngram_probs, n, length=6):
    """Generate up to *length* extra tokens after *prefix* and return the text.

    prefix: seed string; ngram_probs: {prefix_tuple: {token: prob}};
    n: n-gram order; length: maximum number of tokens to append.
    Generation stops early when the current context is unseen.
    """
    tokens = list(prefix)
    for _ in range(length):
        # Context is the last n-1 tokens. For n == 1 the original slice
        # `tokens[-(n-1):]` degenerates to `tokens[-0:]` (the whole list);
        # use the empty tuple instead so unigram models work too.
        context = tuple(tokens[-(n - 1):]) if n > 1 else ()
        next_token = generate_next_token(context, ngram_probs)
        # `is None` rather than truthiness, so a falsy token such as "0"
        # does not terminate generation prematurely.
        if next_token is None:
            break
        tokens.append(next_token)
    return ''.join(tokens)
# Demo: generate a continuation for the prefix "他".
# BUG FIX: the original assigned the result back to the name `generate_text`,
# shadowing the function with a string; use a distinct variable instead.
generated = generate_text('他', bigram_probs, 2)
print("\n生成的文本:", generated)
结果: