正文
代码
import nltk
import string
from nltk.util import ngrams
from collections import Counter
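# Download the NLTK tokenizer data (only needed the first time the script runs).
# NOTE: recent NLTK releases may require nltk.download('punkt_tab') instead - verify for your version.
# nltk.download('punkt')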
# Sample text used throughout the n-gram demo (adjacent literals are concatenated).
document = (
    "In the bustling city of Metroville, a young inventor named Leo "
    "worked tirelessly in his small workshop. His latest project was a "
    "drone capable of delivering supplies to remote areas. After months "
    "of trial and error, the prototype was finally ready for its first "
    "test flight. Early one morning, Leo launched the drone, watching "
    "anxiously as it soared into the sky. The drone navigated the urban "
    "landscape with ease, avoiding obstacles and reaching its destination "
    "flawlessly. This success marked a significant step forward in Leo's "
    "mission to improve access to essential resources for isolated communities "
    "around the world. Leo's drone project was truly innovative. "
    "Leo worked hard on his drone project. The prototype was a success, "
    "and Leo was proud of his work."
)
# Text preprocessing
def preprocess_text(text: str) -> list:
    """Lowercase *text*, strip ASCII punctuation, and tokenize the result.

    Punctuation is removed *before* tokenizing, so an apostrophe inside a
    word disappears as well: "Leo's" is turned into "leos" before the
    tokenizer sees it.

    Args:
        text: Raw input string.

    Returns:
        The tokens produced by ``nltk.word_tokenize`` for the cleaned text.
    """
    lowered = text.lower()
    # string.punctuation only lists ASCII punctuation; any other symbol
    # (e.g. typographic quotes) is passed through to the tokenizer.
    cleaned = lowered.translate(str.maketrans('', '', string.punctuation))
    return nltk.word_tokenize(cleaned)
# Normalize and tokenize the sample document
tokens = preprocess_text(document)
# Generate n-gram features
def generate_ngrams(tokens, n):
    """Return every run of *n* consecutive tokens as a list.

    Args:
        tokens: Sequence of tokens to slide the window over.
        n: Window size (1 for unigrams, 2 for bigrams, ...).

    Returns:
        A list holding the items yielded by ``nltk.util.ngrams(tokens, n)``.
    """
    # ngrams() is lazy; materialize it so the result can be reused and counted.
    return list(ngrams(tokens, n))
# Count n-gram frequencies
def count_ngrams(n_grams):
    """Tally how many times each n-gram occurs.

    Args:
        n_grams: Iterable of hashable n-grams (e.g. tuples of tokens).

    Returns:
        A ``collections.Counter`` mapping each distinct n-gram to its count.
    """
    return Counter(n_grams)
# Build the unigram (n=1), bigram (n=2) and trigram (n=3) sequences.
unigrams, bigrams, trigrams = (
    generate_ngrams(tokens, size) for size in (1, 2, 3)
)
# Tally how often each n-gram occurs, in the same order.
unigram_counts, bigram_counts, trigram_counts = map(
    count_ngrams, (unigrams, bigrams, trigrams)
)
# Print every n-gram with its frequency, one section per n-gram order.
# The headings are runtime output and are kept exactly as they were; the
# leading "\n" on the later headings inserts a blank line between sections.
for heading, counts in (
    ("Unigram特征及其频率:", unigram_counts),
    ("\nBigram特征及其频率:", bigram_counts),
    ("\nTrigram特征及其频率:", trigram_counts),
):
    print(heading)
    for gram, count in counts.items():
        print(f"{gram}: {count}")
# Convert n-gram counts into a feature vector
def ngram_to_feature_vector(ngram_counts):
    """Turn an n-gram count mapping into a string-keyed feature dict.

    Args:
        ngram_counts: Mapping from n-gram (a tuple of string tokens) to its
            count, e.g. the ``Counter`` returned by ``count_ngrams``.

    Returns:
        A dict whose keys are the n-gram tokens joined by single spaces and
        whose values are the corresponding counts.
    """
    return {' '.join(gram): count for gram, count in ngram_counts.items()}
# Convert the unigram, bigram and trigram counts into feature mappings.
unigram_features, bigram_features, trigram_features = (
    ngram_to_feature_vector(counts)
    for counts in (unigram_counts, bigram_counts, trigram_counts)
)
# Each heading goes on its own line, followed by the mapping on the next line.
print("\nUnigram特征向量:", unigram_features, sep="\n")
print("\nBigram特征向量:", bigram_features, sep="\n")
print("\nTrigram特征向量:", trigram_features, sep="\n")
参考
【n-gram算法】一篇文章讲透~_ngram-CSDN博客