特征工程模型:N-gram

正文

 代码

import nltk
import string
from nltk.util import ngrams
from collections import Counter

# Download the NLTK tokenizer data (only needed the first time this runs)
# nltk.download('punkt')

# Revised sample text used as the input corpus for the n-gram demo.
# Adjacent string literals inside the parentheses are joined at compile time.
document = (
    "In the bustling city of Metroville, a young inventor named Leo "
    "worked tirelessly in his small workshop. His latest project was a "
    "drone capable of delivering supplies to remote areas. After months "
    "of trial and error, the prototype was finally ready for its first "
    "test flight. Early one morning, Leo launched the drone, watching "
    "anxiously as it soared into the sky. The drone navigated the urban "
    "landscape with ease, avoiding obstacles and reaching its destination "
    "flawlessly. This success marked a significant step forward in Leo's "
    "mission to improve access to essential resources for isolated communities "
    "around the world. Leo's drone project was truly innovative. "
    "Leo worked hard on his drone project. The prototype was a success, "
    "and Leo was proud of his work."
)

# Text preprocessing helper
def preprocess_text(text):
    """Lowercase *text*, strip punctuation, and split it into word tokens.

    Every character listed in ``string.punctuation`` is deleted outright
    (not replaced by a space), so a possessive such as "Leo's" collapses
    to the single token "leos".

    Returns the token sequence produced by ``nltk.word_tokenize``.
    """
    # Translation table that maps each ASCII punctuation character to "deleted".
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_stripper)
    return nltk.word_tokenize(cleaned)

# Tokenize and preprocess the sample document
tokens = preprocess_text(document)

# Build the n-gram features
def generate_ngrams(tokens, n):
    """Return the n-grams of *tokens* (runs of *n* consecutive items) as a list.

    ``nltk.util.ngrams`` yields the grams lazily; they are materialized here
    so the result can be iterated more than once.
    """
    return [*ngrams(tokens, n)]

# Count n-gram frequencies
def count_ngrams(n_grams):
    """Tally how many times each n-gram occurs in *n_grams*.

    Returns a ``collections.Counter`` keyed by n-gram.
    """
    frequencies = Counter()
    frequencies.update(n_grams)
    return frequencies

# Extract unigram, bigram and trigram features (n = 1, 2, 3)
unigrams, bigrams, trigrams = [
    generate_ngrams(tokens, size) for size in (1, 2, 3)
]

# Count how often each n-gram occurs
unigram_counts, bigram_counts, trigram_counts = [
    count_ngrams(grams) for grams in (unigrams, bigrams, trigrams)
]

# Print each frequency table under its heading, one "gram: count" line per entry
for heading, frequency_table in (
    ("Unigram特征及其频率:", unigram_counts),
    ("\nBigram特征及其频率:", bigram_counts),
    ("\nTrigram特征及其频率:", trigram_counts),
):
    print(heading)
    for gram, count in frequency_table.items():
        print(f"{gram}: {count}")

# Convert n-gram frequencies into a feature vector
def ngram_to_feature_vector(ngram_counts):
    """Map each n-gram to its count, keyed by the gram's tokens joined with spaces.

    *ngram_counts* is a mapping from n-gram (an iterable of strings) to its
    frequency; the result is a plain ``dict`` in the same iteration order.
    """
    return {
        ' '.join(gram_tokens): frequency
        for gram_tokens, frequency in ngram_counts.items()
    }

# Convert the unigram, bigram and trigram counts into feature vectors
unigram_features, bigram_features, trigram_features = map(
    ngram_to_feature_vector,
    (unigram_counts, bigram_counts, trigram_counts),
)

# Print each feature vector beneath its heading; sep="\n" puts the dict on its own line
print("\nUnigram特征向量:", unigram_features, sep="\n")

print("\nBigram特征向量:", bigram_features, sep="\n")

print("\nTrigram特征向量:", trigram_features, sep="\n")

参考

【n-gram算法】一篇文章讲透~_ngram-CSDN博客

自然语言处理NLP中的N-gram模型_.gelnx nlp-CSDN博客

[NLP复习笔记] N-gram 及基本平滑方法 - MarisaMagic - 博客园 (cnblogs.com)

  • 12
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值