自然语言处理NLTK篇

ci..ci

于 2024-09-28 12:14:16 发布

阅读量142

点赞数 5

文章标签：自然语言处理人工智能

本文链接：https://blog.csdn.net/qq_49786473/article/details/142613134

版权

前言

NLTK（Natural Language Toolkit）是一个用于处理人类语言数据的 Python 库。它提供了丰富的工具和资源，支持各种自然语言处理（NLP）任务，如文本分类、标记化（分词）、词性标注、命名实体识别、情感分析等。NLTK 库以英语为主；处理中文时，jieba 库的效果更好。

NLTK基础操作

# Download the corpora and pretrained models used by the examples below.
import nltk

# Each package is listed exactly once; the original list requested 'punkt'
# and 'maxent_ne_chunker' twice.
NLTK_RESOURCES = (
    'wordnet',                       # WordNet lexical database
    'stopwords',                     # stop-word lists
    'punkt',                         # Punkt sentence/word tokenizer models
    'averaged_perceptron_tagger',    # part-of-speech tagger model
    'maxent_ne_chunker',             # named-entity recognition (NER) model
    'words',                         # word-list corpus
    # NLTK >= 3.9 looks the same models up under these renamed packages;
    # without them word_tokenize / pos_tag / ne_chunk raise LookupError there.
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Look up the WordNet synsets (sets of synonyms) that contain "happy".
# `wordnet` has to be imported explicitly: `import nltk` alone does not bind
# the bare name, so the lookup below would otherwise raise NameError.
from nltk.corpus import wordnet

synonyms = wordnet.synsets('happy')
print(synonyms)
结果
[Synset('happy.a.01'), Synset('felicitous.s.02'), Synset('glad.s.02'), Synset('happy.s.04')]
#a 代表形容词（adjective），s 代表卫星形容词（adjective satellite，即依附于某个中心形容词的近义形容词），并不是副词


# nltk.tokenize: split raw text into sentences or into word-level tokens.
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Hello world. This is NLTK."
sentences, words = sent_tokenize(text), word_tokenize(text)

# Print the sentence split first, then the word tokens.
for result in (sentences, words):
    print(result)

#输出
['Hello world.', 'This is NLTK.']
['Hello', 'world', '.', 'This', 'is', 'NLTK', '.']

# nltk.stem: reduce words to their stems.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = ["running", "ran", "easily", "fairly"]
# Apply the stemmer to every word, keeping the input order.
stems = list(map(ps.stem, words))
print(stems)

#PorterStemmer 通过去掉词的后缀来简化词形，可能不会总是返回真实的单词形式，而是词干。这个过程并不总是完美，可能会导致某些词的输出不如预期
['run', 'ran', 'easili', 'fairli']

# nltk.tag: part-of-speech tagging.
from nltk import pos_tag

# "refuse" and "permit" each occur twice in this sentence, so the tagger has
# to assign a tag to each occurrence from its context.
ambiguous_sentence = "They refuse to permit us to obtain the refuse permit."
tokens = word_tokenize(ambiguous_sentence)  # word_tokenize is imported in the tokenize example
tagged = pos_tag(tokens)
print(tagged)

[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN'), ('.', '.')]


# nltk.classify: classification; a tiny Naive Bayes example.
from nltk.classify import NaiveBayesClassifier
from nltk import FreqDist

# Toy training set: (feature dict, label) pairs.
train_data = [
    ({'feature1': 1, 'feature2': 0}, 'class1'),
    ({'feature1': 0, 'feature2': 1}, 'class2'),
]
classifier = NaiveBayesClassifier.train(train_data)

# Classify a feature dict identical to the first training example.
query_features = {'feature1': 1, 'feature2': 0}
print(classifier.classify(query_features))

# nltk.cluster: clustering; split six 2-D points into two k-means clusters.
from nltk.cluster import KMeansClusterer
import numpy as np

# The six points form a grid: x in (1, 10) crossed with y in (2, 4, 0),
# generated in the same row order as a hand-written literal would list them.
data = np.array([[x, y] for x in (1, 10) for y in (2, 4, 0)])

kclusterer = KMeansClusterer(2, distance=nltk.cluster.util.cosine_distance)
# assign_clusters=True makes cluster() return one cluster index per input row.
clusters = kclusterer.cluster(data, assign_clusters=True)
print(clusters)



# nltk.chunk: chunking, e.g. named-entity recognition (NER).
from nltk import ne_chunk

sentence = "Barack Obama was the 44th president of the United States."
# Pipeline: tokenize -> POS-tag -> chunk. word_tokenize and pos_tag are the
# functions imported in the tokenize and tagging examples earlier in the file.
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
# ne_chunk consumes the (token, tag) pairs produced by pos_tag.
chunked = ne_chunk(tagged)
print(chunked)

结果
(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  the/DT
  44th/JJ
  president/NN
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.)


“United States”被识别为地缘政治实体（GPE），“Barack”和“Obama”分别被识别为人名（PERSON）。

# nltk.metrics: score a set of predictions against a gold standard.
from nltk.metrics import precision, recall

# Example outcome sets. They must be disjoint: an item is a true positive,
# a false positive or a false negative, never more than one of them.
tp = {1, 2}  # true positives: predicted and in the gold standard
fp = {3, 4}  # false positives: predicted but not in the gold standard
fn = {5}     # false negatives: in the gold standard but not predicted

# precision() and recall() take (reference, test) -- the gold-standard set
# and the predicted set -- not the tp/fp/fn sets themselves.
reference = tp | fn
test = tp | fp
prec = precision(reference, test)  # |reference & test| / |test| = 2 / 4
rec = recall(reference, test)      # |reference & test| / |reference| = 2 / 3
print(f'Precision: {prec}, Recall: {rec}')


# nltk.probability: frequency and probability distributions.
from nltk.probability import FreqDist

text = "This is a sample text with several words this is just a sample"
words = word_tokenize(text)

# Start from an empty distribution and feed it the tokens in order.
fdist = FreqDist()
fdist.update(words)

# Show the five most frequent tokens together with their counts.
print(fdist.most_common(5))
[('is', 2), ('a', 2), ('sample', 2), ('This', 1), ('text', 1)]

import ssl
import nltk
from nltk.corpus import gutenberg  # needed below; otherwise `gutenberg` is a NameError

# SECURITY NOTE: this disables HTTPS certificate verification for the whole
# process, not only for the download on the next line. Presumably it is a
# workaround for certificate errors during nltk.download(); prefer fixing the
# local certificate store and delete this line once downloads work without it.
ssl._create_default_https_context = ssl._create_unverified_context

# Download the Project Gutenberg sample corpus.
nltk.download('gutenberg')

# Pick one book from the downloaded corpus: Shakespeare's "Hamlet".
# A corpus fileid is used instead of an absolute path on one particular
# machine, so the example runs anywhere the corpus has been downloaded.
hamlet_words = gutenberg.words('shakespeare-hamlet.txt')

# Wrap the token sequence in nltk.Text to get the exploration helpers
# (similar, concordance, collocations, ...).
hamlet_text = nltk.Text(hamlet_words)

# Explore the text with the helpers provided by nltk.Text.
target_word = 'king'

# similar(): find near-synonyms by context. The words adjacent to each
# occurrence of the target form its context; other words that appear in the
# same contexts are reported as candidates. With too little text the results
# will not be as good as expected.
hamlet_text.similar(target_word)

# concordance(): show the target word in its surrounding context.
hamlet_text.concordance(target_word)

# collocations(): show word pairings found in the text.
hamlet_text.collocations()

# common_contexts(): contexts shared by all of the given words.
hamlet_text.common_contexts([target_word, 'prince'])

# Length of the text (number of tokens).
print(f'Total words in Hamlet: {len(hamlet_text)}')

# Collapse the tokens into the set of distinct words.
unique_words = set(hamlet_words)
print(f'Unique words count: {len(unique_words)}')

# Sort the distinct words and show the first ten.
sorted_unique_words = list(unique_words)
sorted_unique_words.sort()
print(sorted_unique_words[:10])
import matplotlib.pyplot as plt
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg

# Build a frequency distribution over every token of the text.
fdist = FreqDist(hamlet_words)

# Show the ten most frequent tokens with their counts.
top_ten = fdist.most_common(10)
print(top_ten)

# Draw the (non-cumulative) frequency plot, limited to 300 samples.
fdist.plot(300, cumulative=False)
plt.show()

# Draw the dispersion plot for the word "king".
hamlet_text.dispersion_plot(['king'])
plt.show()