前言
NLTK(Natural Language Toolkit)是一个用于处理人类语言数据的Python库。它提供了丰富的工具和资源,支持各种自然语言处理(NLP)任务,如文本分类、标记化、词性标注、命名实体识别、情感分析等。NLTK库以英语为主;对于中文的自然语言处理,jieba库的效果更好。
NLTK基础操作
# Download the corpora and models used by the examples in this file, then
# look up WordNet synsets for one word.
import nltk
from nltk.corpus import wordnet

# Each resource is listed exactly once.
for resource in (
    'wordnet',                      # WordNet corpus
    'stopwords',                    # stop-word lists
    'punkt',                        # tokenizer models
    'averaged_perceptron_tagger',   # part-of-speech tagger model
    'maxent_ne_chunker',            # named-entity recognition (NER) model
    'words',                        # word list
    # NOTE(review): newer NLTK releases (3.9+) look for these renamed
    # packages instead of the ones above - confirm against the installed
    # version and drop whichever set is not needed.
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
):
    nltk.download(resource)

# Fetch the synsets (synonym sets) that contain the word "happy".
synonyms = wordnet.synsets('happy')
print(synonyms)
结果
[Synset('happy.a.01'), Synset('felicitous.s.02'), Synset('glad.s.02'), Synset('happy.s.04')]
#a代表形容词(adjective),s代表卫星形容词(adjective satellite)
# nltk.tokenize splits text into sentences or into word-level tokens.
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Hello world. This is NLTK."

# Sentence split and word split of the same text, printed one per line.
sentences, words = sent_tokenize(text), word_tokenize(text)
for tokenized in (sentences, words):
    print(tokenized)
#输出
['Hello world.', 'This is NLTK.']
['Hello', 'world', '.', 'This', 'is', 'NLTK', '.']
# nltk.stem is used for stemming (reducing words to their stems).
from nltk.stem import PorterStemmer

ps = PorterStemmer()
words = ["running", "ran", "easily", "fairly"]

# Apply the Porter stemmer to every word, keeping the input order.
stems = list(map(ps.stem, words))
print(stems)
#PorterStemmer 通过去掉词的后缀来简化词形,可能不会总是返回真实的单词形式,而是词干。这个过程并不总是完美,可能会导致某些词的输出不如预期
['run', 'ran', 'easili', 'fairli']
# nltk.tag is used for part-of-speech tagging.
from nltk import pos_tag

# "refuse" and "permit" each occur twice in this sentence.
pos_demo_sentence = "They refuse to permit us to obtain the refuse permit."

# Tokenize first, then tag every token.
tokens = word_tokenize(pos_demo_sentence)
tagged = pos_tag(tokens)
print(tagged)
[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN'), ('.', '.')]
# nltk.classify is used for classification; this trains a Naive Bayes
# classifier on a tiny hand-written data set.
from nltk.classify import NaiveBayesClassifier
from nltk import FreqDist

# Example data: (feature dict, label) pairs.
train_data = [
    (dict(feature1=1, feature2=0), 'class1'),
    (dict(feature1=0, feature2=1), 'class2'),
]
classifier = NaiveBayesClassifier.train(train_data)

# Classify a sample whose features match the first training example.
predicted_label = classifier.classify(dict(feature1=1, feature2=0))
print(predicted_label)
# nltk.cluster is used for clustering; this runs k-means with two clusters.
from nltk.cluster import KMeansClusterer
import numpy as np

# Six 2-D points, one per row.
data = np.array([[1, 2], [1, 4], [1, 0],
                 [10, 2], [10, 4], [10, 0]])

# Cosine distance compares direction only, so [1, 0] and [10, 0] look the
# same to it. If both are drawn as the random initial means, one cluster can
# end up empty, and KMeansClusterer aborts on an empty cluster unless
# avoid_empty_clusters is set. Several random restarts (repeats) also make
# the result depend less on a single initial draw.
kclusterer = KMeansClusterer(
    2,
    distance=nltk.cluster.util.cosine_distance,
    repeats=10,
    avoid_empty_clusters=True,
)

# assign_clusters=True returns the cluster index of every input row.
clusters = kclusterer.cluster(data, assign_clusters=True)
print(clusters)
# nltk.chunk is used for chunking, e.g. named-entity recognition.
from nltk import ne_chunk

sentence = "Barack Obama was the 44th president of the United States."

# Pipeline: tokenize -> part-of-speech tag -> named-entity chunk.
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
chunked = ne_chunk(tagged)

print(chunked)
结果
(S
(PERSON Barack/NNP)
(PERSON Obama/NNP)
was/VBD
the/DT
44th/JJ
president/NN
of/IN
the/DT
(GPE United/NNP States/NNPS)
./.)
United States 被识别为地缘政治实体(GPE),Barack 和 Obama 被识别为人名(PERSON)。
# nltk.metrics is used for evaluation and comparison.
from nltk.metrics import precision, recall

# precision() and recall() both take (reference, test): the set of
# gold-standard items and the set of predicted items. They do not accept
# true-positive / false-positive / false-negative sets directly, so those
# sets are combined into reference and predicted sets first.
tp = {1, 2}  # true positives: predicted and in the reference
fp = {3, 4}  # false positives: predicted but not in the reference
fn = {5}     # false negatives: in the reference but not predicted

reference = tp | fn  # everything that is actually positive
predicted = tp | fp  # everything that was labelled positive

prec = precision(reference, predicted)  # |reference & predicted| / |predicted|
rec = recall(reference, predicted)      # |reference & predicted| / |reference|
print(f'Precision: {prec}, Recall: {rec}')
# nltk.probability is used for frequency / probability distributions.
from nltk.probability import FreqDist

text = "This is a sample text with several words this is just a sample"

# Count how often each token occurs in the tokenized text.
words = word_tokenize(text)
fdist = FreqDist(words)

# Print the 5 most common tokens with their counts.
top_five = fdist.most_common(5)
print(top_five)
[('is', 2), ('a', 2), ('sample', 2), ('This', 1), ('text', 1)]
import ssl
import nltk
from nltk.corpus import gutenberg

# SECURITY NOTE: this disables HTTPS certificate verification for the whole
# process, not just for the NLTK download. Presumably a workaround for
# certificate errors during nltk.download(); prefer fixing the local
# certificate store and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

# Download the Gutenberg corpus.
nltk.download('gutenberg')

# Pick one book from the corpus: Shakespeare's "Hamlet". The reader is given
# the corpus file id rather than a path on one particular machine.
hamlet_words = gutenberg.words('shakespeare-hamlet.txt')

# Convert the token sequence into an nltk.Text object.
hamlet_text = nltk.Text(hamlet_words)
# Show the first ten tokens of the text.
print(hamlet_text[:10])

# Find words used similarly to "king".
# How similar() works: the words next to each occurrence of "king" form its
# context, and other words that appear in the same contexts are proposed as
# candidates. If the text is too small, the results may be disappointing.
hamlet_text.similar('king')

# Show "king" in its surrounding context (concordance view).
hamlet_text.concordance('king')

# Show collocations found in the text.
hamlet_text.collocations()

# Show the contexts shared by "king" and "prince".
hamlet_text.common_contexts(['king', 'prince'])

# Length of the text in tokens.
print(f'Total words in Hamlet: {len(hamlet_text)}')

# Set of distinct tokens.
unique_words = set(hamlet_words)
print(f'Unique words count: {len(unique_words)}')

# Distinct tokens in sorted order; show the first ten.
sorted_unique_words = sorted(unique_words)
print(sorted_unique_words[:10])
from nltk import FreqDist

# Build the frequency distribution over all tokens of the text.
fdist = FreqDist(hamlet_words)

# Print the 10 most common tokens with their counts.
top_ten = fdist.most_common(10)
print(top_ten)
import nltk
from nltk.corpus import gutenberg
from nltk import FreqDist
import matplotlib.pyplot as plt
# Plot the frequency distribution (non-cumulative) for 300 samples.
plot_sample_count = 300
fdist.plot(plot_sample_count, cumulative=False)
plt.show()

# Plot the dispersion of the word "king" across the text.
dispersion_targets = ['king']
hamlet_text.dispersion_plot(dispersion_targets)
plt.show()