import nltk
import string
import random
# English stopword set from NLTK (requires the 'stopwords' corpus to be downloaded);
# read by isStopword() below.
sw=set(nltk.corpus.stopwords.words('english'))# load English stopwords
# All ASCII punctuation characters; treated the same as stopwords when labeling.
punctuation=set(string.punctuation)# load punctuation characters
def word_features(word):
    """Return the feature dict for *word*: its length is the only feature.

    The original placed this description in a bare string *before* the
    function, which is a no-op statement rather than a docstring.
    """
    return {'len': len(word)}
def isStopword(word):
    """Return True if *word* is an English stopword or a punctuation mark.

    Relies on the module-level sets ``sw`` (NLTK English stopwords) and
    ``punctuation`` (``string.punctuation``) defined above.
    """
    return word in sw or word in punctuation
# Load the Gutenberg corpus and the words of Shakespeare's "Julius Caesar".
gb = nltk.corpus.gutenberg
words = gb.words("shakespeare-caesar.txt")

# Label each lower-cased word with whether it is a stopword/punctuation (True/False).
labeled_words = [(word.lower(), isStopword(word.lower())) for word in words]
random.seed(1)  # fixed seed so the shuffle (and hence the split) is reproducible
random.shuffle(labeled_words)
#print(labeled_words[:5])

# Extract (feature_dict, label) pairs for the classifier.
# NOTE: the original unpacked these as (n, word), where ``n`` was actually the
# word and ``word`` the label — renamed here to say what they are.
features = [(word_features(word), label) for (word, label) in labeled_words]

# 90% training set, 10% held-out test set.
cutoff = int(0.9 * len(features))
train_set, test_set = features[:cutoff], features[cutoff:]

# Train a Naive Bayes classifier on the word-length feature.
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Spot-check: punctuation should classify as True, a real word as False.
print("':' class ", classifier.classify(word_features(':')))
print("'fiue' class ", classifier.classify(word_features('fiue')))

# Accuracy of the trained model on the test set.
print(nltk.classify.accuracy(classifier, test_set))

# Features that best separate the two classes.
# show_most_informative_features() prints internally and returns None, so the
# original ``print(...)`` wrapper emitted a spurious trailing "None" — removed.
classifier.show_most_informative_features()
'''
结果:
':' class True
'fiue' class False
0.836687306501548
Most Informative Features
len = 7 False : True = 53.8 : 1.0
len = 6 False : True = 47.6 : 1.0
len = 1 True : False = 12.0 : 1.0
len = 2 True : False = 11.1 : 1.0
len = 5 False : True = 10.1 : 1.0
len = 4 False : True = 2.0 : 1.0
len = 3 True : False = 1.9 : 1.0
根据结果得出：":" 分类为 True，"fiue" 分类为 False。利用训练集训练的模型，应用到测试集上时准确率约为 83.7%。最后输出最能区分类别的特征值——例如，当单词长度为 7 时，被划分为 False 的可能性是 True 的 53.8 倍。
'''