import random
from nltk.corpus import movie_reviews #影评语料库
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string
# Labeled documents: one (word list, category) pair per review file.
labeled_docs = [
    (list(movie_reviews.words(fid)), cat)
    for cat in movie_reviews.categories()
    for fid in movie_reviews.fileids(cat)
]
# Fixed seed so the shuffled order is the same on every run.
random.seed(12)
random.shuffle(labeled_docs)
# print(labeled_docs[:1])

# Raw corpus: every token of every review.
review_words = movie_reviews.words()
print(len(review_words))

# Only names *from* nltk are imported in this file; the bare name `nltk` is
# never bound, so the stop-word list is reached through the imported
# `stopwords` corpus reader instead of `nltk.corpus.stopwords`.
sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)
def isStopword(word):
    """Return True if *word* is a stop word or a punctuation character.

    Membership is tested against the module-level sets ``sw`` and
    ``punctuation``; the comparison is exact, so callers are expected to
    normalise case themselves before calling.
    """
    return word in sw or word in punctuation
# Filtered corpus: lowercase each token once, then drop stop words and
# punctuation.
filtered = [w for w in map(str.lower, review_words) if not isStopword(w)]
print(len(filtered))

# Count word frequencies. N is 5% of the number of distinct words; the
# original note says the most frequent 5% of words are meant to be used as
# features.
words = FreqDist(filtered)
N = int(0.05 * len(words))
word_fe
Python情感分析:朴素贝叶斯分类NLTK(学习笔记)
最新推荐文章于 2021-11-29 17:39:03 发布