# 文件下载地址:
# 链接: https://pan.baidu.com/s/1WeEyUKfrYoaZNd-jpl_UCA 提取码: ge7a
"""
电影评论 情感分析
"""
import re

import nltk.classify as cf
import nltk.classify.util as cu
import nltk.corpus as nc
# Load the positive samples.
# Each review becomes a (features, label) pair where the features are a
# bag-of-words dict mapping every word that occurs in the review to True.
pos_data = [
    (dict.fromkeys(nc.movie_reviews.words(review_id), True), "POSITIVE")
    for review_id in nc.movie_reviews.fileids("pos")
]
print(len(pos_data))
# Load the negative samples.
# Each review becomes a (features, label) pair where the features are a
# bag-of-words dict mapping every word that occurs in the review to True.
# The label is spelled "NEGATIVE" (it was previously misspelled "NEGTIVE");
# this string is what the classifier returns and what gets printed.
neg_data = [
    (dict.fromkeys(nc.movie_reviews.words(review_id), True), "NEGATIVE")
    for review_id in nc.movie_reviews.fileids("neg")
]
# Split the dataset: the first 80% of each class is used for training and
# the remaining 20% for testing.
pnumb = int(len(pos_data) * 0.8)
nnumb = int(len(neg_data) * 0.8)
train_data = [*pos_data[:pnumb], *neg_data[:nnumb]]
test_data = [*pos_data[pnumb:], *neg_data[nnumb:]]

# Build the model: a Naive Bayes classifier trained on the training split.
model = cf.NaiveBayesClassifier.train(train_data)

# Measure and report the accuracy on the held-out test split.
acc = cu.accuracy(model, test_data)
print(acc)
# Simulated business scenario: classify a few hand-written reviews.
reviews = [
    'It is an amazing movie.',
    'This is a dull movie. I would never recommend it to anyone.',
    'The cinematography is pretty great in this movie.',
    'The direction was terrible and the story was all over the place.',
]
for review in reviews:
    # The model was trained on the corpus reader's word tokens, which are
    # lower-case with punctuation split off from words. Tokenize the review
    # the same way; a plain str.split() yields tokens such as 'It' or
    # 'movie.' that match no training feature and are therefore ignored by
    # the classifier.
    words = re.findall(r"\w+|[^\w\s]+", review.lower())
    sample = dict.fromkeys(words, True)
    pcls = model.classify(sample)
    print(review, '->', pcls)