# 情感分析 (Sentiment analysis)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
# sent.py
import re

import nltk.classify as cf
import nltk.classify.util as cu
import nltk.corpus as nc
# Positive training samples: one (features, label) pair per review file in
# the 'pos' category. The features are a bag of words -- every distinct
# token that occurs in the review is mapped to True.
pdata = [
    (dict.fromkeys(nc.movie_reviews.words(pos_id), True), 'POSITIVE')
    for pos_id in nc.movie_reviews.fileids('pos')
]
# Negative training samples: one (features, label) pair per review file in
# the 'neg' category. The features are a bag of words -- every distinct
# token that occurs in the review is mapped to True.
ndata = [
    (dict.fromkeys(nc.movie_reviews.words(neg_id), True), 'NEGATIVE')
    for neg_id in nc.movie_reviews.fileids('neg')
]
# Split each class separately: the first 80% of its samples go to training
# and the remaining 20% to testing, so both splits contain the two labels in
# the same proportion as the full data.
pnumb, nnumb = int(len(pdata) * 0.8), int(len(ndata) * 0.8)
train_data = pdata[:pnumb] + ndata[:nnumb]
test_data = pdata[pnumb:] + ndata[nnumb:]

# Train exactly once; fitting a second time on the same data would only
# repeat the work and produce the same classifier.
model = cf.NaiveBayesClassifier.train(train_data)

# Score the classifier on the held-out samples.
ac = cu.accuracy(model, test_data)
print(ac)

# Print the names of the ten most informative features (the first element of
# each entry returned by the classifier).
tops = model.most_informative_features()
for top in tops[:10]:
    print(top[0])
# Hand-written sentences used to try the trained classifier out.
reviews = [
    'It is an amazing movie.',
    'This is a dull movie. I would never recommend it to anyone.',
    'The cinematography is pretty great in this move.',
    'The direction was terrible and the story was all over the place.']
sents, probs = [], []
for review in reviews:
    # The classifier only knows the tokens it was trained on, so a review
    # must be tokenized like the training corpus: lowercase, with punctuation
    # split off from the words. A plain str.split() would produce tokens such
    # as 'It' or 'movie.' that can never match a training feature.
    words = re.findall(r'\w+|[^\w\s]+', review.lower())
    feature = dict.fromkeys(words, True)
    # Take the most probable label together with its probability.
    pcls = model.prob_classify(feature)
    sent = pcls.max()
    prob = pcls.prob(sent)
    sents.append(sent)
    probs.append(prob)
# Report every review with its predicted label and confidence in percent.
for review, sent, prob in zip(reviews, sents, probs):
    print(review, '->', sent, '%.2f%%' % round(prob * 100, 2))
Sample output from running the code above:
0.735
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
anna
animators
It is an amazing movie. -> POSITIVE 63.16%
This is a dull movie. I would never recommend it to anyone. -> NEGATIVE 76.52%
The cinematography is pretty great in this move. -> POSITIVE 68.67%
The direction was terrible and the story was all over the place. -> NEGATIVE 67.03%