Build the Twitter sentiment-analysis model and save the data as pickles. This step can take up to an hour, so loading the data on later runs becomes trivial.
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 10:44:19 2017
@author: Administrator
For short-review sentiment analysis -- Twitter
The saved "positive.txt" and "negative.txt" must be re-encoded as UTF-8.
Online conversion tool:
http://www.esk365.com/tools/GB2312-UTF8.asp
features=5000: accuracy above 60%
features=10000: accuracy above  %
Runtime can be as long as an hour.
"""
import nltk
import random
import pickle
from nltk.tokenize import word_tokenize
# the review files are UTF-8 (see the note above); errors="ignore" skips stray bytes
short_pos = open("positive.txt", "r", encoding="utf-8", errors="ignore").read()
short_neg = open("negative.txt", "r", encoding="utf-8", errors="ignore").read()
# initialize the containers before both labeling passes below
documents = []
all_words = []
# J is adjective, R is adverb, and V is verb
#allowed_word_types = ["J","R","V"]  # allow all three word-type categories
allowed_word_types = ["J"]  # adjectives only
for p in short_pos.split('\n'):
    documents.append( (p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
for p in short_neg.split('\n'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
# save the documents
save_documents = open("pickled_algos/documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()
# save the word features
all_words = nltk.FreqDist(all_words)
# keep the 5,000 most frequent words (20,000+ would be better);
# most_common() guarantees the top-frequency words, whereas plain .keys() order is arbitrary
word_features = [w for (w, _) in all_words.most_common(5000)]
save_word_features = open("pickled_algos/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
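# for illustration: find_features("great movie") returns one boolean per
# feature word, e.g. {"great": True, ...}, with False for every feature
# word the text does not contain (assuming "great" made the top 5,000)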
featuresets = [(find_features(rev), category) for (rev, category) in documents]
# sentiment_mod.py below loads this pickle, so save it here
with open("pickled_algos/featuresets.pickle", "wb") as save_featuresets:
    pickle.dump(featuresets, save_featuresets)
random.shuffle(featuresets)
print(len(featuresets))
testing_set = featuresets[10000:]
training_set = featuresets[:10000]
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
# save the classifier
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
sentiment_mod.py
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 16:47:51 2017
@author: Administrator
"""
#File: sentiment_mod.py
import nltk
import random
import pickle
from nltk.tokenize import word_tokenize
documents_f = open("pickled_algos/documents.pickle", "rb")
documents = pickle.load(documents_f)
documents_f.close()
word_features5k_f = open("pickled_algos/word_features5k.pickle", "rb")
word_features = pickle.load(word_features5k_f)
word_features5k_f.close()
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
featuresets_f = open("pickled_algos/featuresets.pickle", "rb")
featuresets = pickle.load(featuresets_f)
featuresets_f.close()
random.shuffle(featuresets)
print(len(featuresets))
testing_set = featuresets[10000:]
training_set = featuresets[:10000]
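# note: the featuresets split above is kept only for reference -- the
# sentiment() function below needs just word_features and the classifier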
open_file = open("pickled_algos/originalnaivebayes5k.pickle", "rb")
classifier = pickle.load(open_file)
open_file.close()
def sentiment(text):
    feats = find_features(text)
    return classifier.classify(feats)
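If you also want a confidence score with the label, NLTK's NaiveBayesClassifier supports prob_classify; a minimal sketch (the sentiment_confidence name is my own addition):

def sentiment_confidence(text):
    # probability distribution over the labels "pos" / "neg"
    dist = classifier.prob_classify(find_features(text))
    label = dist.max()  # most probable label
    return label, dist.prob(label)  # e.g. ("pos", 0.87)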
Test
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 12 16:50:12 2017
@author: Administrator
"""
import sentiment_mod as s
print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))