数据挖掘情感分析python_nltk_28Twitter情感分析模型

895802-20180417215535185-501042650.png

生成Twitter情感分析的模型,并保存数据为pickle,此过程可能要一个小时,所以下次调用数据就很简单了

# -*- coding: utf-8 -*-

"""

Created on Thu Jan 12 10:44:19 2017

@author: Administrator

用于短评论分析-- Twitter

保存后的"positive.txt","negative.txt"需要转码为utf-8

在线转码网址

http://www.esk365.com/tools/GB2312-UTF8.asp

features=5000,准确率百分之60以上

features=10000,准确率百分之 以上

运行时间可能长达一个小时

"""

import nltk

import random

import pickle

from nltk.tokenize import word_tokenize

# -- Build the labelled corpus and the 5k-word feature vocabulary --

# Each document is a (review_line, label) pair; all_words collects every
# allowed-POS token so the most frequent ones become the feature words.
documents = []
all_words = []

# Penn Treebank tag initials: "J" = adjective, "R" = adverb, "V" = verb.
# Adjectives alone are used here; add "R"/"V" for a richer (slower) model.
allowed_word_types = ["J"]

def _collect(raw_text, label):
    """Append each line of *raw_text* to documents under *label* and harvest
    its allowed-POS words (lower-cased) into all_words."""
    for line in raw_text.split('\n'):
        documents.append((line, label))
        for word, tag in nltk.pos_tag(word_tokenize(line)):
            if tag[0] in allowed_word_types:
                all_words.append(word.lower())

# The review files must already be UTF-8 (see the header note); pass the
# encoding explicitly instead of relying on the platform default, and use
# context managers so the handles are closed.
with open("positive.txt", "r", encoding="utf-8") as f:
    short_pos = f.read()
with open("negative.txt", "r", encoding="utf-8") as f:
    short_neg = f.read()

# BUG FIX: the original appended every review line twice — once in a plain
# loop and again in the POS-tagging loop ("# move this up here" marked the
# leftover).  Collect each corpus exactly once.
_collect(short_pos, "pos")
_collect(short_neg, "neg")

# Persist the labelled documents for sentiment_mod.py.
with open("pickled_algos/documents.pickle", "wb") as save_documents:
    pickle.dump(documents, save_documents)

# BUG FIX: list(freq_dist.keys())[:5000] picked an *arbitrary* 5000 words;
# most_common(5000) picks the 5000 highest-frequency words, which is the
# stated intent of the feature cut-off.  (Raising the cut-off above 20k
# was suggested in the original comments.)
all_words = nltk.FreqDist(all_words)
word_features = [w for w, _count in all_words.most_common(5000)]

with open("pickled_algos/word_features5k.pickle", "wb") as save_word_features:
    pickle.dump(word_features, save_word_features)

def find_features(document):
    """Return a {feature_word: bool} dict marking which of the feature
    vocabulary words (module-global ``word_features``) occur in *document*."""
    # PERF: membership test against a set is O(1); the original tested
    # against the token list, i.e. one linear scan per feature word
    # (5000 scans per document).
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}

# -- Build feature sets, train, evaluate and persist the classifier --

featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)
print(len(featuresets))

# CONSISTENCY FIX: sentiment_mod.py loads pickled_algos/featuresets.pickle,
# but the original training script never saved it — persist the shuffled
# feature sets here so the consumer script actually works.
with open("pickled_algos/featuresets.pickle", "wb") as save_featuresets:
    pickle.dump(featuresets, save_featuresets)

# First 10k shuffled examples train; the remainder is held out for testing.
training_set = featuresets[:10000]
testing_set = featuresets[10000:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

# Persist the trained classifier so the ~1 hour training run is reusable.
with open("pickled_algos/originalnaivebayes5k.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

sentiment_mod.py

# -*- coding: utf-8 -*-

"""

Created on Thu Jan 12 16:47:51 2017

@author: Administrator

"""

#File: sentiment_mod.py

import nltk

import random

import pickle

from nltk.tokenize import word_tokenize

# Load the labelled documents and the 5k-word feature vocabulary produced
# by the training script.  Context managers replace the manual open/close
# pairs so the handles are released even if unpickling raises.
with open("pickled_algos/documents.pickle", "rb") as documents_f:
    documents = pickle.load(documents_f)

with open("pickled_algos/word_features5k.pickle", "rb") as word_features5k_f:
    word_features = pickle.load(word_features5k_f)

def find_features(document):
    """Return a {feature_word: bool} dict marking which of the feature
    vocabulary words (module-global ``word_features``) occur in *document*."""
    # PERF: set membership is O(1); the original scanned the token list
    # once per feature word (5000 linear scans per call).
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}

# Load the pre-built feature sets saved by the training script.
# NOTE(review): the shuffle and train/test split below are kept only for
# parity with the training script — sentiment() itself uses just the
# pickled classifier; confirm before removing.
with open("pickled_algos/featuresets.pickle", "rb") as featuresets_f:
    featuresets = pickle.load(featuresets_f)

random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
training_set = featuresets[:10000]

# Load the Naive Bayes classifier trained and pickled by the model script.
with open("pickled_algos/originalnaivebayes5k.pickle", "rb") as open_file:
    classifier = pickle.load(open_file)

def sentiment(text):
    """Classify *text* with the loaded Naive Bayes model and return its
    label (the training labels are "pos" / "neg")."""
    feats = find_features(text)
    return classifier.classify(feats)

测试

# -*- coding: utf-8 -*-

"""

Created on Thu Jan 12 16:50:12 2017

@author: Administrator

"""

import sentiment_mod as s

# Two smoke-test reviews: one clearly positive, one clearly negative.
positive_review = "This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"
negative_review = "This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"

print(s.sentiment(positive_review))
print(s.sentiment(negative_review))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值