# -*- coding: utf-8 -*-from __future__ import unicode_literals
import os
import codecs
from..import normal
from..import seg
from..classification.bayes import Bayes
# Default file used by Sentiment.load(): 'sentiment.marshal' located in the
# same directory as this module.
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'sentiment.marshal')


class Sentiment(object):
    """Two-class ('neg' / 'pos') sentiment classifier built on ``Bayes``.

    Documents are pre-processed by :meth:`handle` before they are handed to
    the underlying ``Bayes`` instance stored in ``self.classifier``.
    """

    def __init__(self):
        self.classifier = Bayes()

    def save(self, fname, iszip=True):
        """Forward ``fname`` and ``iszip`` to the underlying ``Bayes.save``."""
        self.classifier.save(fname, iszip)

    def load(self, fname=data_path, iszip=True):
        """Forward ``fname`` and ``iszip`` to the underlying ``Bayes.load``.

        ``fname`` defaults to the module-level ``data_path``.
        """
        self.classifier.load(fname, iszip)

    def handle(self, doc):
        """Pre-process one document.

        The text is segmented with ``seg.seg`` and the result is passed
        through ``normal.filter_stop`` (stop-word removal); the filtered
        result is returned.
        """
        words = seg.seg(doc)
        return normal.filter_stop(words)

    def train(self, neg_docs, pos_docs):
        """Train the underlying classifier.

        Each document in ``neg_docs`` is labelled ``'neg'`` and each document
        in ``pos_docs`` is labelled ``'pos'``. Negative samples come first in
        the list handed to ``Bayes.train``; every sample is a two-element
        list ``[handled_document, label]``.
        """
        data = [[self.handle(sent), 'neg'] for sent in neg_docs]
        data.extend([self.handle(sent), 'pos'] for sent in pos_docs)
        self.classifier.train(data)

    def classify(self, sent):
        """Return a score for ``sent`` oriented towards the ``'pos'`` label.

        The underlying classifier returns ``(label, prob)``. ``prob`` is
        returned unchanged when the label is ``'pos'``; otherwise ``1 - prob``
        is returned, so a higher value always means "more positive".
        """
        ret, prob = self.classifier.classify(self.handle(sent))
        if ret == 'pos':
            return prob
        return 1 - prob
# Module-level classifier shared by the helper functions below. It is created
# and loaded from the default data file when the module is imported.
classifier = Sentiment()
classifier.load()


def train(neg_file, pos_file):
    """Replace the module-level classifier with one trained from two files.

    Both files are opened as UTF-8 text; each line of ``neg_file`` is used as
    a negative document and each line of ``pos_file`` as a positive one.
    """
    global classifier
    # Context managers make sure both files are closed even if reading fails.
    with codecs.open(neg_file, 'r', 'utf-8') as neg_fp:
        neg_docs = neg_fp.readlines()
    with codecs.open(pos_file, 'r', 'utf-8') as pos_fp:
        pos_docs = pos_fp.readlines()
    classifier = Sentiment()
    classifier.train(neg_docs, pos_docs)


def save(fname, iszip=True):
    """Save the module-level classifier via ``Sentiment.save``."""
    classifier.save(fname, iszip)


def load(fname, iszip=True):
    """Load the module-level classifier via ``Sentiment.load``."""
    classifier.load(fname, iszip)


def classify(sent):
    """Return ``Sentiment.classify(sent)`` for the module-level classifier."""
    return classifier.classify(sent)
train:用于训练一个情感分类器
classify:用于预测的函数
handle 函数会被以上两个函数(train 和 classify)调用,主要工作是:
- 对输入文本分词
- 去除停用词
2.2. 分类核心
情感分类的基本模型是贝叶斯模型Bayes
训练的核心代码:
def train(self, data):
    """Accumulate word counts per class label from labelled samples.

    ``data`` holds both positive and negative samples. Each sample is an
    indexable pair: ``sample[0]`` is the list of words produced by
    segmentation and ``sample[1]`` is the class label.

    After the call ``self.total`` holds the sum of ``getsum()`` over every
    per-class entry in ``self.d``.
    """
    for sample in data:
        label = sample[1]
        if label not in self.d:
            # First time this label is seen: create its counter.
            self.d[label] = AddOneProb()
        for word in sample[0]:
            # Count every word of the segmented document once.
            self.d[label].add(word, 1)
    # Grand total across all classes.
    self.total = sum(self.d[label].getsum() for label in self.d)
贝叶斯模型的使用(即分类)对应贝叶斯定理计算概率的公式,对应代码如下:
def classify(self, x):
    """Return ``(label, probability)`` for the most probable class.

    ``x`` is an iterable of words. For every class ``k`` in ``self.d`` a
    log-score is computed as ``log(getsum_k) - log(self.total)`` plus the
    sum of ``log(freq_k(word))`` over the words. The scores are then
    normalised into probabilities and the class with the largest one wins.

    Returns ``(0, 0)`` when no class obtains a probability greater than
    zero (for example when ``self.d`` is empty).
    """
    scores = {}
    for k in self.d:
        # log of (count of class k / count of all classes)
        scores[k] = log(self.d[k].getsum()) - log(self.total)
        for word in x:
            # NOTE(review): freq() must return a value > 0 for log() to
            # succeed; confirm how it treats unseen words.
            scores[k] += log(self.d[k].freq(word))

    ret, prob = 0, 0
    for k in self.d:
        now = 0
        try:
            # 1 / sum_j exp(s_j - s_k) equals exp(s_k) / sum_j exp(s_j),
            # computed without exponentiating the raw log-scores.
            for otherk in self.d:
                now += exp(scores[otherk] - scores[k])
            now = 1 / now
        except OverflowError:
            # Some other class outweighs k so much that exp() overflowed;
            # treat k's probability as zero.
            now = 0
        if now > prob:
            ret, prob = k, now
    return (ret, prob)