相关数据集
–就是判断一句话的词性,可以从几个方面来看
1、分词,将一句话分成几个词语 不同的分词方法会产生不同的效果,利用jieba分词的默认形式(即每个字在词语中会且仅会出现一次)
2、判断词语的词性,如情感词,反义词,程度副词 。并用数字标注词语的性质[word, position, score]
–反义词可以直接算为-1
–jieba分词出的情感词和情感词库不对应。情感词库太差啦
3、计算分数,情感词语之间求和。不同词语组合求积
# -*- coding: utf-8 -*-
"""
Created on Wed May 3 16:25:05 2017
http://www.jianshu.com/p/4cfcf1610a73?nomobile=yes 参考链接
#情感分析
@author: chuc
"""
from collections import defaultdict
import jieba
"""
1. 文本切割
"""
def sent2word(sentence):
    """
    Segment a sentence into a list of words with jieba.

    The custom dictionary is loaded before cutting so that domain
    words are kept as single tokens.  Stop-word filtering existed in
    an earlier version but was deliberately disabled by the author,
    so it is not performed here.

    Parameters
    ----------
    sentence : str
        Raw input sentence.

    Returns
    -------
    list of str
        The segmented words, in order.
    """
    # NOTE(review): jieba caches the user dict internally, so loading it
    # on every call is redundant but harmless; kept to preserve the
    # original call pattern.
    jieba.load_userdict("motion/dict.txt")
    return list(jieba.cut(sentence))
"""
2. 情感定位
"""
def classifyWords(wordDict):
    """
    Locate sentiment words, negation words and degree adverbs in a
    segmented sentence.

    Parameters
    ----------
    wordDict : list of str
        Words produced by sent2word; the position of each word in the
        list is used as its key in the returned dicts.

    Returns
    -------
    (senWord, notWord, degreeWord) : three dicts keyed by word index.
        senWord maps index -> sentiment score (str from the lexicon),
        notWord maps index -> -1, degreeWord maps index -> degree
        weight (str from the lexicon).
    """
    # (1) sentiment lexicon: one "word score" pair per line.
    # Original left the file handle open and kept the trailing newline
    # on the score; both fixed here.
    senDict = defaultdict()
    with open('motion/BosonNLP_sentiment_score.txt', encoding='utf-8') as f:
        for line in f:
            parts = line.split(' ')
            if len(parts) >= 2:
                senDict[parts[0]] = parts[1].strip()

    # (2) negation words, one per line.  Entries must be stripped:
    # the original stored them with '\n', so membership tests against
    # this list could never succeed.
    with open('motion/notDict.txt', encoding='utf-8') as g:
        notDic = [line.strip() for line in g if line.strip()]

    # (3) degree adverbs: one "word weight" pair per line.
    # encoding added for consistency with the other two lexicons.
    degreeDict = defaultdict()
    with open('motion/degree.txt', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                degreeDict[parts[0]] = parts[1]

    senWord = defaultdict()
    notWord = defaultdict()
    degreeWord = defaultdict()
    for t, word in enumerate(wordDict):
        if word in senDict and word not in notDic and word not in degreeDict:
            senWord[t] = senDict[word]
        elif word in notDic and word not in degreeDict:
            # Original tested `word in notDic[0]` — a substring test
            # against the first (unstripped) entry — fixed to real list
            # membership.
            notWord[t] = -1
        elif word in degreeDict:
            degreeWord[t] = degreeDict[word]
    return senWord, notWord, degreeWord
'''
计算句子分数
'''
def score(sen, no, degree, word):
    """
    Combine per-index word annotations into a single sentence score.

    Parameters
    ----------
    sen : dict
        index -> sentiment score (str or number), from classifyWords.
    no : dict
        index -> -1 for each negation word.
    degree : dict
        index -> degree-adverb weight (str or number).
    word : list
        The segmented sentence; only its length drives the scan.

    Returns
    -------
    float
        Sum of all (adjusted) sentiment scores.

    Note: `sen` is modified in place, mirroring the original design.
    """
    total = 0
    for i in range(len(word)):
        if i in no and i + 1 in sen:
            # negation directly before a sentiment word flips its sign
            sen[i + 1] = float(no[i]) * float(sen[i + 1])
        elif i in degree and i + 1 in no and i + 2 in sen:
            # degree adverb + negation + sentiment word.
            # Original read no[i] (KeyError: the condition guarantees
            # i+1 in no, not i) and multiplied the raw score string by
            # a float (TypeError); both fixed here.
            sen[i + 2] = float(no[i + 1]) * float(sen[i + 2]) * float(degree[i])
        elif i in degree and i + 1 in sen:
            # degree adverb scales the following sentiment word
            sen[i + 1] = float(degree[i]) * float(sen[i + 1])
        elif i in degree and i + 1 in degree:
            # two consecutive degree adverbs: their product is stored
            # as a score at position i (behavior kept from original;
            # the author noted the combination rules are incomplete)
            sen[i] = float(degree[i]) * float(degree[i + 1])
    # final score is the sum of every entry left in sen
    for j in sen:
        total = total + float(sen[j])
    return total
def culate(sentences):
    """Segment a sentence, annotate its words, and return the sentiment score."""
    words = sent2word(sentences)
    sen_w, not_w, deg_w = classifyWords(words)
    return score(sen_w, not_w, deg_w, words)
测试的结果不是很好。
1、句子的组合未写完全
2、结巴分词库和分类情感的词库不对应导致像(“很差”不在情感词库中,导致计算出的得分为0)
3、得分范围广,可以在训练后,归一化处理