# Web crawler
# NLP
import jieba
import numpy as np
import pymongo
from NLP.Config import *
'''
MongoDB connection
'''
# Module-level MongoDB handles, created once at import time.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they come
# from the star import of NLP.Config -- TODO confirm.
client = pymongo.MongoClient(MONGO_URL)
# Database handle used by get_comments_from_db() below.
db = client[MONGO_DB]
def get_comments_from_db(count=100):
    """Fetch the ``comment`` field of up to ``count`` documents.

    Documents are read from the ``MONGO_TABLE`` collection of the module-level
    ``db`` handle.

    Args:
        count: Maximum number of documents to read (passed to ``limit``).

    Returns:
        A list with the ``comment`` value of each document, or ``None`` when
        anything goes wrong (the exception's ``args`` are printed).
    """
    try:
        cursor = db[MONGO_TABLE].find().limit(count)
        comments = []
        for document in cursor:
            comments.append(document['comment'])
    except Exception as error:
        # Best-effort: report the failure and let the caller deal with None.
        print(error.args)
        return None
    return comments
'''
Read a .txt file into a list of lines
'''
def get_list_from_file(name=''):
    """Read ``./<name>.txt`` and return its lines as a list.

    Args:
        name: File name without the ``.txt`` extension, resolved relative to
            the current working directory.

    Returns:
        One string per line of the file, in file order, with leading and
        trailing whitespace removed. Blank lines are kept as empty strings.
    """
    file_path = './{stem}.txt'.format(stem=name)
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
'''
Word lists (lexicons) loaded from text files
'''
# Word lists, one entry per line, each read from ./<name>.txt.
stop_words = get_list_from_file(name='stopwords')
refute_words = get_list_from_file(name='refute')
nega_words = get_list_from_file(name='negative')
posi_words = get_list_from_file(name='positive')
degree_words = get_list_from_file(name='degree')

# degree_words is a single flat list in which the marker entries below
# delimit the levels: the words of a level are the entries strictly between
# its own marker and the next marker. 'last' only closes the final level.
degree_index_list = ['extreme', 'very', 'more', 'ish', 'last']
degree_dict = {
    level: degree_words[degree_words.index(level) + 1:degree_words.index(next_level)]
    for level, next_level in zip(degree_index_list, degree_index_list[1:])
}
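'''
1. Tokenize the comment (sentiment words, negation words, degree words/marks)
2. Compute the sum, mean and standard deviation of the comment's sentiment values
'''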
def _modifier_factor(token):
    """Return the multiplier for a degree or negation word, else ``None``."""
    if token in degree_dict['extreme']:
        return EXTREME_VALUE
    if token in degree_dict['very']:
        return VERY_VALUE
    if token in degree_dict['more']:
        return MORE_VALUE
    if token in degree_dict['ish']:
        return ISH_VALUE
    if token in refute_words:
        return REFUTE_VALUE
    return None


def sentiment_value(**kwargs):
    """Score one comment with the lexicon-based sentiment model.

    The comment is tokenized with jieba and stop words are dropped. Every
    token found in ``posi_words`` / ``nega_words`` starts from ``POSI_VALUE`` /
    ``NEGA_VALUE`` and is multiplied by the factor of each degree or negation
    word that sits between the previous sentiment word and itself.

    Keyword Args:
        comment: The comment text to score (required).

    Returns:
        A dict with keys ``'sum'``, ``'avg'`` and ``'std'`` computed with numpy
        over the per-word sentiment values, or ``None`` when the comment
        contains no sentiment word.

    Raises:
        KeyError: If ``comment`` is not passed.
    """
    comment = kwargs['comment']
    words = [w.strip() for w in jieba.cut(comment, cut_all=False) if w not in stop_words]
    print(words)

    sent_value_list = []
    # Start of the window of modifier words belonging to the next sentiment word.
    pre_index = 0
    # enumerate() gives the real position of each token. Looking it up with
    # words.index(word) returned the FIRST occurrence, so a repeated sentiment
    # word lost its own modifiers and moved the window backwards.
    for index, word in enumerate(words):
        if word in posi_words:
            seg_sent_value = POSI_VALUE
        elif word in nega_words:
            seg_sent_value = NEGA_VALUE
        else:
            continue
        if seg_sent_value == 0:
            # A zero base value carries no sentiment; treat the word as neutral.
            continue

        for modifier in words[pre_index:index]:
            factor = _modifier_factor(modifier)
            if factor is not None:
                seg_sent_value *= factor

        pre_index = index + 1
        sent_value_list.append(seg_sent_value)

    if not sent_value_list:
        return None

    arr = np.array(sent_value_list)
    print(arr)
    return {
        'sum': arr.sum(),
        'avg': arr.mean(),
        'std': arr.std(),
    }
def run():
    """Print the sentiment statistics of every comment fetched from the database."""
    # get_comments_from_db() returns None when the query fails; iterating None
    # would raise TypeError, so treat a failed fetch as "no comments".
    comments = get_comments_from_db() or []
    for comment in comments:
        print(sentiment_value(comment=comment))


if __name__ == '__main__':
    run()
# Link: https://pan.baidu.com/s/1kVn4Hx1  Password: y4xa