对知乎数据进行情感分析

最新推荐文章于 2023-12-31 01:10:32 发布

数据科学真奇妙呀真奇妙

最新推荐文章于 2023-12-31 01:10:32 发布

阅读量736

点赞数 1

本文链接：https://blog.csdn.net/weixin_46660582/article/details/106108514

版权

关于编码问题：出现 ‘utf-8’ codec can’t decode byte 0xb7 in position 0: invalid start byte 这一报错时，可参见这篇博文：https://www.cnblogs.com/xiaolan-Lin/p/11653432.html
代码来自于这篇博文：https://blog.csdn.net/lom9357bye/article/details/79058946?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522158937528619724839264541%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fall.57674%2522%257D&request_id=158937528619724839264541&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2_allfirst_rank_v2~rank_v25-2-79058946.nonecase&utm_term=%E5%9F%BA%E4%BA%8E%E6%83%85%E6%84%9F%E8%AF%8D%E5%85%B8

程度词需要自己标注权重：可以把 txt 文件的内容复制到 Excel，新建一列填入权重后自动填充（原文此处有配图）。下面的代码按“词,权重”的格式逐行读取程度词文件。

词典数据来自于这里：https://download.csdn.net/download/siyanyu/9880970

词典是从网上找来下载的，需要的话可以从网盘获取，链接：https://pan.baidu.com/s/1oVXtjtD9pTLP5Y277He-IQ
提取码：6h98

最后附上代码：

# -*- coding: utf-8 -*-
"""
Created on Wed May 13 21:10:58 2020

1、情感词典：BosonNLP情感词典是从微博、新闻、论坛等数据来源的上百万篇情感标注数据当中
自动构建的情感极性词典。因为标注包括微博数据，该词典囊括了很多网络用语及非正式简称，
对非规范文本也有较高的覆盖率。该情感词典可以用于构建社交媒体情感分析引擎，负面内容发现等应用。
2、停用词
3、否定词
4、程度副词


@author: 谢蕾
"""


from collections import defaultdict
import os
import re
import jieba
import codecs
import xlrd
import datetime
from xlrd import xldate_as_tuple
from openpyxl import Workbook

def handle_date(date):
    """Convert an Excel serial date number into a date/time tuple.

    Args:
        date: Excel serial date value, as read from a date cell.

    Returns:
        The tuple produced by ``xldate_as_tuple(date, 0)``; datemode ``0``
        selects the 1900-based date system. Per the xlrd documentation the
        tuple is ``(year, month, day, hour, minute, second)``.
    """
    return xldate_as_tuple(date, 0)
 
def seg_word(sentence):
    """Segment a sentence with jieba and drop stop words.

    Args:
        sentence: Text to segment.

    Returns:
        list: Tokens yielded by ``jieba.cut`` in their original order,
        without any token listed in ``停用词.txt``.

    Note:
        The stop-word file is looked up relative to the current working
        directory and is re-read on every call.
    """
    # Segment first, exactly as before, so the call order is unchanged.
    tokens = list(jieba.cut(sentence))
    # The context manager closes the file even if reading a line fails;
    # the original leaked the handle in that case.
    with codecs.open('停用词.txt', 'r', 'utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}
    return [token for token in tokens if token not in stopwords]
 
def classify_words(word_dict):
    """Split segmented words into sentiment, negation and degree words.

    The three lexicon files are read from the current working directory on
    every call.

    Args:
        word_dict: Mapping of word -> position index (see ``list_to_dict``).

    Returns:
        tuple: ``(sen_word, not_word, degree_word)``, each a dict keyed by
        the word's position index.

        * ``sen_word`` maps to the score text from the sentiment lexicon.
        * ``not_word`` maps every negation position to ``-1``.
        * ``degree_word`` maps to the weight text from the degree lexicon.

        Scores and weights stay strings (surrounding whitespace removed);
        callers convert them with ``float``.

    A word found in several lexicons is classified with the priority
    degree word > negation word > sentiment word.
    """
    # Sentiment lexicon: one "<word> <score>" entry per line. Blank or
    # malformed lines are skipped instead of raising IndexError.
    sen_dict = {}
    with open('情感词1.0.txt', 'r', encoding='utf-8') as sen_file:
        for line in sen_file:
            fields = line.split()
            if len(fields) < 2:
                continue
            sen_dict[fields[0]] = fields[1]

    # Negation lexicon: one word per line. Lines must be stripped, otherwise
    # the trailing newline kept by the file iterator prevents any match.
    # NOTE(review): 'unicode_escape' is kept from the original. It only gives
    # readable words for ASCII / \uXXXX-escaped files; a UTF-8 or GBK file
    # would decode to mojibake and never match -- confirm the real encoding.
    with open('否定词.txt', 'r', encoding='unicode_escape') as not_word_file:
        not_words = {line.strip() for line in not_word_file}
    not_words.discard('')

    # Degree lexicon: one "<word>,<weight>" entry per line.
    degree_dic = {}
    with open('程度词.txt', 'r', encoding='utf-8') as degree_file:
        for line in degree_file:
            fields = line.split(',')
            if len(fields) < 2:
                continue
            degree_dic[fields[0].strip()] = fields[1].strip()

    sen_word = {}
    not_word = {}
    degree_word = {}
    for word, index in word_dict.items():
        if word in degree_dic:
            degree_word[index] = degree_dic[word]
        elif word in not_words:
            not_word[index] = -1
        elif word in sen_dict:
            sen_word[index] = sen_dict[word]
    return sen_word, not_word, degree_word
 
def list_to_dict(word_list):
    """Map each word of a segmented list to its position in that list.

    Args:
        word_list: Segmented words in document order.

    Returns:
        dict: word -> index. When a word occurs more than once, the index of
        its last occurrence is kept.
    """
    return {word: position for position, word in enumerate(word_list)}
 
def get_init_weight(sen_word, not_word, degree_word):
    """Compute the weight accumulated before the first sentiment word.

    Args:
        sen_word: dict of position -> sentiment score.
        not_word: dict of position -> -1 for negation words.
        degree_word: dict of position -> degree weight (converted via float).

    Returns:
        ``1`` when there is no sentiment word. Otherwise the product of the
        modifiers found at positions ``0 .. k-1``, where ``k`` is the first
        key of ``sen_word``: each negation flips the sign and each degree
        word multiplies by its weight.
    """
    weight = 1
    if not sen_word:
        return weight
    # First key in insertion order, same as list(sen_word.keys())[0].
    first_sentiment_pos = next(iter(sen_word))
    for pos in range(first_sentiment_pos):
        if pos in not_word:
            weight *= -1
        elif pos in degree_word:
            weight *= float(degree_word[pos])
    return weight
 
def socre_sentiment(sen_word, not_word, degree_word, seg_result):
    """Compute the sentiment score of a segmented sentence.

    Positions are scanned from left to right. Degree words and negation
    words multiply a running weight; each sentiment word adds
    ``weight * score`` to the total and resets the weight to 1. Modifiers
    after the last sentiment word do not affect the result.

    Args:
        sen_word: dict of position -> sentiment score (number or numeric str).
        not_word: dict of position -> -1 for negation words.
        degree_word: dict of position -> degree weight (number or numeric str).
        seg_result: Segmented word list; only its length is used.

    Returns:
        The accumulated score; ``0`` when no sentiment word is present.
    """
    weight = 1
    score = 0
    for i in range(len(seg_result)):
        if i in degree_word:
            # Convert before multiplying: the weight may arrive as text, and
            # int * str would repeat the string instead of scaling.
            weight *= float(degree_word[i])
        elif i in not_word:
            weight *= -1
        elif i in sen_word:
            score += weight * float(sen_word[i])
            weight = 1
    return score
 
# 计算得分
def setiment_score(sententce):
    """Return the sentiment score of a piece of text.

    Args:
        sententce: Raw text to analyse.

    Returns:
        The value computed by ``socre_sentiment`` for the segmented text.
    """
    # Step 1: segment the text and remove stop words.
    tokens = seg_word(sententce)
    # Step 2: index the tokens by position and classify them.
    positions = list_to_dict(tokens)
    sentiment_words, negation_words, degree_words = classify_words(positions)
    # Step 3: combine the classified words into one score.
    return socre_sentiment(sentiment_words, negation_words, degree_words, tokens)
 
# # 测试
# print(setiment_score("我今天心情很糟糕也不开心"))

# Output workbook: one row per answer, holding (creation date, score).
workbook = Workbook()
result_sheet = workbook.active
result_sheet.title = "随便"

sheets = xlrd.open_workbook(r'D:\answer.xls').sheets()

# Only the first sheet is processed for now (trial run).
for i in range(1):
    sheet = sheets[i]
    # Row 0 is skipped -- presumably a header row; verify against the workbook.
    for j in range(1, sheet.nrows):
        # Column 3 is read as an Excel serial date; only (year, month, day)
        # is kept, so the timestamp is truncated to day precision.
        create_time = datetime.datetime(*(handle_date(sheet.cell_value(j, 3))[:3]))
        # Read the answer text from the current row j. The original indexed
        # with the sheet counter i, so every row scored the text in row 0.
        answer = sheet.cell_value(j, 4)
        result_sheet.append([create_time, setiment_score(answer)])

workbook.save("情感数据.xlsx")

数据科学真奇妙呀真奇妙

关注

1
点赞
踩
8

收藏

觉得还不错? 一键收藏
2
评论
对知乎数据进行情感分析

关于编码问题：‘utf-8’ codec can’t decode byte 0xb7 in position 0: invalid start byte参见这篇博文：https://www.cnblogs.com/xiaolan-Lin/p/11653432.html代码来自于这篇博文：https://blog.csdn.net/lom9357bye/article/details/79058946?ops_request_misc=%257B%2522request%255Fid%2522%253A%
复制链接

扫一扫