情感分析方法与实践

11.11.1

于 2024-07-18 19:53:06 发布

阅读量177

点赞数 2

分类专栏：文本挖掘文章标签： python

本文链接：https://blog.csdn.net/m0_55885128/article/details/140531526

版权

文本挖掘专栏收录该内容

5 篇文章 0 订阅

订阅专栏

第1关：情感分析的基本方法

1\AD

2\BCD

第2关：基于情感词典的情感分析实战

任务描述

本关任务：根据本关所学有关情感分析的知识，编写基于情感词典进行情感分析的程序，并通过所有测试用例。

相关知识

为了完成本关任务，你需要掌握：

基于情感词典的情感分析算法思想；
基于情感词典的情感分析算法实现流程。

from collections import defaultdict
import jieba
import codecs
def seg_word(sentence): # 使用jieba对文档分词
    seg_list = jieba.cut(sentence)
    seg_result = []
    for w in seg_list:
        seg_result.append(w)
# 读取停用词文件
    stopwords = set()
    fr = codecs.open('./stopword.txt', 'r', 'utf-8')
    for word in fr:
        stopwords.add(word.strip())
    fr.close()
    # 去除停用词
    return list(filter(lambda x: x not in stopwords, seg_result))
def classify_words(word_dict): # 词语分类,找出情感词、否定词、程度副词
    # 读取情感字典文件
    sen_file = open('./sentiment_score.txt', 'r+', encoding='utf-8')
    # 获取字典文件内容
    sen_list = sen_file.readlines()
    # 创建情感字典
    sen_dict = defaultdict()
    # 读取字典文件每一行内容，将其转换为字典对象，key为情感词，value为对应的分值
    for s in sen_list:
        # 每一行内容根据空格分割，索引0是情感词，索引1是情感分值
        sen_dict[s.split(' ')[0]] = s.split(' ')[1]
    # 读取否定词文件
    not_word_file = open('./notDic.txt', 'r+', encoding='utf-8')
    # 由于否定词只有词，没有分值，使用list即可
    not_word_list = not_word_file.readlines()
    # 读取程度副词文件
    degree_file = open('./degree.txt', 'r+', encoding='utf-8')
    degree_list = degree_file.readlines()
    degree_dic = defaultdict()
    # 程度副词与情感词处理方式一样，转为程度副词字典对象，key为程度副词，value为对应的程度值
    for d in degree_list:
        degree_dic[d.split(',')[0]] = d.split(',')[1]
    # 分类结果，词语的index作为key，词语的分值作为value，否定词分值设为-1
    sen_word = dict()
    not_word = dict()
    degree_word = dict()
    # 分类
    for word in word_dict.keys():
        if word in sen_dict.keys() and word not in not_word_list and word not in degree_dic.keys():
            # 找出分词结果中在情感字典中的词
            sen_word[word_dict[word]] = sen_dict[word]
        elif word in not_word_list and word not in degree_dic.keys():
            # 分词结果中在否定词列表中的词
            not_word[word_dict[word]] = -1
        elif word in degree_dic.keys():
            # 分词结果中在程度副词中的词
            degree_word[word_dict[word]] = degree_dic[word]
    sen_file.close()
    degree_file.close()
    not_word_file.close()
    # 将分类结果返回
    return sen_word, not_word, degree_word
def list_to_dict(word_list):
    data = {}
    for x in range(0, len(word_list)):
        data[word_list[x]] = x
    return data
def get_init_weight(sen_word, not_word, degree_word):
    # 权重初始化为1
    W = 1
    # 将情感字典的key转为list
    sen_word_index_list = list(sen_word.keys())
    if len(sen_word_index_list) == 0:
        return W
    # 获取第一个情感词的下标，遍历从0到此位置之间的所有词，找出程度词和否定词
    for i in range(0, sen_word_index_list[0]):
        if i in not_word.keys():
            W *= -1
        elif i in degree_word.keys():
            # 更新权重，如果有程度副词，分值乘以程度副词的程度分值
            W *= float(degree_word[i])
    return W
def socre_sentiment(sen_word, not_word, degree_word, seg_result): # 计算得分
    # 权重初始化为1
    W = 1
    score = 0
    # 情感词下标初始化
    sentiment_index = -1
    # 情感词的位置下标集合
    sentiment_index_list = list(sen_word.keys())
    # 任务：完成基于情感词典对情感得分的计算
    # ********** Begin *********#
    for i in range(0, len(seg_result)):
            # 如果是情感词（根据下标是否在情感词分类结果中判断）
            if i in sen_word.keys():
                # 权重*情感词得分
                score += W * float(sen_word[i])
                # 情感词下标加1，获取下一个情感词的位置
                sentiment_index += 1
                if sentiment_index < len(sentiment_index_list) - 1:
                    # 判断当前的情感词与下一个情感词之间是否有程度副词或否定词
                    for j in range(sentiment_index_list[sentiment_index], sentiment_index_list[sentiment_index + 1]):
                        # 更新权重，如果有否定词，取反
                        if j in not_word.keys():
                            W *= -1
                        elif j in degree_word.keys():
                            # 更新权重，如果有程度副词，分值乘以程度副词的程度分值
                            W *= float(degree_word[j])
            # 定位到下一个情感词
            if sentiment_index < len(sentiment_index_list) - 1:
                i = sentiment_index_list[sentiment_index + 1]    
    return score
def setiment_score(sententce):
    # 1.对文档分词
    seg_list = seg_word(sententce)
    # 2.将分词结果列表转为dic，然后找出情感词、否定词、程度副词
    sen_word, not_word, degree_word = classify_words(list_to_dict(seg_list))
    # 3.计算得分
    score = socre_sentiment(sen_word, not_word, degree_word, seg_list)
    return score