Information Content Security Lab: Sentiment Analysis of JD ("某东") Review Data

Preface

This article uses dictionary-based (sentiment-lexicon) sentiment analysis.

I. Overall Approach

1. Import the data.
2. Data preprocessing
   (1) Deduplication (using pandas' built-in drop_duplicates).
   (2) Data cleaning (using regular expressions to strip digits, letters, and some brand names).
   (3) Tokenization, POS tagging, stop-word removal, word clouds:
       1. Tokenize with jieba, collecting each token together with its POS tag.
       2. Build a new DataFrame holding each token, its POS tag, and its position.
       3. Remove stop words and punctuation.
       4. Re-count the number of tokens per review.
       5. Extract the nouns, then draw one word cloud over all tokens and another over the nouns only.
3. Model construction
   (1) Sentiment analysis:
       1. Import the HowNet evaluation words; positive words are weighted 1, negative words -1.
       2. Add new words to the lexicon.
       3. Join the HowNet word lists with the preprocessed data.
       4. Correct the sentiment polarity, i.e. check for negation (step one: add an amended-weight column; step two: drop rows whose weight equals 0).
       5. Sum the weights per review to get a sentiment value: greater than 0 is pos, less than 0 is neg (a minimal sketch of steps 4-5 follows this list).
       6. Merge the labels back into one big table.
       7. Generate positive and negative word clouds.
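Before the full program, here is a minimal sketch of steps 4-5 above. The tiny lexicon, negation set, and token lists are toy values invented for illustration; the real program uses the HowNet lists and the jieba output built below.

# Minimal sketch: dictionary-based scoring with negation flipping.
# lexicon/negations/tokens are toy examples, not the HowNet data.
lexicon = {'好': 1, '流畅': 1, '差': -1, '卡': -1}  # word -> weight
negations = {'不', '没'}

def score(tokens):
    total = 0
    for i, word in enumerate(tokens):
        weight = lexicon.get(word, 0)
        # a negation among the two preceding tokens flips the sign
        if weight != 0 and any(t in negations for t in tokens[max(0, i - 2):i]):
            weight = -weight
        total += weight
    return 'pos' if total > 0 else 'neg' if total < 0 else 'neutral'

print(score(['屏幕', '很', '流畅']))  # pos
print(score(['运行', '不', '流畅']))  # neg: '不' flips '流畅'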

II. Source Code

import numpy as np
import pandas as pd
import re
import PIL
import jieba.posseg as pg
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

if __name__ == '__main__':
    raw_data = pd.read_excel('review_try.xlsx')
    raw_data.info()
    for cate in ['review_nickname', 'review_time']:
        print(raw_data[cate].value_counts())
    # Deduplication
    review = raw_data.copy()
    review = review[['review_content']]
    print('去重之前:', review.shape[0])
    review = review.drop_duplicates().reset_index(drop=True)
    print('去重之后:', review.shape[0])
    # Data cleaning
    # Before cleaning
    content = review['review_content']
    for i in range(1, 100):
        print(content[i])
        print('----------------------')
    # Strip letters, digits, and brand/product words (京东, 华为, 荣耀, 手机, 机, 苹果, 小米)
    info = re.compile('[0-9a-zA-Z]|京东|华为|荣耀|手机|机|苹果|小米')
    content = content.apply(lambda x: info.sub('', x))
    # After cleaning
    for i in range(1, 100):
        print(content[i])
        print('---------------------')
    # Tokenize with jieba (posseg yields word/POS-flag pairs)
    seg_content = content.apply(lambda s: [(w.word, w.flag) for w in pg.cut(s)])
    print(seg_content.shape)
    print(len(seg_content))
    print('.............')
    for i in range(20):
        print(seg_content[i])
    print('.............')
    # Count the number of tokens in each review
    n_word = seg_content.apply(lambda x:len(x))
    print(n_word)
    print(n_word.head(8))
    # Record which review (1-based id) each token belongs to
    n_content = [[x+1]*y for x,y in zip(list(seg_content.index), list(n_word))]
    index_content_long = sum(n_content, [])
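    # e.g. with n_word = [3, 2], n_content is [[1, 1, 1], [2, 2]] and
    # index_content_long flattens to [1, 1, 1, 2, 2]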
    print(len(index_content_long))
    # Flatten the per-review lists of (word, flag) pairs into one long list
    print(seg_content.head())
    seg_content_long = sum(seg_content, [])
    print(seg_content_long)
    # Split into parallel lists of words and POS tags
    word_long = [x[0] for x in seg_content_long]
    nature_long = [x[1] for x in seg_content_long]
    print(len(word_long))
    print(len(nature_long))
    # Build the long-format DataFrame
    review_long = pd.DataFrame({'index_content': index_content_long, 'word': word_long, 'nature': nature_long})
    print(review_long.shape)
    print(review_long.head())
    # Drop punctuation ('x' is jieba's tag for non-word symbols), then stop words
    print(review_long['nature'].unique())
    review_long_clean = review_long[review_long['nature'] != 'x'].copy()
    print(review_long_clean.shape)
    # Load the stop-word list
    with open('stoplist.txt', 'r', encoding='utf-8') as f:
        stop_words = f.readlines()
    print(len(stop_words))
    print(stop_words[0:5])
    # Strip trailing newlines from the stop words
    stop_words = [word.strip('\n') for word in stop_words]
    print(stop_words[0:5])
    # Vocabulary left after removing the stop words
    word_long_clean = list(set(word_long) - set(stop_words))
    print(len(word_long_clean))
    review_long_clean = review_long_clean[review_long_clean['word'].isin(word_long_clean)]
    print(review_long_clean.shape)
    # Re-count the tokens in each review after cleaning
    n_word = review_long_clean.groupby('index_content').count()['word']
    print(n_word)
    index_word = [list(np.arange(1, x+1)) for x in n_word]
    index_word_long = sum(index_word, [])
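    # e.g. a review with 4 surviving tokens contributes positions [1, 2, 3, 4]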
    review_long_clean['index_word'] = index_word_long
    print(review_long_clean.head())
    review_long_clean.to_csv('1_review_long_clean.csv')
    # Extract nouns
    n_review_long_clean = review_long_clean[['n' in nat for nat in review_long_clean.nature]]
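    # keeps every POS tag containing 'n': plain nouns (n) plus subtypes such as
    # nr (person name), ns (place name), nt (organization), nz (other proper noun),
    # and noun-like compounds such as vn and an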
    print(n_review_long_clean.shape)
    print(n_review_long_clean.head())
    print(n_review_long_clean.nature.value_counts())
    n_review_long_clean.to_csv('n_review_long_clean.csv')
    # Word clouds
    image1 = PIL.Image.open('星星.jpg')
    MASK = np.array(image1)
    wordcloud = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK)
    wordcloud.generate_from_frequencies(Counter(review_long_clean.word.values))
    wordcloud.to_file('分词后的词云图.png')
    #plt.figure(figsize=(20, 10))
    #plt.imshow(wordcloud)
    #plt.axis('off')
    #plt.show()
    image2 = PIL.Image.open('爱心.jpg')
    MASK1 = np.array(image2)
    wordcloud1 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK1)
    wordcloud1.generate_from_frequencies(Counter(n_review_long_clean.word.values))
    wordcloud1.to_file('分词后的词云图(名词).png')
    #plt.figure(figsize=(20, 10))
    #plt.imshow(wordcloud1)
    #plt.axis('off')
    #plt.show()
    # Sentiment analysis
    # Evaluation and emotion word lists from HowNet (知网), one word per line;
    # read the files directly (read_csv with sep='\n' is rejected by newer pandas)
    def read_word_list(path):
        with open(path, encoding='utf-8') as f:
            return pd.DataFrame([line.strip() for line in f if line.strip()])

    pos_comment = read_word_list('正面评价词语(中文).txt')
    neg_comment = read_word_list('负面评价词语(中文).txt')
    pos_emotion = read_word_list('正面情感词语(中文).txt')
    neg_emotion = read_word_list('负面情感词语(中文).txt')

    pos = pd.concat([pos_comment, pos_emotion], axis=0)
    neg = pd.concat([neg_comment, neg_emotion], axis=0)
    # Add new words to the lexicon
    new_pos = pd.Series(['点赞'])
    new_neg = pd.Series(['歇菜'])
    positive = pd.concat([pos, new_pos], axis=0, ignore_index=True)
    negative = pd.concat([neg, new_neg], axis=0, ignore_index=True)
    positive.columns = ['review']
    positive['weight'] = 1
    print(positive.head())
    negative.columns = ['review']
    negative['weight'] = -1
    print(negative.head())
    pos_neg = pd.concat([positive, negative], axis=0, ignore_index=True)
    print(pos_neg.shape)
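    # pos_neg: one row per lexicon word, with columns 'review' (the word itself)
    # and 'weight' (+1 positive, -1 negative)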
    # Join the lexicon onto the token table
    data = review_long_clean.copy()
    review_mltype = pd.merge(data, pos_neg, how='left', left_on='word', right_on='review')
    review_mltype = review_mltype.drop(['review'], axis=1)
    review_mltype = review_mltype.replace(np.nan, 0)
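    # tokens that hit the lexicon carry +1/-1; all other tokens are neutral (0)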
    # Correct the sentiment polarity for negation
    notdic = pd.read_csv('not.csv')
    not_terms = set(notdic['term'])
    # Preparation 1: start the amended weights equal to the raw weights
    review_mltype['amend_weight'] = review_mltype['weight']
    review_mltype['id'] = np.arange(0, review_mltype.shape[0])
    # Preparation 2: keep only the rows that actually hit the lexicon
    only_review_mltype = review_mltype[review_mltype['weight'] != 0]
    only_review_mltype.index = np.arange(0, only_review_mltype.shape[0])
    i = 4
    review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]
    print(review_i)
    # Check the one or two words before each sentiment word for a negation
    index = only_review_mltype['id']
    for i in range(0, only_review_mltype.shape[0]):
        review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]
        review_i.index = np.arange(1, review_i.shape[0] + 1)  # 1-based, matching index_word
        word_ind = only_review_mltype['index_word'][i]
        if word_ind == 2:
            na = int(review_i['word'][word_ind - 1] in not_terms)
        elif word_ind > 2:
            na = sum(word in not_terms for word in review_i['word'][[word_ind - 1, word_ind - 2]])
        else:
            na = 0
        # exactly one negation flips the sign; a double negation cancels out
        if na == 1:
            review_mltype.loc[index[i], 'amend_weight'] = -review_mltype.loc[index[i], 'weight']
    print(review_mltype.shape)
    # rows whose sign was flipped by a negation
    print(review_mltype[(review_mltype['weight'] - review_mltype['amend_weight']) != 0])
    # Compute the sentiment value of each review
    print(review_mltype.tail())
    emotion_value = review_mltype.groupby('index_content', as_index=False)['amend_weight'].sum()
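    # emotion_value: one row per review, holding the sum of its
    # (possibly sign-flipped) word weights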
    print(emotion_value.head())
    emotion_value.to_csv('1_emotion_value.csv', index=True, header=True)
    # Keep only the reviews whose total amend_weight is non-zero
    content_emotion_value = emotion_value.copy()
    print(content_emotion_value.shape)
    content_emotion_value = content_emotion_value[content_emotion_value['amend_weight'] != 0]
    content_emotion_value['ml_type'] = np.where(content_emotion_value['amend_weight'] > 0, 'pos', 'neg')
    print(content_emotion_value.shape)
    print(content_emotion_value.head())
    # Merge the labels back into the big table
    content_emotion_value = content_emotion_value.drop(['amend_weight'], axis=1)
    print(review_mltype.shape)
    review_mltype = pd.merge(review_mltype, content_emotion_value, how='left', on='index_content')
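    # the left join broadcasts each review's pos/neg label onto every token row
    # of that review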
    review_mltype = review_mltype.drop(['id'], axis=1)
    print(review_mltype.shape)
    print(review_mltype.head())
    review_mltype.to_csv('1_review_mltype.csv', index=True, header=True)
    # Generate the sentiment word clouds
    # Keep only the sentiment-bearing words
    data = review_mltype.copy()
    data = data[data['amend_weight'] != 0]
    word_data_pos = data[data['ml_type'] == 'pos']
    word_data_neg = data[data['ml_type'] == 'neg']
    image3 = PIL.Image.open('星星.jpg')
    MASK3 = np.array(image3)
    wordcloud3 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK3)
    wordcloud3.generate_from_frequencies(Counter(word_data_pos.word.values))
    wordcloud3.to_file('积极情感词云图.png')
    image4 = PIL.Image.open('爱心.jpg')
    MASK4 = np.array(image4)
    wordcloud4 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white',mask=MASK4)
    wordcloud4.generate_from_frequencies(Counter(word_data_neg.word.values))
    wordcloud4.to_file('消极情感词云图.png')
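To run the script, the working directory needs the files the code above assumes: review_try.xlsx with a review_content column, stoplist.txt, the four HowNet word lists, not.csv with a term column of negation words, and the two mask images 星星.jpg and 爱心.jpg. Besides numpy, pandas, jieba, wordcloud, Pillow, and matplotlib, pandas also needs an Excel engine such as openpyxl for read_excel. The outputs are the intermediate CSV files plus four word-cloud PNGs.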

Summary

The code is largely adapted from a tutorial video by a Bilibili (B站) creator.
