Information Content Security Lab: Sentiment Analysis of JD ("某东") Review Data

Preface

This article uses dictionary-based (sentiment-lexicon) sentiment analysis.

I. Overall Approach

1. Import the data.
2. Data preprocessing
   (1) Deduplication (using pandas' built-in drop_duplicates).
   (2) Data cleaning (using regular expressions to strip digits, letters, and some brand names).
   (3) Tokenization, POS tagging, stop-word removal, word clouds:
       1. Tokenize with jieba, collecting each token together with its POS tag.
       2. Build a new DataFrame holding each token, its POS tag, and its position.
       3. Remove stop words and punctuation.
       4. Re-count the number of tokens per review.
       5. Extract the nouns, then draw one word cloud over all tokens and another over the nouns only.
3. Model construction
   (1) Sentiment analysis:
       1. Import the HowNet evaluation words; positive words are weighted 1, negative words -1.
       2. Add new words to the lexicon.
       3. Join the HowNet word lists with the preprocessed data.
       4. Correct the sentiment polarity, i.e. check for negation (step one: add an amended-weight column; step two: drop rows whose weight equals 0).
       5. Sum the weights per review to get a sentiment value: greater than 0 is pos, less than 0 is neg (a minimal sketch of steps 4-5 follows this list).
       6. Merge the labels back into one big table.
       7. Generate positive and negative word clouds.
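Before the full program, here is a minimal sketch of steps 4-5 above. The tiny lexicon, negation set, and token lists are toy values invented for illustration; the real program uses the HowNet lists and the jieba output built below.

# Minimal sketch: dictionary-based scoring with negation flipping.
# lexicon/negations/tokens are toy examples, not the HowNet data.
lexicon = {'好': 1, '流畅': 1, '差': -1, '卡': -1}  # word -> weight
negations = {'不', '没'}

def score(tokens):
    total = 0
    for i, word in enumerate(tokens):
        weight = lexicon.get(word, 0)
        # a negation among the two preceding tokens flips the sign
        if weight != 0 and any(t in negations for t in tokens[max(0, i - 2):i]):
            weight = -weight
        total += weight
    return 'pos' if total > 0 else 'neg' if total < 0 else 'neutral'

print(score(['屏幕', '很', '流畅']))  # pos
print(score(['运行', '不', '流畅']))  # neg: '不' flips '流畅'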

II. Source Code

import numpy as np
import pandas as pd
import re
import PIL
import jieba.posseg as pg
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

if __name__ == '__main__':
    raw_data = pd.read_excel('review_try.xlsx')
    raw_data.info()
    for cate in ['review_nickname', 'review_time']:
        print(raw_data[cate].value_counts())
    # Deduplication
    review = raw_data.copy()
    review = review[['review_content']]
    print('去重之前:', review.shape[0])
    review = review.drop_duplicates().reset_index(drop=True)
    print('去重之后:', review.shape[0])
    # Data cleaning
    # Before cleaning
    content = review['review_content']
    for i in range(1, 100):
        print(content[i])
        print('----------------------')
    # Strip letters, digits, and brand/product words (京东, 华为, 荣耀, 手机, 机, 苹果, 小米)
    info = re.compile('[0-9a-zA-Z]|京东|华为|荣耀|手机|机|苹果|小米')
    content = content.apply(lambda x: info.sub('', x))
    # After cleaning
    for i in range(1, 100):
        print(content[i])
        print('---------------------')
    # Tokenize with jieba (posseg yields word/POS-flag pairs)
    seg_content = content.apply(lambda s: [(w.word, w.flag) for w in pg.cut(s)])
    print(seg_content.shape)
    print(len(seg_content))
    print('.............')
    for i in range(20):
        print(seg_content[i])
    print('.............')
    # Count the number of tokens in each review
    n_word = seg_content.apply(lambda x:len(x))
    print(n_word)
    print(n_word.head(8))
    # Record which review (1-based id) each token belongs to
    n_content = [[x+1]*y for x,y in zip(list(seg_content.index), list(n_word))]
    index_content_long = sum(n_content, [])
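    # e.g. with n_word = [3, 2], n_content is [[1, 1, 1], [2, 2]] and
    # index_content_long flattens to [1, 1, 1, 2, 2]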
    print(len(index_content_long))
    # Flatten the per-review lists of (word, flag) pairs into one long list
    print(seg_content.head())
    seg_content_long = sum(seg_content, [])
    print(seg_content_long)
    # Split into parallel lists of words and POS tags
    word_long = [x[0] for x in seg_content_long]
    nature_long = [x[1] for x in seg_content_long]
    print(len(word_long))
    print(len(nature_long))
    # Build the long-format DataFrame
    review_long = pd.DataFrame({'index_content': index_content_long, 'word': word_long, 'nature': nature_long})
    print(review_long.shape)
    print(review_long.head())
    # Drop punctuation ('x' is jieba's tag for non-word symbols), then stop words
    print(review_long['nature'].unique())
    review_long_clean = review_long[review_long['nature'] != 'x'].copy()
    print(review_long_clean.shape)
    # Load the stop-word list
    with open('stoplist.txt', 'r', encoding='utf-8') as f:
        stop_words = f.readlines()
    print(len(stop_words))
    print(stop_words[0:5])
    # Strip trailing newlines from the stop words
    stop_words = [word.strip('\n') for word in stop_words]
    print(stop_words[0:5])
    # Vocabulary left after removing the stop words
    word_long_clean = list(set(word_long) - set(stop_words))
    print(len(word_long_clean))
    review_long_clean = review_long_clean[review_long_clean['word'].isin(word_long_clean)]
    print(review_long_clean.shape)
    # Re-count the tokens in each review after cleaning
    n_word = review_long_clean.groupby('index_content').count()['word']
    print(n_word)
    index_word = [list(np.arange(1, x+1)) for x in n_word]
    index_word_long = sum(index_word, [])
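    # e.g. a review with 4 surviving tokens contributes positions [1, 2, 3, 4]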
    review_long_clean['index_word'] = index_word_long
    print(review_long_clean.head())
    review_long_clean.to_csv('1_review_long_clean.csv')
    # Extract nouns
    n_review_long_clean = review_long_clean[['n' in nat for nat in review_long_clean.nature]]
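    # keeps every POS tag containing 'n': plain nouns (n) plus subtypes such as
    # nr (person name), ns (place name), nt (organization), nz (other proper noun),
    # and noun-like compounds such as vn and an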
    print(n_review_long_clean.shape)
    print(n_review_long_clean.head())
    print(n_review_long_clean.nature.value_counts())
    n_review_long_clean.to_csv('n_review_long_clean.csv')
    # Word clouds
    image1 = PIL.Image.open('星星.jpg')
    MASK = np.array(image1)
    wordcloud = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK)
    wordcloud.generate_from_frequencies(Counter(review_long_clean.word.values))
    wordcloud.to_file('分词后的词云图.png')
    #plt.figure(figsize=(20, 10))
    #plt.imshow(wordcloud)
    #plt.axis('off')
    #plt.show()
    image2 = PIL.Image.open('爱心.jpg')
    MASK1 = np.array(image2)
    wordcloud1 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK1)
    wordcloud1.generate_from_frequencies(Counter(n_review_long_clean.word.values))
    wordcloud1.to_file('分词后的词云图(名词).png')
    #plt.figure(figsize=(20, 10))
    #plt.imshow(wordcloud1)
    #plt.axis('off')
    #plt.show()
    # Sentiment analysis
    # Evaluation and emotion word lists from HowNet (知网), one word per line;
    # read the files directly (read_csv with sep='\n' is rejected by newer pandas)
    def read_word_list(path):
        with open(path, encoding='utf-8') as f:
            return pd.DataFrame([line.strip() for line in f if line.strip()])

    pos_comment = read_word_list('正面评价词语(中文).txt')
    neg_comment = read_word_list('负面评价词语(中文).txt')
    pos_emotion = read_word_list('正面情感词语(中文).txt')
    neg_emotion = read_word_list('负面情感词语(中文).txt')

    pos = pd.concat([pos_comment, pos_emotion], axis=0)
    neg = pd.concat([neg_comment, neg_emotion], axis=0)
    # Add new words to the lexicon
    new_pos = pd.Series(['点赞'])
    new_neg = pd.Series(['歇菜'])
    positive = pd.concat([pos, new_pos], axis=0, ignore_index=True)
    negative = pd.concat([neg, new_neg], axis=0, ignore_index=True)
    positive.columns = ['review']
    positive['weight'] = 1
    print(positive.head())
    negative.columns = ['review']
    negative['weight'] = -1
    print(negative.head())
    pos_neg = pd.concat([positive, negative], axis=0, ignore_index=True)
    print(pos_neg.shape)
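    # pos_neg: one row per lexicon word, with columns 'review' (the word itself)
    # and 'weight' (+1 positive, -1 negative)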
    # Join the lexicon onto the token table
    data = review_long_clean.copy()
    review_mltype = pd.merge(data, pos_neg, how='left', left_on='word', right_on='review')
    review_mltype = review_mltype.drop(['review'], axis=1)
    review_mltype = review_mltype.replace(np.nan, 0)
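    # tokens that hit the lexicon carry +1/-1; all other tokens are neutral (0)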
    # Correct the sentiment polarity for negation
    notdic = pd.read_csv('not.csv')
    not_terms = set(notdic['term'])
    # Preparation 1: start the amended weights equal to the raw weights
    review_mltype['amend_weight'] = review_mltype['weight']
    review_mltype['id'] = np.arange(0, review_mltype.shape[0])
    # Preparation 2: keep only the rows that actually hit the lexicon
    only_review_mltype = review_mltype[review_mltype['weight'] != 0]
    only_review_mltype.index = np.arange(0, only_review_mltype.shape[0])
    i = 4
    review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]
    print(review_i)
    # Check the one or two words before each sentiment word for a negation
    index = only_review_mltype['id']
    for i in range(0, only_review_mltype.shape[0]):
        review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]
        review_i.index = np.arange(1, review_i.shape[0] + 1)  # 1-based, matching index_word
        word_ind = only_review_mltype['index_word'][i]
        if word_ind == 2:
            na = int(review_i['word'][word_ind - 1] in not_terms)
        elif word_ind > 2:
            na = sum(word in not_terms for word in review_i['word'][[word_ind - 1, word_ind - 2]])
        else:
            na = 0
        # exactly one negation flips the sign; a double negation cancels out
        if na == 1:
            review_mltype.loc[index[i], 'amend_weight'] = -review_mltype.loc[index[i], 'weight']
    print(review_mltype.shape)
    # rows whose sign was flipped by a negation
    print(review_mltype[(review_mltype['weight'] - review_mltype['amend_weight']) != 0])
    # Compute the sentiment value of each review
    print(review_mltype.tail())
    emotion_value = review_mltype.groupby('index_content', as_index=False)['amend_weight'].sum()
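    # emotion_value: one row per review, holding the sum of its
    # (possibly sign-flipped) word weights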
    print(emotion_value.head())
    emotion_value.to_csv('1_emotion_value.csv', index=True, header=True)
    # Keep only the reviews whose total amend_weight is non-zero
    content_emotion_value = emotion_value.copy()
    print(content_emotion_value.shape)
    content_emotion_value = content_emotion_value[content_emotion_value['amend_weight'] != 0]
    content_emotion_value['ml_type'] = np.where(content_emotion_value['amend_weight'] > 0, 'pos', 'neg')
    print(content_emotion_value.shape)
    print(content_emotion_value.head())
    # Merge the labels back into the big table
    content_emotion_value = content_emotion_value.drop(['amend_weight'], axis=1)
    print(review_mltype.shape)
    review_mltype = pd.merge(review_mltype, content_emotion_value, how='left', on='index_content')
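    # the left join broadcasts each review's pos/neg label onto every token row
    # of that review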
    review_mltype = review_mltype.drop(['id'], axis=1)
    print(review_mltype.shape)
    print(review_mltype.head())
    review_mltype.to_csv('1_review_mltype.csv', index=True, header=True)
    # Generate the sentiment word clouds
    # Keep only the sentiment-bearing words
    data = review_mltype.copy()
    data = data[data['amend_weight'] != 0]
    word_data_pos = data[data['ml_type'] == 'pos']
    word_data_neg = data[data['ml_type'] == 'neg']
    image3 = PIL.Image.open('星星.jpg')
    MASK3 = np.array(image3)
    wordcloud3 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK3)
    wordcloud3.generate_from_frequencies(Counter(word_data_pos.word.values))
    wordcloud3.to_file('积极情感词云图.png')
    image4 = PIL.Image.open('爱心.jpg')
    MASK4 = np.array(image4)
    wordcloud4 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white',mask=MASK4)
    wordcloud4.generate_from_frequencies(Counter(word_data_neg.word.values))
    wordcloud4.to_file('消极情感词云图.png')
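To run the script, the working directory needs the files the code above assumes: review_try.xlsx with a review_content column, stoplist.txt, the four HowNet word lists, not.csv with a term column of negation words, and the two mask images 星星.jpg and 爱心.jpg. Besides numpy, pandas, jieba, wordcloud, Pillow, and matplotlib, pandas also needs an Excel engine such as openpyxl for read_excel. The outputs are the intermediate CSV files plus four word-cloud PNGs.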

Summary

The code is largely adapted from a tutorial video by a Bilibili (B站) creator.
