文本挖掘案例 - 评论分析

爱学习不掉头发

于 2024-11-12 20:17:30 发布

阅读量995

点赞数 8

分类专栏：数据分析　文章标签：数据分析 数据挖掘 自然语言处理

本文链接：https://blog.csdn.net/weixin_51385258/article/details/143698339

版权

数据分析专栏收录该内容

3 篇文章

订阅专栏

1.文本挖掘介绍

根据用户的评论进行分析，获取到用户的评价，然后对于产品的改进进行指导。
通常获取到的评论都是字符串，机器学习算法无法直接处理，因此需要先对文本进行预处理，再将其编码为数值向量。
文本处理过程通常包括：

获取原始文本
分词
- 英文可以采用空格进行区分，中文需要借助第三方库jieba进行分词
- 词干提取：将一些含有语态时态的单词还原为没有语态和时态的词
- 词形还原：将一些缩写还原
- 停用词去除：去除对于结果没有影响的词，助词、虚词、连词等；通常会加载停用词表
- 文本处理过程中，主要关注名词、动词和形容词
文本向量化（编码）

2. 文本挖掘流程

2.1 异常值处理

导入需要的包

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import math
import seaborn as sns
import datetime

plt.style.use("fivethirtyeight")
import warnings

warnings.filterwarnings('ignore')

# 导入自然语言处理的包
# nltk：文本处理的包
import nltk
from nltk.stem.wordnet import WordNetLemmatizer # 词性还原
from nltk.corpus import wordnet as wn # 语料库,包含一些出版物等
from collections import Counter # 统计个数

import pyecharts.options as opts
from pyecharts.charts import WordCloud

设置图形显示中文和负号

# Configure matplotlib so that Chinese labels and the minus sign render correctly.
# The original note suggests using 'SimHei' as the font on Windows instead.
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS'],  # font able to display Chinese labels
    'axes.unicode_minus': False,  # render the minus sign correctly
})

读取原始数据

# Load the raw review data from the CSV file (path is relative to the working directory).
df_reviews=pd.read_csv('data/reviews.csv')
# Preview the first rows of the loaded data.
df_reviews.head()

在这里插入图片描述

# Print column dtypes and non-null counts to spot missing or malformed data that needs cleaning.
df_reviews.info()

在这里插入图片描述

删除异常值

# Drop every row that contains at least one missing value, then re-check the summary.
df_reviews = df_reviews.dropna()
df_reviews.info()

2.2 数据处理

将文本转化为机器学习可以使用的数值型向量

# Drop rows with missing values, which includes reviews whose content is empty.
# NOTE(review): the identical dropna() call already runs in the cleaning step earlier in the file,
# so this line changes nothing when both steps are executed in order.
df_reviews = df_reviews.dropna()

def get_stars(n):
    """Extract the numeric star rating from a rating string.

    Args:
        n: Rating text such as ``'5.0 out of 5 stars'``.

    Returns:
        The rating as a float (``5.0`` for the example above).

    Raises:
        ValueError: If the text left after removing the suffix is not a number.
    """
    # Strip the fixed suffix so that only the number remains.
    rating_text = n.replace(' out of 5 stars', '')
    return float(rating_text)

def stars_cat(n):
    """Map a numeric star rating to a review category label.

    Ratings of 2 or lower are negative ('差评'), exactly 3 is neutral ('中评'),
    and every other value is positive ('好评'). Intended for use with
    ``Series.apply``.

    Args:
        n: Numeric star rating.

    Returns:
        One of '差评', '中评' or '好评'.
    """
    if n <= 2:
        return '差评'
    return '中评' if n == 3 else '好评'

# Month name (full or three-letter abbreviation) -> zero-padded month number.
_MONTH_NUMBERS = {
    'January': '01', 'Jan': '01',
    'February': '02', 'Feb': '02',
    'March': '03', 'Mar': '03',
    'April': '04', 'Apr': '04',
    'May': '05',
    'June': '06', 'Jun': '06',
    'July': '07', 'Jul': '07',
    'August': '08', 'Aug': '08',
    'September': '09', 'Sep': '09',
    'October': '10', 'Oct': '10',
    'November': '11', 'Nov': '11',
    'December': '12', 'Dec': '12',
}


def get_date(x):
    """Convert a review date line into an ISO ``YYYY-MM-DD`` string.

    The expected input looks like
    ``'Reviewed in the United States on June 24, 2020'``.

    Args:
        x: The raw review date text.

    Returns:
        The date formatted as ``'YYYY-MM-DD'``, e.g. ``'2020-06-24'``.

    Raises:
        ValueError: If the text does not contain ``'on <Month> <day>, <year>'``,
            the month name is not recognised, or the day/year are not a valid date.
    """
    # Split on the LAST 'on ' so that country names ending in "on"
    # (e.g. "Lebanon on June 24, 2020") do not confuse the parse.
    _, separator, date_text = x.rpartition('on ')
    if not separator:
        raise ValueError(f"no 'on <date>' part found in review date: {x!r}")

    # 'June 24, 2020' -> ['June 24', '2020']
    parts = date_text.split(', ')
    if len(parts) < 2:
        raise ValueError(f"expected '<Month> <day>, <year>' in review date: {x!r}")
    year = parts[1]

    # 'June 24' -> ['June', '24']
    month_day = parts[0].split(' ')
    if len(month_day) < 2:
        raise ValueError(f"expected '<Month> <day>' in review date: {x!r}")
    month_name, day = month_day[0], month_day[1]

    month = _MONTH_NUMBERS.get(month_name)
    if month is None:
        raise ValueError(f"unrecognised month {month_name!r} in review date: {x!r}")

    # strptime validates the day/year; strftime normalises to zero-padded ISO format.
    parsed = datetime.datetime.strptime(f'{month}-{day}-{year}', '%m-%d-%Y')
    return parsed.strftime('%Y-%m-%d')

# Derive three columns from the raw text fields:
#   stars_num   - numeric star rating parsed from 'stars'
#   content_cat - rating category label computed from stars_num
#   date_d      - review date string produced by get_date from 'date'
df_reviews['stars_num']=df_reviews['stars'].apply(get_stars)
df_reviews['content_cat']=df_reviews['stars_num'].apply(stars_cat)
df_reviews['date_d']=df_reviews['date'].apply(get_date)

在这里插入图片描述

统计产品的数量

# Plot the number of reviews per product as a bar chart.
# value_counts() counts the rows for each distinct product_name,
# so no explicit groupby on the content column is needed.
sns.set(font_scale=1)
df_reviews['product_name'].value_counts().plot(kind='bar')

在这里插入图片描述

将日期转换为时间对象类型并提取年月

# Prepare a year-month column so review counts can be analysed over time
# (e.g. to look for periodic patterns).
# Convert the date strings to datetime64 values first.
df_reviews['date_d'] = pd.to_datetime(df_reviews['date_d'])
# Truncate each date to the first day of its month.
# astype('datetime64[M]') is rejected by pandas >= 2.0 (only s/ms/us/ns units are supported),
# so go through a monthly Period instead; the result is the same month-start timestamp.
df_reviews['y_m'] = df_reviews['date_d'].dt.to_period('M').dt.to_timestamp()
df_reviews.head()

在这里插入图片描述

按照商品类别进行分组,查看每个类别的好评和差评的数量

# One count plot per product showing how many reviews fall into each rating category.
# FacetGrid arguments:
#   col             - column whose distinct values each get their own subplot
#   col_wrap        - maximum number of subplots per row
#   sharex / sharey - whether the subplots share their x / y axis
#   height          - height of each subplot
#   aspect          - width-to-height ratio of each subplot
g = sns.FacetGrid(
    data=df_reviews,
    col='product_name',
    col_wrap=2,
    sharex=False,
    sharey=False,
    height=5,
    aspect=1.2,
)
# map(plot_function, column, **kwargs): call sns.countplot on 'content_cat' within each facet;
# `order` fixes the left-to-right order of the bars.
g.map(sns.countplot, 'content_cat', order=['好评', '差评', '中评'])

在这里插入图片描述

按照时间统计每种商品的评论数量

# Monthly review count for each product, drawn as one line chart per product.
df_content = (
    df_reviews
    .groupby(['product_name', 'y_m'])['content']
    .count()
    .reset_index()
)
g = sns.FacetGrid(
    data=df_content,
    col='product_name',
    col_wrap=2,
    sharex=False,
    sharey=False,
    height=4,
    aspect=2,
)
# The first argument is the plotting function; the following column names are passed to it
# as x and y, and marker='1' is forwarded to plt.plot as the point marker style.
g.map(plt.plot, 'y_m', 'content', marker='1')

在这里插入图片描述

按照时间查看评价的分布

# Monthly review count per product, split by rating category (one line per category).
df_content = (
    df_reviews
    .groupby(['product_name', 'y_m', 'content_cat'])['content']
    .count()
    .reset_index()
)
g = sns.FacetGrid(
    data=df_content,
    col='product_name',
    hue='content_cat',
    col_wrap=2,
    sharex=False,
    sharey=False,
    height=4,
    aspect=2,
)
# marker='.' draws every data point as a dot.
g.map(plt.plot, 'y_m', 'content', marker='.')
# Add a legend for the hue categories.
g.add_legend()

在这里插入图片描述

查看相同产品,不同类型的分布

# Monthly review count per product, split by product type (one line per type).
df_content = (
    df_reviews
    .groupby(['product_name', 'y_m', 'type'])['content']
    .count()
    .reset_index()
)
g = sns.FacetGrid(
    data=df_content,
    col='product_name',
    hue='type',
    col_wrap=2,
    sharex=False,
    sharey=False,
    height=4,
    aspect=2,
)
g.map(plt.plot, 'y_m', 'content', marker='.')
g.add_legend()

在这里插入图片描述

2.3 文本挖掘-词云图绘制

# Count reviews per (product, month, rating category) and display the resulting table.
df_content = df_reviews.groupby(['product_name','y_m','content_cat'])['content'].count().reset_index()
df_content

在这里插入图片描述

# Remove duplicated reviews: rows that agree on all of the listed columns are kept only once.
dedupe_columns = [
    "product_name",
    "type",
    "date_d",
    "content_cat",
    "content",
    "stars_num",
    "name",
]
df_data = df_reviews.drop_duplicates(subset=dedupe_columns)
df_data

# Keep the review text column on its own.
df_text = df_data['content']

在这里插入图片描述

获取评论中的正负评论

# Positive and negative reviews of the 'everjoys-Soprano' product.
sample_positive = df_data.loc[
    (df_data['product_name'] == 'everjoys-Soprano') & (df_data['content_cat'] == '好评')
]
sample_negative = df_data.loc[
    (df_data['product_name'] == 'everjoys-Soprano') & (df_data['content_cat'] == '差评')
]

词形和词性还原

def get_lemma(word):
    """Return the WordNet base form of ``word``, or ``word`` itself if none is found.

    Args:
        word: A single token.

    Returns:
        The result of ``wn.morphy(word)`` when it is not ``None``; otherwise
        the input token unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

# Ordered (pattern, replacement) rules for expanding English contractions.
# All patterns are case-insensitive. The specific "can't"/"won't" rules must
# come before the generic "n't" rule, which would otherwise yield "ca not"/"wo not".
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # \b keeps possessives such as "rabbit's" from being read as "rabbit is".
        (r"\b(it|he|she|that|this|there|here)'s", r"\1 is"),
        (r"\b(ca)n't", r"\1n not"),      # can't -> can not
        (r"\b(w)on't", r"\1ill not"),    # won't -> will not
        (r"(?<=[a-z])n't", " not"),      # aren't -> are not
        (r"(?<=[a-z])'d", " would"),     # i'd -> i would
        (r"(?<=[a-z])'ll", " will"),     # they'll -> they will
        (r"(?<=i)'m", " am"),            # i'm -> i am
        (r"(?<=[a-z])'re", " are"),      # you're -> you are
        (r"(?<=[a-z])'ve", " have"),     # we've -> we have
    )
)


def replace_abbreviations(text):
    """Expand common English contractions and normalise apostrophes and periods.

    Steps, in order:
      1. Typographic apostrophes (’) are converted to ASCII apostrophes so the
         rules below also apply to them.
      2. Contractions are expanded ("it's" -> "it is", "don't" -> "do not", ...),
         regardless of letter case.
      3. Any remaining apostrophe becomes a space, and every period gets a
         trailing space so that sentences glued together ("ok.fine") are
         separated for the tokenizer.

    Args:
        text: The review text.

    Returns:
        The text with contractions expanded.
    """
    new_text = text.replace('’', "'")
    for pattern, replacement in _CONTRACTION_RULES:
        new_text = pattern.sub(replacement, new_text)
    return new_text.replace("'", ' ').replace('.', '. ')

去除停用词

# Build the stop list: NLTK's English stop words plus common punctuation tokens.
# The corpus has to be downloaded BEFORE it is read; reading it first raises
# LookupError on a machine where the 'stopwords' corpus is not installed yet.
nltk.download('stopwords')
punctuation = [",", ":", ";", ".", "!", "'", '"', "’", "?", "/", "-", "+", "&", "(", ")"]
stop_words=nltk.corpus.stopwords.words('english')+punctuation

整合文本处理流程

def prepare_pipeline(n):
    """Turn one raw review into a list of cleaned, lowercase tokens.

    Steps: expand contractions, tokenize, lowercase, reduce each token to its
    base form, then drop stop words and punctuation.

    Args:
        n: The review content; it is converted with ``str()`` before processing.

    Returns:
        A list of lowercase tokens with stop words removed.
    """
    # Expand contractions such as "it's" -> "it is".
    text = replace_abbreviations(str(n))
    # Lowercase BEFORE the stop-word check: the stop list holds lowercase words,
    # so capitalised stop words ("The", "I", "And") would otherwise slip through.
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    # Reduce every token to its base form.
    lemmas = [get_lemma(token) for token in tokens]
    # Drop stop words and punctuation tokens.
    return [lemma for lemma in lemmas if lemma not in stop_words]

每个评论分割之后，放到一个list中

# Run the cleaning pipeline on every review; each result is a list of token lists (one per review).
clean_txt_positive = list(map(prepare_pipeline, sample_positive['content']))
clean_txt_negative = list(map(prepare_pipeline, sample_negative['content']))

在这里插入图片描述

def get_words(clean_text):
    """Compute word statistics over a collection of tokenized reviews.

    Args:
        clean_text: A list of reviews, each review being a list of tokens.

    Returns:
        A 4-tuple ``(all_words, content_mean, total_words, words_cap)``:
          - all_words: ``Counter`` mapping each word to its number of occurrences.
          - content_mean: average number of tokens per review
            (``0.0`` when there are no reviews).
          - total_words: list of the distinct words, in order of first appearance.
          - words_cap: distinct words divided by total tokens
            (``0.0`` when there are no tokens).
    """
    # Flatten the per-review token lists into one list (duplicates kept).
    words_all = [word for words in clean_text for word in words]

    # Occurrences per word; its keys are the distinct words.
    all_words = Counter(words_all)
    total_words = list(all_words)

    # Guard both ratios against empty input instead of raising ZeroDivisionError.
    review_count = len(clean_text)
    token_count = len(words_all)
    content_mean = token_count / review_count if review_count else 0.0
    words_cap = len(all_words) / token_count if token_count else 0.0

    return all_words, content_mean, total_words, words_cap

对差评进行统计计数

# Word statistics for the negative reviews.
# NOTE: total_words is a shared name; the positive-review call later in the file reassigns it.
words_all_negative,content_mean_negative,total_words,words_cap_negative=get_words(clean_txt_negative)

# Average number of tokens per negative review.
content_mean_negative

# Occurrence count of every word.
words_all_negative

total_words # the distinct words

# Number of distinct words divided by the total number of tokens.
words_cap_negative

对好评进行统计计数

# Word statistics for the positive reviews (this reassigns total_words).
words_all_positive,content_mean_positive,total_words,words_cap_positive=get_words(clean_txt_positive)

words_all_positive.most_common(10) # the 10 most frequent words
positive_words_wordcloud=words_all_positive.most_common(100)# the 100 most frequent words, as (word, count) pairs
negative_words_wordcloud=words_all_negative.most_common(100)

# Build and render the word cloud for the positive reviews.
# NOTE(review): negative_words_wordcloud is not rendered anywhere in this section.
(WordCloud()
    .add(series_name="好评词云",
         data_pair=positive_words_wordcloud,  # the (word, count) pairs to draw
         word_size_range=[16, 80])  # minimum and maximum font size of the words
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="好评词云",
            title_textstyle_opts=opts.TextStyleOpts(font_size=23) # title font size
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),  # True: show a tooltip when hovering over a word
    )
    .render()
)

在这里插入图片描述