数据分析演练

huangxuz

已于 2023-02-27 15:25:43 修改

阅读量432

点赞数

文章标签：数据分析 python 信息可视化

于 2023-02-17 03:24:29 首次发布

本文链接：https://blog.csdn.net/huangxuz/article/details/129077421

版权

本文展示了对双十一淘宝美妆数据和日化表数据的分析过程，包括数据预处理（去除异常值、重复值和空值）、时间范围限定以及销售量和销售额的可视化。使用Python的pandas库进行数据提取和处理，matplotlib和pyecharts进行数据可视化，同时通过RFM模型对客户价值进行了评估。主要关注了每日销售走势、品牌销售排行以及省份订购和客户分类情况。

摘要由CSDN通过智能技术生成

数据集来源于和鲸社区，分为两部分进行不同的数据分析（仅作为对数据分析几个常用库的实践）

一、双十一淘宝美妆数据

1、数据提取与处理

import pandas as pd
# Load the raw cosmetics sales records from the CSV file in the working directory.
data = pd.read_csv('双十一淘宝美妆数据.csv')
# Preview the first rows to check that the file was parsed as expected.
data.head()

对数据时间进行处理

import datetime

# Analysis window: keep records whose update time lies between
# 2016-11-05 and 2016-11-14 (both bounds are kept).
startTime = datetime.datetime(2016, 11, 5)
endTime = datetime.datetime(2016, 11, 14)

# Parse the column into datetime64[ns] so it can be compared with the bounds.
data['update_time'] = pd.to_datetime(data['update_time'])

# Remove, in place, every row that falls before the start or after the end.
outside_window = (data['update_time'] < startTime) | (data['update_time'] > endTime)
data.drop(index=data[outside_window].index, inplace=True)

删除重复值、填充空值并计算销售额

# Count the fully duplicated rows (non-null count per column) before removing them.
data[data.duplicated()].count()
data.drop_duplicates(inplace=True)# drop rows that are exact duplicates
data.info()

data.isnull().sum()  # per-column null counts; the author notes that sale count and comment count contain nulls
data.fillna(0, inplace=True) # replace every null with 0
data.reset_index(drop = True , inplace = True) # rebuild a contiguous index after the row removals
data['sale_amount'] = data['price'] * data['sale_count'] # add a sales-amount column: price * sale_count
data

2、数据分析与可视化

每日整体销售额走势

import matplotlib.pyplot as plt

# Use the SimHei font so the Chinese labels render, and keep the minus
# sign displayable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Total sales amount per update date, as {date: amount}.
# NOTE(review): this aggregation is assumed from how `result` is consumed
# below (dict of values scaled to ten-million units and labelled as sales
# amount) -- confirm it matches the intended daily figure.
result = data.groupby('update_time')['sale_amount'].sum().to_dict()

plt.figure(figsize=(10,6),dpi=100)
plt.grid(alpha=0.4)
x = list(result.keys())
y = list(result.values())
y1 = [i / 10000000 for i in y]  # rescale to units of ten million
plt.ylim(2.5,3.5)
plt.plot(x,y1,color="red",marker='o')
plt.xlabel('日期')
plt.ylabel("销售额（千万元）")
plt.title('销售额走势',color='red',size=15)
# Print each point's value next to its marker.
for a,b in zip(x,y1):
    plt.annotate('(%.2f)'%(b),xy=(a,b),xytext=(-10,10),textcoords='offset points')
plt.show()

各个品牌售卖情况(使用pyecharts形成动态可视化图)

from pyecharts import options as opts
from pyecharts.charts import Map, Timeline, Bar, Line, Pie
from pyecharts.components import Table
from pyecharts.options import ComponentTitleOpts

# Distinct update dates in chronological order. Sorting explicitly avoids
# depending on the row order of the input file.
# NOTE(review): dates are read from `data`; confirm no separate
# pre-formatted copy of the table was intended here.
dts = sorted(pd.Timestamp(t) for t in data['update_time'].unique())

tl = Timeline()
tl.add_schema(
        is_auto_play=False,
        is_loop_play=False,
        play_interval=500,
    )  # timeline playback settings
for dt in dts:
    # Cumulative totals per shop up to and including `dt`: keep the ten
    # shops with the highest sale_count, then re-sort them ascending.
    item = (
        data[data['update_time'] <= dt]
        .groupby('店名')
        .agg({'sale_count': 'sum', 'sale_amount': 'sum'})
        .sort_values(by='sale_count', ascending=False)[:10]
        .sort_values(by='sale_count')
        .to_dict()
    )
    bar = (
        Bar()
        .add_xaxis([*item['sale_count'].keys()])  # shop names
        .add_yaxis("销售量", [round(val/10000,2) for val in item['sale_count'].values()], label_opts=opts.LabelOpts(position="right", formatter='{@[1]/} 万'))
        .add_yaxis("销售额", [round(val/10000/10000,2) for val in item['sale_amount'].values()], label_opts=opts.LabelOpts(position="right", formatter='{@[1]/} 亿元'))
        .reversal_axis()
        .set_global_opts(
            title_opts=opts.TitleOpts("累计销售量排行 TOP10")

        )
    )

    # Timeline.add takes a string label for the time point.
    tl.add(bar, dt.strftime('%Y-%m-%d'))
tl.render_notebook()

二、日化表数据分析

1、数据提取与处理

import pandas as pd

# Open the workbook once and read both sheets from the same handle,
# rather than opening and parsing the file separately for each sheet.
with pd.ExcelFile('日化.xlsx') as workbook:
    fact_order = pd.read_excel(workbook, sheet_name='销售订单表')
    dim_product = pd.read_excel(workbook, sheet_name='商品信息表')

# Sanity checks on the product table: summary statistics, fully duplicated
# rows, duplicated product ids, and per-column null counts.
dim_product.describe()
dim_product[dim_product.duplicated()].count()
dim_product[dim_product['商品编号'].duplicated()].count()
dim_product.isnull().sum()
# The author's conclusion after these checks: dim_product needs no cleaning.

对 fact_order 表进行去重、空值填充等处理

fact_order.info()

# Count fully duplicated rows, then drop them and rebuild a contiguous index.
fact_order[fact_order.duplicated()].count()
fact_order.drop_duplicates(inplace=True)
fact_order.reset_index(drop=True, inplace=True)

fact_order.isnull().sum()

# Fill each gap from the next valid row, then from the previous valid row
# to cover gaps at the end of the table. bfill()/ffill() are used because
# fillna(method=...) is deprecated in pandas 2.1+.
fact_order.bfill(inplace=True)
fact_order.ffill(inplace=True)
fact_order.isnull().sum()  # re-check that no nulls remain

import datetime

# Bounds of the analysis window (both bounds are kept).
startTime = datetime.datetime(2019, 1, 1)
endTime = datetime.datetime(2019, 9, 30)


def _parse_order_date(value):
    """Parse 'YYYY#MM#DD' strings into timestamps; return other values unchanged."""
    if isinstance(value, str):
        return pd.to_datetime(value, format='%Y#%m#%d')
    return value


def _strip_unit(value, unit):
    """Strip the characters in `unit` from both ends of a string; return other values unchanged."""
    if isinstance(value, str):
        return value.strip(unit)
    return value


fact_order['订单日期'] = fact_order['订单日期'].apply(_parse_order_date)

# Remove, in place, orders dated before the start or after the end of the window.
outside_window = (fact_order['订单日期'] < startTime) | (fact_order['订单日期'] > endTime)
fact_order.drop(index=fact_order[outside_window].index, inplace=True)

# Strip the unit suffixes from string entries, then cast to numeric types.
fact_order['订购数量'] = fact_order['订购数量'].apply(_strip_unit, args=('个',)).astype('int')
fact_order['订购单价'] = fact_order['订购单价'].apply(_strip_unit, args=('元',)).astype('float')
fact_order['金额'] = fact_order['金额'].astype('float')

# Remove the '编号' text from the customer codes.
fact_order['客户编码'] = fact_order['客户编码'].str.replace('编号', '')

2、数据分析与可视化

每月订购情况

from pyecharts import options as opts
from pyecharts.charts import Map, Bar, Line
from pyecharts.components import Table
from pyecharts.options import ComponentTitleOpts
from pyecharts.faker import Faker
import os

# Tag each order with its month number, then total quantity and amount per month.
fact_order['订单月份'] = fact_order['订单日期'].apply(lambda order_date: order_date.month)
item = fact_order.groupby('订单月份').agg({'订购数量': 'sum', '金额': 'sum'}).to_dict()

# Axis labels and the two series, rescaled to the units named in the legends.
x = [f'{month} 月' for month in item['订购数量']]
y1 = [round(quantity/10000, 2) for quantity in item['订购数量'].values()]
y2 = [round(amount/10000/10000, 2) for amount in item['金额'].values()]

# Build the bar chart one call at a time; the quantity series starts deselected.
c = Bar()
c.add_xaxis(x)
c.add_yaxis("订购数量（万件）", y1, is_selected=False)
c.add_yaxis("金额（亿元）", y2)
c.set_global_opts(title_opts=opts.TitleOpts(title="每月订购情况"))
c.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position="top"))
c.render_notebook()

省份分布

from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

# Total ordered quantity per province, as [name, value] pairs for the map
# series. The pairs get their own name so that the `data` DataFrame built
# in the first section is not overwritten.
item = fact_order.groupby('所在省份').agg({'订购数量': 'sum'}).to_dict()['订购数量']
province_quantities = [[province, quantity] for province, quantity in item.items()]
c = (
    Map()
    .add("订购数量", province_quantities, "china", is_map_symbol_show=True)
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='省份分布'),
        visualmap_opts=opts.VisualMapOpts(max_=1000000),
    )
)
c.load_javascript()
c.render_notebook()

通过 RFM 模型挖掘客户价值

df=fact_order.copy()
df.set_index('客户编码',drop=True,inplace=True)  # index the copy by customer code
df['订单编码'] = 1  # overwrite the order code with 1 so that summing it counts order rows
# One row per customer: latest order date, number of order rows, total amount.
rfmdf = df.pivot_table(index=['客户编码'],
  values=['订单日期','订单编码','金额'],
  aggfunc={'订单日期':'max',
  '订单编码':'sum',
  '金额':'sum'})
rfmdf

# R (recency): days between the customer's latest order and the latest order overall.
rfmdf['R'] = (rfmdf.订单日期.max()-rfmdf.订单日期).dt.days
# F (frequency) is the order-row count and M (monetary) is the summed amount.
rfmdf.rename(columns={'订单编码':'F','金额':'M'},inplace=True)

# Customer segment names keyed by the three-character R/F/M flag string.
_RFM_SEGMENTS = {
    '011':'重要价值客户',
    '111':'重要唤回客户',
    '001':'重要深耕客户',
    '101':'重要挽留客户',
    '010':'潜力客户',
    '110':'一般维持客户',
    '000':'新客户',
    '100':'流失客户'
}


def rfm_func(x):
    """Return the customer segment name for one row of R/F/M values.

    Each value is turned into the flag '1' when it is >= 0 and '0'
    otherwise; the flags are joined in R, F, M order and looked up in the
    segment table.

    Args:
        x: A pandas Series with numeric entries labelled 'R', 'F' and 'M'.

    Returns:
        The segment name (str) matching the three flags.
    """
    flags = (x >= 0).map({True: '1', False: '0'})
    return _RFM_SEGMENTS[''.join(flags[['R', 'F', 'M']])]
# Centre each metric on its column mean, then label every customer from
# the signs of the centred R/F/M values.
rfm_metrics = rfmdf[['R','F','M']]
centred_metrics = rfm_metrics - rfm_metrics.mean()
rfmdf['label'] = centred_metrics.apply(rfm_func,axis=1)

# Non-null count per column for each segment label.
rfmdf_res = rfmdf.groupby('label').count()
print(rfmdf_res)

# Number of customers per segment label, most frequent first.
label_counts = rfmdf.label.value_counts()
X = list(label_counts.keys())
Y = list(label_counts)

plt.figure(figsize=(10,6),dpi=100)
plt.xticks(rotation=0)
plt.title('客户类型图',size=15)
plt.ylim(0,400)
plt.bar(X, Y, 0.4, color="steelblue")
# Write each count just above its bar.
for segment, count in zip(X, Y):
    plt.annotate('(%.2f)'%(count),xy=(segment,count),xytext=(-20,5),textcoords='offset points')
plt.show()