【知识专栏丨python数分实战】京东某商品比价分析

最新推荐文章于 2024-10-07 01:36:35 发布

Tinalee-电商API接口呀

最新推荐文章于 2024-10-07 01:36:35 发布

阅读量442

点赞数 5

文章标签： python 开发语言 java php django json hbase

本文链接：https://blog.csdn.net/2301_79478575/article/details/140767246

版权

今天这篇文章将给大家分享一个京东某商品比价分析的案例。

项目分析思路：

1、确定分析方向，小易比较想知道同样的商品是不是自营店铺普遍比较贵（以消费者搜索的角度）
2、从京东平台上输入搜索关键字，定向爬取该关键字商品的信息（共100页）
3、数据分析验证第1小点

数据处理

数据来源：

https://www.heywhale.com/mw/project/5ff5c988840381003b05d940/dataset

import pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport seaborn as sns%matplotlib inlinesns.set(palette="summer",font='Microsoft YaHei',font_scale=1.2)from warnings import filterwarningsfilterwarnings('ignore')

# Read the scraped listing table; the file is decoded as GBK.
df = pd.read_csv('/home/kesci/input/1357260/csvjd.csv', encoding='gbk')

# Report the (rows, columns) shape of the raw table.
shape_message = '数据形状：{}'.format(df.shape)
print(shape_message)

输出结果：

数据形状：(5984, 5)

# Count rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print('重复值：{}条'.format(duplicate_count))

输出结果：

重复值：77条

# Count missing values per column.
df.isnull().sum()

输出结果：

price       0
name        0
url         0
comment     0
shopname    0
dtype: int64

# Drop exact duplicate rows, modifying df in place.
df.drop_duplicates(inplace=True)

# Summarize column dtypes and non-null counts after de-duplication.
df.info()

输出结果：

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5907 entries, 0 to 5983
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   price     5907 non-null   float64
 1   name      5907 non-null   object
 2   url       5907 non-null   object
 3   comment   5907 non-null   object
 4   shopname  5907 non-null   object
dtypes: float64(1), object(4)
memory usage: 276.9+ KB

# Preview the first rows of the cleaned table.
df.head()

输出结果：

# Normalize the raw `comment` strings into integer counts.
def comment_p(x):
    """Convert a comment-count string such as '500+' or '1.2万+' to an int.

    Args:
        x: Raw comment-count text. Every '+' is removed; a '万' marker means
            the number is expressed in units of 10,000.

    Returns:
        The count as an ``int`` in both branches, so callers never receive
        a mix of floats and strings.

    Raises:
        ValueError: If the text left after stripping is not a number.
    """
    x = x.replace('+', '')
    if '万' in x:
        # round() instead of truncation: a float product can land just below
        # the intended integer and would otherwise lose one when cast to int.
        return round(float(x.replace('万', '')) * 10000)
    return int(x)

# Parse every raw comment string, then store the result as an integer column.
parsed_comments = df['comment'].apply(comment_p)
df['new_comment'] = parsed_comments.astype('int')

def new_group(frame):
    """Add a ``newgroup`` column classifying each row by shop type.

    The shop name is read from the column at position 4 of ``frame``. Rules
    are checked in order and the first match wins: '自营' -> '京东自营',
    '旗舰店' -> '旗舰店', '专营店' -> '专营店'; anything else -> '其它'.

    Args:
        frame: DataFrame whose column at position 4 holds shop-name strings.
            It is modified in place.

    Returns:
        None.
    """
    # (keyword, label) pairs in priority order.
    rules = (('自营', '京东自营'), ('旗舰店', '旗舰店'), ('专营店', '专营店'))

    def classify(shop):
        for keyword, label in rules:
            if keyword in shop:
                return label
        return '其它'

    # Read the column once instead of one scalar .iloc lookup per row.
    frame['newgroup'] = [classify(shop) for shop in frame.iloc[:, 4]]

# Label every row with its shop type; this adds the 'newgroup' column to df.
new_group(df)

# Summary statistics of the numeric columns.
df.describe()

输出结果：

分析过程

统计不同类型的店铺数量

# Count the distinct shops across the 100 scraped result pages.
print('该100页商品信息中共有：{} 家店铺'.format(df['shopname'].nunique()))

输出结果：

该100页商品信息中共有：709 家店铺

# Number of distinct shops per shop type, largest first.
s_group = df.groupby('newgroup').shopname.nunique().reset_index(name='counts')
s_group.sort_values(by='counts', ascending=False, inplace=True)

plt.figure(figsize=(12, 8))
sns.barplot(x='counts', y='newgroup', data=s_group)

# Write each value just past the end of its bar; the values are sorted
# descending to line up with the descending row order of s_group.
con = sorted(s_group['counts'], reverse=True)
for x, y in enumerate(con):
    plt.text(y + 0.1, x, '%s' % y, size=14)

# Strip axes decoration so only bars and value labels remain.
plt.xlabel('')
plt.ylabel('')
plt.xticks([])
plt.grid(False)
plt.box(False)
plt.title('店铺数量', loc='left', fontsize=20)
plt.show()

输出结果：

绘制店铺类型的百分比

# Donut chart: each shop type's share of the distinct shops in s_group.
plt.figure(figsize=(12, 8))
size = s_group['counts']
labels = s_group['newgroup']
# wedgeprops width < 1 hollows out the pie into a ring.
plt.pie(size, labels=labels, wedgeprops={'width': 0.35, 'edgecolor': 'w'},
        autopct='%.2f%%', pctdistance=0.85, startangle=90)
plt.axis('equal')
plt.title('店铺总数百分比', loc='left', fontsize=20)
plt.show()

输出结果：

# Number of listings per shop type, most frequent first.
group_counts = df['newgroup'].value_counts()

plt.figure(figsize=(12, 8))
sns.countplot(y=df['newgroup'], order=group_counts.index, data=df)

# Write each count just past the end of its bar.
con = sorted(group_counts.values, reverse=True)
for x, y in enumerate(con):
    plt.text(y + 0.1, x, '%s' % y, size=14)

# Strip axes decoration so only bars and value labels remain.
plt.xlabel('')
plt.ylabel('')
plt.xticks([])
plt.grid(False)
plt.box(False)
plt.title('商品数量', loc='left', fontsize=20)
plt.show()

输出结果：

# Donut chart: each shop type's share of all listings.
plt.figure(figsize=(12, 8))
group_counts = df['newgroup'].value_counts()
size = group_counts.values
labels = group_counts.index
plt.pie(size, labels=labels, wedgeprops={'width': 0.35, 'edgecolor': 'w'},
        autopct='%.2f%%', pctdistance=0.85, startangle=90)
plt.axis('equal')
plt.title('商品总数百分比', loc='left', fontsize=20)
plt.show()

输出结果：

查看整体价格分布

# Overall price distribution.
# NOTE: sns.distplot is deprecated since seaborn 0.11; histplot/displot are
# its replacements. Kept here to match the seaborn version the article used.
plt.figure(figsize=(12, 8))
sns.distplot(df['price'])
plt.title('价格分布', loc='left', fontsize=20)
plt.box(False)
plt.show()

输出结果：

查看该商品主要集中在哪个价格段

# `result` is another name for `df` (not a copy), so the column added below
# also appears on `df`.
result = df
# Bucket prices with pd.cut; its default right-closed bins give
# (0, 100], (100, 200], ..., (1000, 30000].
result['price_cut'] = pd.cut(
    x=result['price'],
    bins=[0, 100, 200, 300, 400, 500, 600, 800, 1000, 30000],
    labels=['100以下', '100-200', '200-300', '300-400', '400-500',
            '500-600', '600-800', '800-1k', '1K以上'],
)

# Listings priced at 1000 or more, taken as an explicit copy so that adding a
# column does not write into a slice of `df`.
result2 = df[df['price'] >= 1000].copy()
# Finer buckets for the expensive tail. Bins are right-closed, so a price of
# exactly 1000 falls outside (1000, 2000] and gets NaN here.
result2['price_cut'] = pd.cut(
    x=result2['price'],
    bins=[1000, 2000, 5000, 10000, 30000],
    labels=['1K-2K', '2K-5K', '5K-1W', '1W以上'],
)
# Each fine bucket's share of ALL listings (denominator is the full table);
# value_counts() skips the NaN rows.
result3 = pd.DataFrame((result2['price_cut'].value_counts() / result.shape[0]).round(3))

from matplotlib.patches import ConnectionPatch
import numpy as np

# Make the figure: pie on the left, stacked breakdown bar on the right.
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
fig.subplots_adjust(wspace=0)

# Pie chart parameters: listing count per price bucket.
price_counts = result.groupby('price_cut').name.count()
ratios = price_counts.values
labels = price_counts.index
explode = [0, 0, 0, 0, 0, 0, 0, 0, 0.1]
# Rotate so the exploded LAST wedge (index 8) is split by the positive x-axis
# and therefore faces the bar chart. `ratios` holds raw counts, so convert to
# a fraction first; the last wedge ends at startangle + 360 degrees, hence
# the positive half-width.
angle = 180 * ratios[8] / ratios.sum()
ax1.pie(ratios, autopct='%1.1f%%', startangle=angle,
        labels=labels, explode=explode, pctdistance=0.85)
ax1.set_title('不同价格段的商品占比')

# Bar chart parameters: one stacked segment per fine bucket in result3.
xpos = 0
bottom = 0
ratios = result3.values
width = .2

for j in range(len(ratios)):
    height = ratios[j]
    ax2.bar(xpos, height, width, bottom=bottom)
    ypos = bottom + ax2.patches[j].get_height() / 10
    bottom += height
    ax2.text(xpos, ypos, '%1.1f%%' % (ax2.patches[j].get_height() * 100),
             ha='right')

ax2.set_title('1K以上的产品')
ax2.legend((result3.index))
ax2.axis('off')
ax2.set_xlim(- 2.5 * width, 2.5 * width)

# Use ConnectionPatch to draw lines between the two plots.
# Get the geometry of the exploded wedge.
theta1, theta2 = ax1.patches[8].theta1, ax1.patches[8].theta2
center, r = ax1.patches[8].center, ax1.patches[8].r
bar_height = sum([item.get_height() for item in ax2.patches])

# Draw the top connecting line: top-left corner of the bar to the wedge's
# theta2 edge.
x = r * np.cos(np.pi / 180 * theta2) + center[0]
y = r * np.sin(np.pi / 180 * theta2) + center[1]
con = ConnectionPatch(xyA=(-width / 2, bar_height), coordsA=ax2.transData,
                      xyB=(x, y), coordsB=ax1.transData)
con.set_color([0.5, 0.5, 0.5])
con.set_linewidth(2)
ax2.add_artist(con)

# Draw the bottom connecting line: bottom-left corner of the bar to the
# wedge's theta1 edge. The bar is centred on x=0, so its left edge is at
# -width / 2, the same x as the top line.
x = r * np.cos(np.pi / 180 * theta1) + center[0]
y = r * np.sin(np.pi / 180 * theta1) + center[1]
con = ConnectionPatch(xyA=(-width / 2, 0), coordsA=ax2.transData,
                      xyB=(x, y), coordsB=ax1.transData)
con.set_color([0.5, 0.5, 0.5])
ax2.add_artist(con)
con.set_linewidth(2)

plt.show()

输出结果：

# Listing count for every (shop type, price bucket) pair.
# reset_index(name='counts') already yields a DataFrame with columns
# ['newgroup', 'price_cut', 'counts'], so no re-wrapping or renaming is needed.
result4 = result.groupby(['newgroup', 'price_cut']).name.count().reset_index(name='counts')

# One row per shop type, one column per price bucket.
percent = pd.pivot_table(result4, index=['newgroup'], columns=['price_cut'])
# Replace the pivoted column index with plain bucket labels.
percent.columns = ['100以下', '100-200', '200-300', '300-400', '400-500',
                   '500-600', '600-800', '800-1k', '1K以上']
# Convert each row to percentages of that shop type's total.
p_percent = percent.div(percent.sum(axis=1), axis=0) * 100
p_percent = p_percent.reset_index()

# Horizontal stacked bars: price-bucket mix within each shop type.
p_percent.plot(x='newgroup', kind='barh', stacked=True, mark_right=True, figsize=(16, 8))
df_rel = p_percent[p_percent.columns[1:]]

# Running totals along each row give the right edge of every stacked segment;
# computed once instead of once per column.
cumulative = df_rel.cumsum(1)
for col in df_rel:
    for i, (cs, pc) in enumerate(zip(cumulative[col], df_rel[col])):
        # Centre the label inside its segment: right edge minus half the width.
        plt.text(cs - pc / 2, i, str(np.round(pc, 1)) + '%',
                 va='center', ha='center', size=12)

plt.title('不同类型不同价格区间的商品占各类型总数的份额', loc='left', fontsize=20)
plt.legend(bbox_to_anchor=(1, -0.01), ncol=10, facecolor='None')
plt.xlabel('')
plt.ylabel('')
plt.xticks([])
plt.grid(False)
plt.box(False)
plt.show()

输出结果：

累计成交量

这里的累计成交量是：

因为京东上的商品只要交易成功不管是否评价，系统都会记录评价人数，因此忽略时效的问题，可当作累计成交来看，只看大概的别纠结哈~

# Total comment count per price bucket.
result7 = result.groupby('price_cut').new_comment.sum().reset_index(name='total_comment')

plt.figure(figsize=(12, 8))
size = result7['total_comment']
labels = result7['price_cut']
# The last five buckets (400 and above) are pulled out of the pie.
plt.pie(size, labels=labels, autopct='%.2f%%', pctdistance=0.8,
        explode=[0, 0, 0, 0, 0.5, 0.5, 0.5, 0.5, 0.5])
plt.title('不同价格区间累计成交量', loc='left', fontsize=16)
plt.axis('equal')
plt.show()

输出结果：

超 86%的人选择400元以下的商品

# Total comment count per shop type, largest first; computed once and reused
# for the bars and for their labels.
group_totals = result.groupby('newgroup').new_comment.sum().sort_values(ascending=False)
# Express the totals in units of 10,000, rounded to two decimals.
totals_wan = (group_totals.values / 10000).round(2)

plt.figure(figsize=(12, 8))
sns.barplot(x=totals_wan, y=group_totals.index, data=result, palette='summer')

# Write each value just past the end of its bar.
con = list(totals_wan)
for x, y in enumerate(con):
    plt.text(y + 0.1, x, '%s (万人)' % y, size=12)

plt.grid(False)
plt.box(False)
plt.xticks([])
plt.ylabel('')
plt.title('不同类型的店铺累计成交量排名', loc='left', fontsize=20)
plt.show()

输出结果：

# Pie chart: each shop type's share of the total comment count.
plt.figure(figsize=(12, 8))
size = result.groupby('newgroup').new_comment.sum()
labels = size.index
# The explode list has four entries, one per shop type produced by new_group.
plt.pie(size.values, labels=labels, autopct='%.2f%%', pctdistance=0.8,
        explode=[0.1, 0.1, 0.1, 0.1])
plt.axis('equal')
plt.title('累计成交量百分比', loc='left', fontsize=20)
plt.show()

输出结果：

# Total comment count for every (shop type, price bucket) pair.
by_group_and_price = result.groupby(['newgroup', 'price_cut'])
comment_totals = by_group_and_price.new_comment.sum()
result5 = comment_totals.reset_index(name='total_comment')

# One subplot per shop type: total comments (in units of 10,000) per price bucket.
plt.figure(figsize=(20, 4))
for n, x in enumerate(['京东自营', '旗舰店', '专营店', '其它'], start=1):
    # Use a dedicated name for the subset; rebinding `df` here would replace
    # the main DataFrame for any code that runs afterwards.
    subset = result5[result5['newgroup'] == x]
    plt.subplot(1, 4, n)
    sns.barplot(x='price_cut', y=subset['total_comment'] / 10000, data=subset, palette='summer')
    plt.title(x)
    plt.xlabel('')
    plt.ylabel('累计成交 ( 万单 )')
    plt.xticks(rotation=45)
    plt.grid(False)
    plt.box(False)
plt.show()

输出结果：