市场营销学数据分析【香奈儿包包在京东上的比价分析】

本文链接：https://blog.csdn.net/m0_58046481/article/details/127719462

为了分析香奈儿成功的因素以及香奈儿在市场上的具体销售情况，博主自己写了一个程序用于爬取京东商城香奈儿包包的销售数据（包括商品名称、价格、商品成交量以及商品的店铺名称），然后对商品进行一系列的数据分析，最后筛选出对我们市场营销作业有用的数据作为支撑来分析。

（1）通过爬虫代码获取京东商城香奈儿包包前200页（约12000条）数据的信息：

from selenium import webdriver
from lxml import etree
import os
import pandas as pd
def get_page(driver, key):
    """Open the JD home page, search for ``key`` and scroll the result page.

    Args:
        driver: Selenium WebDriver instance used to drive the browser.
        key: Search keyword typed into the JD search box.
    """
    driver.get("https://www.jd.com/")
    # find_element("xpath", ...) works on both Selenium 3 and Selenium 4; the
    # find_element_by_xpath shortcut was removed in Selenium 4.3.
    search_box = driver.find_element("xpath", '//*[@id="key"]')
    search_box.send_keys(key)
    search_button = driver.find_element("xpath", '//*[@id="search"]/div/div[2]/button')
    search_button.click()
    # Scroll far down the page, presumably to trigger lazy loading of the
    # remaining items before the page is parsed.
    driver.execute_script("window.scrollBy(0,100000)")

def parse_page(driver):
    """Extract every product listed on the page the browser currently shows.

    Parses ``driver.page_source``, reads name, price, review text and shop
    name from each ``<li>`` under ``#J_goodsList``, prints each record and
    appends it to the module-level ``datas`` list.

    Args:
        driver: Selenium WebDriver positioned on a JD search-result page.
    """
    def squash(fragments):
        # Join the text nodes, trim the ends, then drop every newline and
        # space that is left inside the text.
        return ''.join(fragments).strip().replace('\n', '').replace(' ', '')

    # Output key -> XPath (relative to one product <li>) of its text nodes.
    field_xpaths = (
        ('名称', './div/div[@class="p-name p-name-type-2"]/a/em//text()'),
        ('价格', './div/div[@class="p-price"]/strong//text()'),
        ('评论', './div/div[@class="p-commit"]/strong//text()'),
        ('店铺', './div/div[@class="p-shop"]/span/a/text()'),
    )

    tree = etree.HTML(driver.page_source)
    items = tree.xpath('//*[@id="J_goodsList"]/ul/li')
    print(len(items))
    for item in items:
        record = {label: squash(item.xpath(path)) for label, path in field_xpaths}
        print(record)
        datas.append(record)
def next_page(driver):
    """Report the finished page and move the browser to the next result page.

    Prints a progress line using the module-level page counter ``i``,
    increments it, then clicks JD's "next page" link and scrolls down.
    Paging failures are reported and otherwise ignored (best effort), so the
    caller's loop keeps running.

    Args:
        driver: Selenium WebDriver showing a JD search-result page.
    """
    global i
    from selenium.common.exceptions import WebDriverException

    print("第"+str(i)+"页数据已经采集完成=======")
    i = i + 1
    try:
        # The link has to be clicked; merely selecting it with querySelector
        # leaves the browser on the same page, so every iteration would
        # scrape identical data.
        driver.execute_script(
            'document.querySelector("#J_bottomPage > span.p-num > a.pn-next").click()'
        )
        driver.execute_script("window.scrollBy(0,100000)")
        # NOTE(review): implicitly_wait only sets the timeout for element
        # lookups; it does not pause until the new page has loaded. Consider
        # an explicit wait if pages are parsed before they finish loading.
        driver.implicitly_wait(30)
    except WebDriverException as exc:
        # Keep the best-effort behaviour, but make the failure visible.
        print("翻页失败：{}".format(exc))


def store(datas):
    """Save the collected rows as ``口红.xlsx`` in this script's directory.

    Args:
        datas: Rows to export; they are turned into a pandas DataFrame and
            written with the openpyxl engine.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frame = pd.DataFrame(datas)
    frame.to_excel(os.path.join(script_dir, '口红.xlsx'), engine='openpyxl')
if __name__ == '__main__':
    # Script entry point: ask for a keyword and a page count, scrape that many
    # result pages and export the collected rows to Excel.
    key = input("请输入商品名称")
    max_pages = int(input("你要爬取的最大页数"))
    driver_path = r"F:\python项目\chromedriver.exe"
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling style; confirm it matches the installed Selenium version.
    driver = webdriver.Chrome(driver_path)
    i = 1       # page counter printed and incremented by next_page()
    datas = []  # rows appended by parse_page()
    try:
        get_page(driver, key)
        # Use a throwaway loop variable: looping on ``i`` itself would
        # overwrite the page counter on every iteration, so the progress
        # message would start at page 0.
        for _ in range(max_pages):
            parse_page(driver)
            next_page(driver)
        store(datas)
    finally:
        # quit() closes every window and ends the driver session, even when
        # scraping fails part-way; a separate close() is not needed.
        driver.quit()

这里使用Selenium自动化测试工具对商品数据进行采集，然后将结果保存为Excel格式，获取的数据结果如下图所示。

接下来对商品数据进行数据分析，数据分析的代码如下所示：

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings

# Plot styling: summer palette and the Microsoft YaHei font for the Chinese
# labels used in the charts.
sns.set(palette="summer", font='Microsoft YaHei', font_scale=1.2)
filterwarnings('ignore')  # silence every warning for the rest of the script

# Load the scraped sheet and report its shape, duplicate rows and missing
# values per column.
df = pd.read_excel('F:\\python项目\\wh0922\\jd\\口红.xlsx', engine='openpyxl')
print(f"数据形状：{df.shape}")
print(f"重复值:{df.duplicated().sum()}")
print(f"空值：{df.isnull().sum()}")

# Remove incomplete rows first, then exact duplicates, both in place.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.info()
print(df['价格'])


# Normalise the review-count column.
def comment_p(x):
    """Convert a review-count label such as ``'2万+条评价'`` to an integer.

    The ``+`` marker and the ``条评价`` suffix are removed, and a ``万`` unit
    is expanded to a factor of 10,000.

    Args:
        x: Raw review-count text; non-string values are converted with
            ``str`` first.

    Returns:
        int: The review count, or ``0`` when nothing is left after cleaning.

    Raises:
        ValueError: If the cleaned text is not a number.
    """
    text = str(x).replace('+', '').replace('条评价', '').strip()
    if not text:
        return 0
    if '万' in text:
        # round() guards against binary floating-point products that land
        # just below the intended integer and would be truncated by int().
        return int(round(float(text.replace('万', '')) * 10000))
    # Always return an int so both branches yield the same type.
    return int(float(text))

# Parse every raw review label and store the counts as integers.
df['new_comment'] = df['评论'].apply(comment_p).astype('int')

def price_p(x):
    """Parse a price label into a float, ignoring the ``￥`` currency sign.

    Args:
        x: Price text such as ``'￥128.00'``.

    Returns:
        float: The numeric price.
    """
    digits = x.replace('￥', '')
    return float(digits)
# Convert the price labels to numbers; astype('int') drops any fraction.
df['价格'] = df['价格'].apply(price_p).astype('int')
print(df['价格'])
# Add a column that stores the shop type of every row.
def new_group(frame):
    """Classify each row's shop and store the result in ``frame['newgroup']``.

    The shop name is read from the ``店铺`` column when it exists; otherwise
    the column at position 4 is used. The first matching keyword wins:
    ``自营`` -> ``京东自营``, ``旗舰店`` -> ``旗舰店``, ``专营店`` -> ``专营店``;
    anything else (including non-text values) becomes ``其他``.

    Args:
        frame: DataFrame to label; it is modified in place.
    """
    # Prefer the named column so the result does not depend on column order.
    shops = frame['店铺'] if '店铺' in frame.columns else frame.iloc[:, 4]
    keyword_to_type = (
        ('自营', '京东自营'),
        ('旗舰店', '旗舰店'),
        ('专营店', '专营店'),
    )
    shop_types = []
    for shop in shops:
        shop_name = str(shop)
        shop_types.append(
            next((kind for keyword, kind in keyword_to_type if keyword in shop_name), '其他')
        )
    frame['newgroup'] = shop_types
new_group(df)
print(df['newgroup'][0:5])

# Count distinct shops overall and per shop type (largest type first).
# NOTE(review): the message hard-codes "50 pages" regardless of how many
# pages were actually scraped.
shop_total = df['店铺'].nunique()
print(f'该50页商品信息中共有{shop_total}家店铺')
s_group = df.groupby('newgroup')['店铺'].nunique().reset_index(name='counts')
s_group = s_group.sort_values(by='counts', ascending=False)

# Horizontal bar chart of shops per type, each bar annotated with its count.
plt.figure(figsize=(12, 8))
sns.barplot(x='counts', y='newgroup', data=s_group)
con = sorted(s_group['counts'], reverse=True)
print(con)
for row, count in enumerate(con):
    plt.text(count + 0.1, row, count, size=14)
plt.xlabel('')
plt.ylabel('')
plt.xticks([])
plt.grid(False)
plt.box(False)
plt.title('店铺数量', loc='left', fontsize=20)
plt.show()

# Donut chart: share of shops per shop type.
plt.figure(figsize=(12, 8))
size = s_group['counts']
labels = s_group['newgroup']
plt.pie(
    size,
    labels=labels,
    wedgeprops={'width': 0.35, 'edgecolor': 'w'},
    autopct='%.2f%%',
    pctdistance=0.85,
    startangle=90,
)
plt.axis('equal')
plt.title('店铺总数百分比', loc='left', fontsize=20)
plt.show()

# Number of listings per shop type, most common type first.
type_counts = df['newgroup'].value_counts()
plt.figure(figsize=(12, 8))
sns.countplot(y=df['newgroup'], order=type_counts.index, data=df)
con = list(type_counts.values)
for row, count in enumerate(con):
    plt.text(count + 0.1, row, count, size=14)
plt.xlabel('')
plt.ylabel('')
plt.xticks([])
plt.grid(False)
plt.box(False)
plt.title('商品数量', loc='left', fontsize=20)
plt.show()

# Donut chart: share of listings per shop type.
plt.figure(figsize=(12, 8))
size = type_counts.values
labels = type_counts.index
plt.pie(
    size,
    labels=labels,
    wedgeprops={'width': 0.35, 'edgecolor': 'w'},
    autopct='%.2f%%',
    pctdistance=0.85,
    startangle=90,
)
plt.axis('equal')
plt.title('商品总数百分比', loc='left', fontsize=20)
# No plt.show() here: this figure stays open until a later plt.show() call.

# Overall price distribution.
# displot is a figure-level function that creates its own figure, so the
# 12 x 8 inch size is requested through height/aspect (8 * 1.5 = 12). A
# preceding plt.figure() call would only leave an extra, empty figure.
sns.displot(df['价格'], height=8, aspect=1.5)
plt.title('价格分布', loc='left', fontsize=20)
plt.box(False)
plt.show()

# Split prices into bands to see where the listings concentrate.
# NOTE(review): the band edges look sized for low-priced goods; confirm they
# suit the price range of the scraped products.
result = df  # alias, not a copy: the new column is added to df as well
result['price_cut'] = pd.cut(
    x=result['价格'],
    # The last band is labelled "120以上" (above 120), so its upper edge must
    # be open-ended; a fixed cap of 140 turns every higher price into NaN.
    bins=[0, 20, 40, 60, 80, 100, 120, np.inf],
    labels=['20以下', '20-40', '40-60', '60-80', '80-100', '100-120', '120以上'],
)
print(result['price_cut'][0:5])

# Finer bands for listings priced at 100 or more. copy() makes result2 an
# independent frame so that adding a column does not write into a slice of df.
result2 = df[df['价格'] >= 100].copy()
result2['price_cut'] = pd.cut(
    x=result2['价格'],
    bins=[100, 110, 120, 130, np.inf],
    labels=['100-110', '110-120', '120-130', '130以上'],
    # Bins are right-closed, so without this a price of exactly 100 (which
    # the >= 100 filter lets through) would fall outside the first band.
    include_lowest=True,
)
# Share of each fine band relative to all listings, rounded to 3 decimals.
result3 = pd.DataFrame((result2['price_cut'].value_counts() / result.shape[0]).round(3))

from matplotlib.patches import ConnectionPatch

# Pie chart: share of listings in each price band, drawn in the left half of
# a 1x2 subplot grid.
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(121)
fig.subplots_adjust(wspace=0)

# Count the listings per band once. observed=False is passed explicitly so
# that empty bands are kept regardless of the pandas default for categorical
# groupers.
band_counts = result.groupby('price_cut', observed=False)['名称'].count()
ratios = band_counts.values
labels = band_counts.index
# pie() needs one explode entry per wedge, so size the list from the data.
explode = [0] * len(ratios)

ax1.pie(ratios, autopct='%1.1f%%', startangle=90, labels=labels, explode=explode, pctdistance=0.85)
ax1.set_title('不同价格段的商品占比')

# Sum of review counts per price band (the chart titles it as cumulative
# sales volume).
# observed=False is passed explicitly so that all seven price bands are kept
# even when some are empty; the seven-entry explode list below must match the
# number of wedges.
result7 = (
    result.groupby('price_cut', observed=False)['new_comment']
    .sum()
    .reset_index(name='total_comment')
)
plt.figure(figsize=(12, 8))
size = result7['total_comment']
labels = result7['price_cut']
# The four highest bands are pulled out of the pie by 0.5 of the radius.
plt.pie(size, labels=labels, pctdistance=0.8, explode=[0, 0, 0, 0.5, 0.5, 0.5, 0.5])
plt.title('不同价格区间累计成交量', loc='left', fontsize=16)
plt.axis('equal')
plt.show()

# Review totals per shop type, largest first, expressed in units of 10,000
# and rounded to two decimals.
group_totals = result.groupby('newgroup').new_comment.sum().sort_values(ascending=False)
totals_in_wan = (group_totals.values / 10000).round(2)

plt.figure(figsize=(12, 8))
sns.barplot(x=totals_in_wan, y=group_totals.index, data=result, palette='summer')
con = list(totals_in_wan)

# Annotate every bar with its value just right of the bar end.
for row, total in enumerate(con):
    plt.text(total+0.1, row, '%s(万人)' % total, size=12)
plt.grid(False)
plt.box(False)
plt.xticks([])
plt.ylabel('')
plt.title('不同类型店铺累计成交量排名', loc='left', fontsize=20)
plt.show()

# Share of the total review count per shop type.
plt.figure(figsize=(12, 8))
size = result.groupby('newgroup').new_comment.sum()
labels = size.index
# pie() needs one explode entry per wedge; a fixed four-element list raises
# ValueError whenever the data does not contain exactly four shop types.
plt.pie(size.values, labels=labels, autopct='%.2f%%', pctdistance=0.8, explode=[0] * len(size))
plt.axis('equal')
plt.title('累计成交量百分比', loc='left', fontsize=20)
plt.show()

处理的结果以及可视化结果如下图：