Data Mining: Predicting Purchase Intent on JD.com

This post walks through predicting JD.com users' purchase intent with data mining: data checks, feature table construction, data cleaning, exploratory analysis, feature engineering, building the training and test sets, and finally modeling with XGBoost, which achieves relatively high precision and recall.

1. Data Checks

Join the two tables with pd.merge and check whether the row count shrinks, i.e. whether any action records reference users that are missing from the user table.

1.1 Check user consistency across files

import pandas as pd
def user_action_check():
    df_user = pd.read_csv(r'data/JData_User.csv', encoding = 'gbk')
    df_sku = df_user.loc[:, 'user_id'].to_frame()
    df_month2 = pd.read_csv(r'data\JData_Action_201602.csv', encoding = 'gbk')
    print('Is action of Feb. from User file? ', len(df_month2) == len(pd.merge(df_sku, df_month2)))
    df_month3 = pd.read_csv(r'data\JData_Action_201603.csv', encoding = 'gbk')
    print('Is action of Mar. from User file? ', len(df_month3) == len(pd.merge(df_sku, df_month3)))
    df_month4 = pd.read_csv(r'data\JData_Action_201604.csv', encoding = 'gbk')
    print('Is action of Apr. from User file? ', len(df_month4) == len(pd.merge(df_sku, df_month4)))
    
user_action_check()

Is action of Feb. from User file? True
Is action of Mar. from User file? True
Is action of Apr. from User file? True
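As a quicker sanity check, the same membership test can be written with set operations. A minimal sketch, assuming the same file layout as above (shown for the February file only):

import pandas as pd

# Minimal sketch: verify that every user_id in the February action log
# also appears in the user table.
df_user = pd.read_csv(r'data\JData_User.csv', encoding = 'gbk')
df_month2 = pd.read_csv(r'data\JData_Action_201602.csv', encoding = 'gbk')
print(set(df_month2['user_id']).issubset(set(df_user['user_id'])))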

1.2 Check for duplicate records

def deduplicate(filepath, filename, newpath):
    df_file = pd.read_csv(filepath, encoding = 'gbk')
    before = df_file.shape[0]
    df_file.drop_duplicates(inplace = True)
    after = df_file.shape[0]
    n_dup = before - after
    print('No. of duplicate records for ' + filename + ' is: ' + str(n_dup))
    if n_dup != 0:
        df_file.to_csv(newpath, index = None)
    else:
        print('No duplicate records in ' + filename)
        
deduplicate(r'data\JData_Action_201603.csv', 'Mar. action', '京东/JData_Action_201603_dedup.csv')
deduplicate(r'data\JData_Action_201604.csv', 'Apr. action', '京东/JData_Action_201604_dedup.csv')
deduplicate(r'data\JData_Comment.csv', 'Comment', '京东/JData_Comment_dedup.csv')
deduplicate(r'data\JData_Product.csv', 'Product', '京东/JData_Product_dedup.csv')
deduplicate(r'data\JData_User.csv', 'User', '京东/JData_User_dedup.csv')

(Output omitted: number of duplicate records found in each file.)

df_month2 = pd.read_csv(r'data\JData_Action_201602.csv', encoding = 'gbk')
IsDuplicated = df_month2.duplicated()
df_d = df_month2[IsDuplicated]
df_d.groupby('type').count()
# Most of the duplicate records come from browsing (type 1) or clicking (type 6)

(Output omitted: duplicate rows counted by action type.)

1.3 Check registration times

df_user = pd.read_csv(r'data\JData_User.csv', encoding = 'gbk')
df_user['user_reg_tm'] = pd.to_datetime(df_user['user_reg_tm'])
df_user.loc[df_user.user_reg_tm >= '2016-4-15']

(Output omitted: users registered on or after 2016-04-15.)

df_month = pd.read_csv(r'data\JData_Action_201604.csv')
df_month['time'] = pd.to_datetime(df_month['time'])
df_month.loc[df_month.time >= '2016-4-16']

Conclusion: these users have no anomalous action records, so this batch of users is kept rather than dropped.

1.4 Convert user_id to int

df_month = pd.read_csv(r'data\JData_Action_201602.csv', encoding = 'gbk')
df_month['user_id'] = df_month['user_id'].apply(lambda x: int(x))
print(df_month['user_id'].dtype)
df_month.to_csv(r'京东\JData_Action_201602.csv', index = None)

df_month = pd.read_csv(r'data\JData_Action_201603.csv', encoding = 'gbk')
df_month['user_id'] = df_month['user_id'].apply(lambda x: int(x))
print(df_month['user_id'].dtype)
df_month.to_csv(r'京东\JData_Action_201603.csv', index = None)

df_month = pd.read_csv(r'data\JData_Action_201604.csv', encoding = 'gbk')
df_month['user_id'] = df_month['user_id'].apply(lambda x: int(x))
print(df_month['user_id'].dtype)
df_month.to_csv(r'京东\JData_Action_201604.csv', index = None)
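The three conversion blocks only differ in the month, so they can be collapsed into a loop. A minimal sketch under the same file layout as above:

# Minimal sketch: the same user_id conversion as above, written as a loop
# over the three monthly action files.
import pandas as pd

for month in ('201602', '201603', '201604'):
    df_month = pd.read_csv(fr'data\JData_Action_{month}.csv', encoding = 'gbk')
    df_month['user_id'] = df_month['user_id'].astype(int)
    print(df_month['user_id'].dtype)
    df_month.to_csv(fr'京东\JData_Action_{month}.csv', index = None)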

2. Building the Feature Tables

2.1 Build User_table

# File paths
ACTION_201602_FILE = "京东/JData_Action_201602.csv"
ACTION_201603_FILE = "京东/JData_Action_201603.csv"
ACTION_201604_FILE = "京东/JData_Action_201604.csv"
COMMENT_FILE = "京东/JData_Comment.csv"
PRODUCT_FILE = "京东/JData_Product.csv"
USER_FILE = "京东/JData_User.csv"
USER_TABLE_FILE = "京东/User_table.csv"
ITEM_TABLE_FILE = "京东/Item_table.csv"

import pandas as pd
import numpy as np
from collections import Counter

# Helper: count each action type within one user group
def add_type_count(group):
    behavior_type = group.type.astype(int)
    type_cnt = Counter(behavior_type)
    group['browse_num'] = type_cnt[1]
    group['addcart_num'] = type_cnt[2]
    group['delcart_num'] = type_cnt[3]
    group['buy_num'] = type_cnt[4]
    group['favor_num'] = type_cnt[5]
    group['click_num'] = type_cnt[6]
    return group[['user_id', 'browse_num', 'addcart_num', 'delcart_num', 'buy_num', 'favor_num', 'click_num']]

# The action files are large, so reading them in one go may raise a MemoryError; read them in chunks with pandas instead.
def get_from_action_data(fname, chunk_size = 50000):
    reader = pd.read_csv(fname, header = 0, iterator = True, encoding = 'gbk')
    chunks = []
    loop = True
    while loop:
        try:
            chunk = reader.get_chunk(chunk_size)[['user_id', 'type']]
            chunks.append(chunk)
        except StopIteration:
            loop = False
            print('Iteration is stopped')
    df_ac = pd.concat(chunks, ignore_index = True)
    df_ac = df_ac.groupby(['user_id'], as_index = False).apply(add_type_count)
    df_ac = df_ac.drop_duplicates('user_id')
    return df_ac
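pandas can also hand the chunks back directly through the chunksize argument, which removes the explicit get_chunk loop. A minimal sketch reusing add_type_count from above (the name get_from_action_data_chunked is just for illustration):

# Minimal sketch: chunked reading via the chunksize argument instead of
# an explicit iterator/get_chunk loop; the grouping logic is unchanged.
def get_from_action_data_chunked(fname, chunk_size = 50000):
    chunks = [chunk[['user_id', 'type']]
              for chunk in pd.read_csv(fname, header = 0, chunksize = chunk_size, encoding = 'gbk')]
    df_ac = pd.concat(chunks, ignore_index = True)
    df_ac = df_ac.groupby(['user_id'], as_index = False).apply(add_type_count)
    df_ac = df_ac.drop_duplicates('user_id')
    return df_ac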

# Aggregate the per-user counts across the three action files
def merge_action_data():
    df_ac = []
    df_ac.append(get_from_action_data(fname = ACTION_201602_FILE))
    df_ac.append(get_from_action_data(fname = ACTION_201603_FILE))
    df_ac.append(get_from_action_data(fname = ACTION_201604_FILE))
    df_ac = pd.concat(df_ac, ignore_index = True)
    # Sum each user's counts across the action tables
    df_ac = df_ac.groupby(['user_id'], as_index = False).sum()
    # Build conversion-rate features
    df_ac['buy_addcart_ratio'] = df_ac['buy_num'] / df_ac['addcart_num']
    df_ac['buy_browse_ratio'] = df_ac['buy_num'] / df_ac['browse_num']
    df_ac['buy_click_ratio'] = df_ac['buy_num'] / df_ac['click_num']
    df_ac['buy_favor_ratio'] = df_ac['buy_num'] / df_ac['favor_num']
    # Cap conversion rates greater than 1 at 1 (100%)
    df_ac.loc[df_ac['buy_addcart_ratio'] > 1, 'buy_addcart_ratio'] = 1.
    df_ac.loc[df_ac['buy_browse_ratio'] > 1, 'buy_browse_ratio'] = 1.
    df_ac.loc[df_ac['buy_click_ratio'] > 1, 'buy_click_ratio'] = 1.
    df_ac.loc[df_ac['buy_favor_ratio'] > 1, 'buy_favor_ratio'] = 1.
    return df_ac
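The four capping assignments can be condensed with DataFrame.clip. A minimal sketch of the same step inside merge_action_data:

# Minimal sketch: cap the conversion-rate columns at 1.0 with clip()
# instead of four separate boolean assignments.
ratio_cols = ['buy_addcart_ratio', 'buy_browse_ratio', 'buy_click_ratio', 'buy_favor_ratio']
df_ac[ratio_cols] = df_ac[ratio_cols].clip(upper = 1.0)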

# Extract the needed fields from the JData_User table
def get_from_jdata_user():
    df_user = pd.read_csv(USER_FILE, header = 0, encoding = 'gbk')
    df_user = df_user[['user_id', 'age', 'sex', 'user_lv_cd']]
    return df_user
user_base = get_from_jdata_user()
user_behavior = merge_action_data()

user_behavior = pd.merge(user_base, user_behavior, on = ['user_id'], how = 'left')
user_behavior.to_csv(USER_TABLE_FILE, index = False)

user_table = pd.read_csv(USER_TABLE_FILE)
print(user_table.shape)
user_table.head()

(Output omitted: shape and first rows of User_table.)

2.2 Build Item_table

# Read the products from the Product table
def get_from_jdata_product():
    df_item = pd.read_csv(PRODUCT_FILE, header = 0, encoding = 'gbk')
    return df_item

# Helper: count each action type within one sku group
def add_type_count(group):
    behavior_type = group.type.astype(int)
    type_cnt = Counter(behavior_type)
    group['browse_num'] = type_cnt[1]
    group['addcart_num'] = type_cnt[2]
    group['delcart_num'] = type_cnt[3]
    group['buy_num'] = type_cnt[4]
    group['favor_num'] = type_cnt[5]
    group['click_num'] = type_cnt[6]
    return group[['sku_id', 'browse_num', 'addcart_num', 'delcart_num', 'buy_num', 'favor_num', 'click_num']]

# Aggregate per-sku counts from the action data
def get_from_action_data(fname, chunk_size = 50000):
    reader = pd.read_csv(fname, header = 0, iterator = True)
    chunks = []
    loop = True
    while loop:
        try:
            chunk = reader.get_chunk(chunk_size)[['sku_id', 'type']]
            chunks.append(chunk)
        except StopIteration:
            loop = False
            print('Iteration is stopped')
    df_ac = pd.concat(chunks, ignore_index = True)
    df_ac = df_ac.groupby('sku_id', as_index = False).apply(add_type_count)
    df_ac = df_ac.drop_duplicates('sku_id')
    return df_ac

# Get the comment data per sku; if a sku has comments on more than one date, keep the latest one
def get_from_jdata_comment():
    df_cmt = pd.read_csv(COMMENT_FILE, header = 0)
    df_cmt['dt'] = pd.to_datetime(df_cmt['dt'])
    idx = df_cmt.groupby(['sku_id'])['dt'].transform(max) == df_cmt['dt']
    df_cmt = df_cmt[idx]
    return df_cmt[['sku_id', 'comment_num', 'has_bad_comment', 'bad_comment_rate']]
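An equivalent way to keep only the latest comment per sku is to sort by date and drop the earlier duplicates. A minimal sketch:

# Minimal sketch: keep the most recent comment row for each sku via
# sort_values + drop_duplicates instead of a groupby/transform mask.
df_cmt = pd.read_csv(COMMENT_FILE, header = 0)
df_cmt['dt'] = pd.to_datetime(df_cmt['dt'])
df_cmt = df_cmt.sort_values('dt').drop_duplicates('sku_id', keep = 'last')
df_cmt = df_cmt[['sku_id', 'comment_num', 'has_bad_comment', 'bad_comment_rate']]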

# Merge the data
def merge_action_data():
    df_ac = []
    df_ac.append(get_from_action_data(fname = ACTION_201602_FILE))
    df_ac.append(get_from_action_data(fname = ACTION_201603_FILE))
    df_ac.append(get_from_action_data(fname = ACTION_201604_FILE))
    df_ac = pd.concat(df_ac, ignore_index = True)
    # Sum each sku's counts across the action tables
    df_ac = df_ac.groupby(['sku_id'], as_index = False).sum()
    # Build conversion-rate features
    df_ac['buy_addcart_ratio'] = df_ac['buy_num'] / df_ac['addcart_num']
    df_ac['buy_browse_ratio'] = df_ac['buy_num'] / df_ac['browse_num']
    df_ac['buy_click_ratio'] = df_ac['buy_num'] / df_ac['click_num']
    df_ac['buy_favor_ratio'] = df_ac['buy_num'] / df_ac['favor_num']
    # Cap conversion rates greater than 1 at 1 (100%)
    df_ac.loc[df_ac['buy_addcart_ratio'] > 1, 'buy_addcart_ratio'] = 1.
    df_ac.loc[df_ac['buy_browse_ratio'] > 1, 'buy_browse_ratio'] = 1.
    df_ac.loc[df_ac['buy_click_ratio'] > 1, 'buy_click_ratio'] = 1.
    df_ac.loc[df_ac['buy_favor_ratio'] > 1, 'buy_favor_ratio'] = 1.
    return df_ac

item_base = get_from_jdata_product()
item_behavior = merge_action_data()
item_comment = get_from_jdata_comment()

item_behavior = pd.merge(item_base, item_behavior, on = 'sku_id', how = 'left')
item_behavior = pd.merge(item_behavior, item_comment, on = 'sku_id', how = 'left')
item_behavior.to_csv(ITEM_TABLE_FILE, index = False)

item_table = pd.read_csv(ITEM_TABLE_FILE)
print(item_table.shape)
item_table.head()

(Output omitted: shape and first rows of Item_table.)

3. Data Cleaning

import pandas as pd
df_user = pd.read_csv('京东/User_table.csv', header = 0)
pd.options.display.float_format = '{:.3f}'.format  # Display setting: show three decimal places
df_user.describe()

(Output omitted: summary statistics of User_table.)

df_user[df_user['age'].isnull()]

(Output omitted: users with missing age.)

# Drop users with missing age/sex
delete_index = df_user[df_user['age'].isnull()].index
df_user.drop(delete_index, axis = 0, inplace = True)

# Drop users with no interaction records at all
df_naction = df_user[df_user['browse_num'].isnull() & df_user['addcart_num'].isnull() &
                     df_user['delcart_num'].isnull() & df_user['buy_num'].isnull() &
                     df_user['favor_num'].isnull() & df_user['click_num'].isnull()]
df_user.drop(df_naction.index, axis = 0, inplace = True)
print(len(df_user))

# Count and drop users with no purchase records
df_bzero = df_user[df_user['buy_num'] == 0]
print(len(df_bzero))
df_user = df_user[df_user['buy_num'] != 0]
df_user.describe()

105177
75694
(Output omitted: summary statistics after filtering.)
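The six chained isnull conditions used to find users with no interactions can also be written more compactly. A minimal sketch:

# Minimal sketch: drop users whose action-count columns are all missing,
# using isnull().all(axis=1) instead of chaining six conditions.
count_cols = ['browse_num', 'addcart_num', 'delcart_num', 'buy_num', 'favor_num', 'click_num']
df_user = df_user[~df_user[count_cols].isnull().all(axis = 1)]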

# Drop crawler-like and inactive users (extremely low buy/browse and buy/click ratios)
bindex = df_user[df_user['buy_browse_ratio'] < 0.0005].index
print(len(bindex))
df_user.drop(bindex, axis = 0, inplace = True)

cindex = df_user[df_user['buy_click_ratio'] < 0.0005].index
print(len(cindex))
df_user.drop(cindex, axis = 0, inplace = True)

df_user.describe()

(Output omitted: summary statistics after removing crawler-like and inactive users.)

df_user.to_csv('京东/User_table.csv', index = None)

4. Exploratory Analysis

4.1 Purchases by day of week

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

ACTION_201602_FILE = "京东/JData_Action_201602.csv"
ACTION_201603_FILE = "京东/JData_Action_201603.csv"
ACTION_201604_FILE = "京东/JData_Action_201604.csv"
COMMENT_FILE = "京东/JData_Comment.csv"
PRODUCT_FILE = "京东/JData_Product.csv"
USER_FILE = "京东/JData_User.csv"
USER_TABLE_FILE = "京东/User_table.csv"
ITEM_TABLE_FILE = "京东/Item_table.csv"

# Extract purchase (type=4) action records
def get_from_action_data(fname, chunk_size = 50000):
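    # Sketch: the rest of this function follows the same chunked-reading
    # pattern as the earlier get_from_action_data functions, keeping only
    # purchase rows (type == 4); the selected columns are an assumption.
    reader = pd.read_csv(fname, header = 0, iterator = True)
    chunks = []
    loop = True
    while loop:
        try:
            chunk = reader.get_chunk(chunk_size)[['user_id', 'sku_id', 'time', 'type']]
            chunks.append(chunk)
        except StopIteration:
            loop = False
            print('Iteration is stopped')
    df_ac = pd.concat(chunks, ignore_index = True)
    # Keep only purchase actions
    df_ac = df_ac[df_ac['type'] == 4]
    return df_ac[['user_id', 'sku_id', 'time']]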