机器学习-天池新人赛(离线赛)--初步数据分析

最新推荐文章于 2020-05-28 19:12:31 发布

monkey_susu

最新推荐文章于 2020-05-28 19:12:31 发布

阅读量787

点赞数

分类专栏：数据分析文章标签：数据分析机器学习

本文链接：https://blog.csdn.net/sinat_37935727/article/details/104955844

版权

数据分析专栏收录该内容

14 篇文章 4 订阅

订阅专栏

import numpy as np
import pandas as pd
import math
from sklearn.metrics import f1_score
idx = pd.IndexSlice

%matplotlib inline

# 2. Load the data
# Read the two competition CSV files (the "train_user" and "train_item" tables) from the
# local ./fresh_comp_offline directory and preview the first rows of the former.
actions = pd.read_csv("./fresh_comp_offline/tianchi_fresh_comp_train_user.csv")
items = pd.read_csv("./fresh_comp_offline/tianchi_fresh_comp_train_item.csv")
actions.head()

	user_id	item_id	behavior_type	user_geohash	item_category	time
0	10001082	285259775	1	97lk14c	4076	2014-12-08 18
1	10001082	4368907	1	NaN	5503	2014-12-12 12
2	10001082	4368907	1	NaN	5503	2014-12-12 12
3	10001082	53616768	1	NaN	9762	2014-12-02 15
4	10001082	151466952	1	NaN	5232	2014-12-12 11

# 读取并且转换actions表， 用户的所有的行为
# TODO: 暂时忽略所有的geo信息

def _index_by_first_seen(values, name):
    """Assign dense integer codes to the distinct entries of ``values``.

    Codes start at 0 and follow the order in which each distinct value first
    appears in ``values``.

    Args:
        values: named Series of raw identifiers.
        name: column name to give the generated integer code.

    Returns:
        A one-column DataFrame indexed by the raw identifier (the index is
        named ``values.name``) whose single column ``name`` holds the code.
    """
    lookup = (
        values.drop_duplicates()
        .reset_index(drop=True)
        .reset_index()
        .set_index(values.name)
    )
    lookup.columns = [name]
    return lookup


def prepare_data(actions, items):
    """Re-index the raw behaviour log and item table with dense integer ids.

    ``user_id``, ``item_id`` and ``item_category`` in ``actions`` are replaced
    by the zero-based codes ``user``, ``item`` and ``category``; ``time`` is
    parsed to datetimes and split into ``date`` and ``hour`` columns.
    Neither input frame is modified.

    Args:
        actions: DataFrame with columns ``user_id``, ``item_id``,
            ``behavior_type``, ``user_geohash``, ``item_category``, ``time``.
        items: DataFrame with at least ``item_id`` and ``item_category``.

    Returns:
        Tuple ``(actions, items, user_index, item_ids, category)``:
        the re-indexed actions (columns ``user, item, behavior_type, category,
        time, user_geohash, date, hour``), the item table with ``item_id`` and
        ``item_category`` dropped and an ``item`` code added (NaN for items
        that never occur in ``actions``), and the three raw-id -> code lookup
        tables.
    """
    # index user
    user_index = _index_by_first_seen(actions.user_id, 'user')
    actions = pd.merge(actions, user_index, left_on='user_id', right_index=True, how='left')

    # index item (the same codes are attached to the item table)
    item_ids = _index_by_first_seen(actions.item_id, 'item')
    actions = pd.merge(actions, item_ids, left_on='item_id', right_index=True, how='left')
    items = pd.merge(items, item_ids, left_on='item_id', right_index=True, how='left')

    # index category
    category = _index_by_first_seen(actions.item_category, 'category')
    actions = pd.merge(actions, category, left_on='item_category', right_index=True, how='left')

    # drop the raw identifiers now that the dense codes are attached
    actions = actions.drop(['user_id', 'item_id', 'item_category'], axis=1)
    items = items.drop(['item_id', 'item_category'], axis=1)

    # reorder columns
    actions = actions.loc[:, ['user', 'item', 'behavior_type', 'category', 'time', 'user_geohash']]

    # Parse the timestamp here, on the frame built by the merges above, so the
    # caller's DataFrame is never written to; then derive date and hour.
    actions['time'] = pd.to_datetime(actions['time'])
    actions['date'] = actions.time.dt.date
    actions['hour'] = actions.time.dt.hour
    return actions, items, user_index, item_ids, category

# Rebind actions/items to their re-indexed versions and keep the user and item lookup
# tables; the category lookup table (last return value) is discarded.
actions, items, user_index, item_ids, _ = prepare_data(actions, items)

actions.head()

	item	behavior_type	category	time	user_geohash	date	hour
0	0	1	0	2014-12-08 18:00:00	97lk14c	2014-12-08	18
1	1	1	1	2014-12-12 12:00:00	NaN	2014-12-12	12
2	1	1	1	2014-12-12 12:00:00	NaN	2014-12-12	12
3	2	1	2	2014-12-02 15:00:00	NaN	2014-12-02	15
4	3	1	3	2014-12-12 11:00:00	NaN	2014-12-12	11

items.head()

	item_geohash	item
0	NaN	1185692
1	NaN	2222915
2	NaN	2623414
3	NaN	1772057
4	NaN	2634707

# 2. Explore the data
# Compare the distinct geohashes found on items with those found on user actions.
# NOTE(review): `geo` is built here but not read again in this cell.
geo = pd.concat([items.item_geohash, actions.user_geohash]).drop_duplicates()
item_geo = items.item_geohash.drop_duplicates().dropna()
print("商品的geo去重后总数的统计", item_geo.count())
action_geo = actions.user_geohash.drop_duplicates().dropna()
print("用户行为的geo去重后总数的统计",action_geo.count())
# Overlap ratios: shared geohashes as a fraction of the user-action geohashes,
# then as a fraction of the item geohashes.
print("商品与用户行为的geo去重后总数的统计:\n", 
      "交集 / 用户行为geo:",
      len(action_geo[action_geo.isin(item_geo)]) / len(action_geo),
      "\n交集 / 商品geo:",
      len(item_geo[item_geo.isin(action_geo)]) / len(item_geo)
     )
del item_geo
del action_geo
# NOTE(review): the original remark here said that user and item locations match in most
# cases with only a few mismatches. The ratios this cell prints are the evidence to check
# that against; in the run recorded with this notebook they were roughly 0.025 and 0.45,
# which does not support "most" — re-check the conclusion.

商品的geo去重后总数的统计 57358
用户行为的geo去重后总数的统计 1018981
商品与用户行为的geo去重后总数的统计:
 交集 / 用户行为geo: 0.025223237724746585 
交集 / 商品geo: 0.44809791136371563

# How many distinct geohashes is each user seen at?
user_geo = actions[['user', 'user_geohash']].dropna()
print("用户行为带有geohash的数量", len(user_geo))
user_geo = user_geo.drop_duplicates()
print("用户行为带有geohash的数量(去重后)", len(user_geo))
# One row per distinct (user, geohash) pair, so summing the constant column counts
# distinct geohashes per user.
geo_per_user = user_geo.assign(c=1)[['user', 'c']].groupby('user').sum()
print(geo_per_user.describe())
del user_geo, geo_per_user
# Reading of the summary: the median of this per-user count shows how many different
# geohashes a typical user is observed at. A large value means the user's geohash changes
# often, i.e. the geohash is fairly fine-grained and may sit some distance from an item's
# geohash.
# Open question: do two geohashes recorded close together in time imply a short distance?

用户行为带有geohash的数量 7380017
用户行为带有geohash的数量(去重后) 1257674
                  c
count  16240.000000
mean      77.442980
std       53.782759
min        1.000000
25%       42.000000
50%       68.000000
75%      103.000000
max      709.000000

# Among user actions that carry a geohash, print how many there are and the share whose
# geohash also occurs among the item geohashes.
# NOTE(review): the printed text speaks of purchases and of "all actions", but no
# behavior_type filter is applied and the denominator is the geohash-tagged subset —
# confirm whether the filter or the wording is what was intended.
df = actions[actions.user_geohash.notna()]
print("购买的时候， 有geo信息的行为数量", len(df), "占全部行为的", len(df[df.user_geohash.isin(items.item_geohash)]) / len(df))
del df

购买的时候， 有geo信息的行为数量 7380017 占全部行为的 0.03044234179948366

# 3. 提取特征
# 首先要考虑要提取哪些特征， 这些特征需要考虑体现用户、商品、商品分类、地点等特性

# 用户: 总体行为次数，还有如何体现出用户的购买爱好， 比如针对某一类商品购买的喜好？
# 商品/分类: 总体有多少用户购买， 所有用户的总体行为计数
# 分类：总共有多少商品
# 上面特征的时间特性？
# 上面物品的地理特性
# 上面物品的交叉特性， 比如某个用户特别爱购买某个商品
# 与时间相关的特性， 用户某一天的购买行为计数， 用来计算第二天是否购买

# 3.0 Keep a handle on the prepared actions
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
saved_actions = actions
print(len(actions))
actions.head()

23291027

	item	behavior_type	category	time	user_geohash	date	hour
0	0	1	0	2014-12-08 18:00:00	97lk14c	2014-12-08	18
1	1	1	1	2014-12-12 12:00:00	NaN	2014-12-12	12
2	1	1	1	2014-12-12 12:00:00	NaN	2014-12-12	12
3	2	1	2	2014-12-02 15:00:00	NaN	2014-12-02	15
4	3	1	3	2014-12-12 11:00:00	NaN	2014-12-12	11

# actions = saved_actions  # restore actions from the saved handle

# `user` is the zero-based dense user code, so its maximum is neither a record count nor
# the number of users; report the number of distinct users instead.
print("共计: {}个用户".format(actions.user.nunique()))

共计: 19999条交易记录

# #从用户来限制提取特征对数据的占用， 实在太卡了, 后续删除
# actions = actions.set_index("user").loc[:10000, :]
# actions = actions.reset_index()
# print(actions.user.max())
# actions.head()

# 3.1 User features
# Number of action rows per user, one column per behavior_type; combinations that never
# occur become 0. The builtin `int` replaces the `np.int` alias, which NumPy has removed.
user = actions.groupby(['user', 'behavior_type'])[['item']].count().unstack().fillna(0).astype(int)
user.rename(columns={'item': 'c'}, level=0, inplace=True)
user.head()

	c
behavior_type	1	2	3	4
user
0	207	0	0	4
1	456	26	1	5
2	446	1	6	8
3	800	31	1	4
4	282	0	2	0

# Number of distinct items per user and behavior_type.
# The builtin `int` replaces the `np.int` alias, which NumPy has removed.
# NOTE: running this cell more than once merges the same block again and yields
# suffixed duplicate columns (item_x / item_y).
c = actions.drop_duplicates(['user', 'behavior_type', 'item']) \
    .groupby(['user', 'behavior_type'])[['item']].count().unstack().fillna(0).astype(int)
user = user.merge(c, left_index=True, right_index=True, how='left')
user.head()

	c				item_x				item_y
behavior_type	1	2	3	4	1	2	3	4	1	2	3	4
user
0	207	0	0	4	89	0	0	4	89	0	0	4
1	456	26	1	5	148	25	1	5	148	25	1	5
2	446	1	6	8	201	1	5	8	201	1	5	8
3	800	31	1	4	321	30	1	4	321	30	1	4
4	282	0	2	0	151	0	2	0	151	0	2	0

# Number of distinct categories per user and behavior_type.
# The builtin `int` replaces the `np.int` alias, which NumPy has removed.
c = actions.drop_duplicates(['user', 'behavior_type', 'category']) \
    .groupby(['user', 'behavior_type'])[['category']].count().unstack().fillna(0).astype(int)
user = user.merge(c, left_index=True, right_index=True, how='left')
user.head()

	c				item_x				item_y				category
behavior_type	1	2	3	4	1	2	3	4	1	2	3	4	1	2	3	4
user
0	207	0	0	4	89	0	0	4	89	0	0	4	29	0	0	4
1	456	26	1	5	148	25	1	5	148	25	1	5	45	12	1	5
2	446	1	6	8	201	1	5	8	201	1	5	8	75	1	5	7
3	800	31	1	4	321	30	1	4	321	30	1	4	58	16	1	4
4	282	0	2	0	151	0	2	0	151	0	2	0	53	0	2	0

# Flatten the two-level column index into u0, u1, ... Names are generated from the actual
# column count: a fixed 16 only matches a frame in which the distinct-item block was merged
# twice; a single pass over the cells above produces 12 columns and would raise here.
user = pd.DataFrame(user.values, index=user.index, columns=["u{}".format(i) for i in range(user.shape[1])])
user.head()

	u0	u1	u2	u3	u4	u5	u6	u7	u8	u9	u10	u11	u12	u13	u14	u15
user
0	207	0	0	4	89	0	0	4	89	0	0	4	29	0	0	4
1	456	26	1	5	148	25	1	5	148	25	1	5	45	12	1	5
2	446	1	6	8	201	1	5	8	201	1	5	8	75	1	5	7
3	800	31	1	4	321	30	1	4	321	30	1	4	58	16	1	4
4	282	0	2	0	151	0	2	0	151	0	2	0	53	0	2	0

# Scale every column by its own (mean + 3 * standard deviation).
user = user.div(user.mean() + 3 * user.std())
user.head()

	u0	u1	u2	u3	u4	u5	u6	u7	u8	u9	u10	u11	u12	u13	u14	u15
user
0	0.039515	0.000000	0.000000	0.066647	0.042137	0.000000	0.000000	0.082082	0.042137	0.000000	0.000000	0.082082	0.096836	0.000000	0.000000	0.115024
1	0.087048	0.108395	0.004484	0.083308	0.070070	0.111353	0.005754	0.102603	0.070070	0.111353	0.005754	0.102603	0.150263	0.186512	0.015255	0.143780
2	0.085139	0.004169	0.026905	0.133293	0.095163	0.004454	0.028768	0.164165	0.095163	0.004454	0.028768	0.164165	0.250439	0.015543	0.076277	0.201293
3	0.152716	0.129241	0.004484	0.066647	0.151976	0.133623	0.005754	0.082082	0.151976	0.133623	0.005754	0.082082	0.193673	0.248682	0.015255	0.115024
4	0.053833	0.000000	0.008968	0.000000	0.071490	0.000000	0.011507	0.000000	0.071490	0.000000	0.011507	0.000000	0.176977	0.000000	0.030511	0.000000

# user.to_csv("user.csv")
# del user

# 3.2 Item features
# Number of action rows per item, one column per behavior_type; missing combinations
# become 0. The builtin `int` replaces the `np.int` alias, which NumPy has removed.
good = actions.groupby(['item', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(int)
good.rename(columns={'user': 'c'}, level=0, inplace=True)
good.head()

	c
behavior_type	1	2	3	4
item
0	78	1	1	0
1	4	0	0	0
2	87	0	3	2
3	3	0	0	0
4	7	0	0	0

# Number of distinct users per item and behavior_type.
# The builtin `int` replaces the `np.int` alias, which NumPy has removed.
c = actions.drop_duplicates(['user', 'behavior_type', 'item']) \
    .groupby(['item', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(int)
good = good.merge(c, left_index=True, right_index=True, how='left')
good.head()

	c				user
behavior_type	1	2	3	4	1	2	3	4
item
0	78	1	1	0	36	1	1	0
1	4	0	0	0	2	0	0	0
2	87	0	3	2	22	0	3	2
3	3	0	0	0	1	0	0	0
4	7	0	0	0	3	0	0	0

# Replace the two-level column index with the flat names g0 .. g7.
good = pd.DataFrame(
    data=good.values,
    columns=[f"g{i}" for i in range(8)],
    index=good.index,
)
good.head()

	g0	g1	g2	g3	g4	g5	g6	g7
item
0	78	1	1	0	36	1	1	0
1	4	0	0	0	2	0	0	0
2	87	0	3	2	22	0	3	2
3	3	0	0	0	1	0	0	0
4	7	0	0	0	3	0	0	0

# Scale every column by its own (mean + 3 * standard deviation).
good = good.div(good.mean() + 3 * good.std())
good.head()

	g0	g1	g2	g3	g4	g5	g6	g7
item
0	1.876140	0.762983	0.533016	0.000000	2.389785	0.825291	0.688164	0.000000
1	0.096212	0.000000	0.000000	0.000000	0.132766	0.000000	0.000000	0.000000
2	2.092618	0.000000	1.599047	1.832312	1.460424	0.000000	2.064493	2.381005
3	0.072159	0.000000	0.000000	0.000000	0.066383	0.000000	0.000000	0.000000
4	0.168372	0.000000	0.000000	0.000000	0.199149	0.000000	0.000000	0.000000

# good.to_csv("good.csv")
# del good

# 3.3 Category features
# Number of action rows per category, one column per behavior_type; missing combinations
# become 0. The builtin `int` replaces the `np.int` alias, which NumPy has removed.
cat = actions.groupby(['category', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(int)
cat.rename(columns={'user': 'c'}, level=0, inplace=True)
cat.head()

	c
behavior_type	1	2	3	4
category
0	8168	130	125	44
1	134719	3399	3470	690
2	7419	118	128	74
3	271839	5117	8852	3049
4	250652	5567	7268	1678

# Number of distinct users per category and behavior_type.
# The builtin `int` replaces the `np.int` alias, which NumPy has removed.
c = actions.drop_duplicates(['user', 'behavior_type', 'category']) \
    .groupby(['category', 'behavior_type'])[['user']].count().unstack().fillna(0).astype(int)
cat = cat.merge(c, left_index=True, right_index=True, how='left')
cat.head()

	c				user
behavior_type	1	2	3	4	1	2	3	4
category
0	8168	130	125	44	923	76	63	42
1	134719	3399	3470	690	6445	1190	1259	506
2	7419	118	128	74	556	51	70	65
3	271839	5117	8852	3049	9766	1748	2877	1873
4	250652	5567	7268	1678	8869	1716	2325	1184

# Number of distinct items per category and behavior_type.
# The builtin `int` replaces the `np.int` alias, which NumPy has removed.
c = actions.drop_duplicates(['item', 'behavior_type', 'category']) \
    .groupby(['category', 'behavior_type'])[['item']].count().unstack().fillna(0).astype(int)
cat = cat.merge(c, left_index=True, right_index=True, how='left')
cat.head()

	c				user				item
behavior_type	1	2	3	4	1	2	3	4	1	2	3	4
category
0	8168	130	125	44	923	76	63	42	1354	99	92	35
1	134719	3399	3470	690	6445	1190	1259	506	34516	2803	2389	571
2	7419	118	128	74	556	51	70	65	1386	93	95	59
3	271839	5117	8852	3049	9766	1748	2877	1873	42985	3693	4811	1821
4	250652	5567	7268	1678	8869	1716	2325	1184	55323	4512	4763	1319

# Replace the two-level column index with the flat names c0 .. c11.
cat = pd.DataFrame(
    data=cat.values,
    columns=[f"c{i}" for i in range(12)],
    index=cat.index,
)
cat.head()

	c0	c1	c2	c3	c4	c5	c6	c7	c8	c9	c10	c11
category
0	8168	130	125	44	923	76	63	42	1354	99	92	35
1	134719	3399	3470	690	6445	1190	1259	506	34516	2803	2389	571
2	7419	118	128	74	556	51	70	65	1386	93	95	59
3	271839	5117	8852	3049	9766	1748	2877	1873	42985	3693	4811	1821
4	250652	5567	7268	1678	8869	1716	2325	1184	55323	4512	4763	1319

# Scale every column by its own (mean + 3 * standard deviation).
cat = cat.div(cat.mean() + 3 * cat.std())
cat.head()

	c0	c1	c2	c3	c4	c5	c6	c7	c8	c9	c10	c11
category
0	0.133160	0.094975	0.084855	0.108459	0.437772	0.227932	0.144700	0.166593	0.111956	0.093267	0.093912	0.122399
1	2.196275	2.483237	2.355563	1.700830	3.056815	3.568934	2.891698	2.007054	2.853974	2.640682	2.438651	1.996850
2	0.120949	0.086208	0.086891	0.182408	0.263707	0.152954	0.160778	0.257823	0.114602	0.087614	0.096974	0.206329
3	4.431693	3.738372	6.009060	7.515698	4.631941	5.242434	6.607956	7.429271	3.554238	3.479143	4.910988	6.368237
4	4.086289	4.067132	4.933783	4.136222	4.206500	5.146463	5.340110	4.696347	4.574413	4.250716	4.861990	4.612688

# cat.to_csv('cat.csv')

# del cat
del c

# 3.4 时间特性
# 3.5 地理特性
# 3.6 交叉特性
# 3.7 24小时内的动作

def _load_indexed(path):
    """Read the CSV file at ``path`` using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


def read_csv():
    """Load ``user.csv`` from the working directory, indexed by its first column."""
    return _load_indexed("user.csv")


def read_good():
    """Load ``good.csv`` from the working directory, indexed by its first column."""
    return _load_indexed("good.csv")


def read_cat():
    """Load ``cat.csv`` from the working directory, indexed by its first column."""
    return _load_indexed("cat.csv")


def read_label():
    """Load ``label.csv`` from the working directory, indexed by its first column."""
    return _load_indexed("label.csv")

# Label: did the user buy this item on the following day?
# Take the purchase rows (behavior_type == 4) and move each one back a day, so that it
# lines up with the feature row of the day before the purchase.
label = actions.loc[actions.behavior_type == 4, ['date', 'user', 'category', 'item']].copy()
label['date'] = pd.to_datetime(label['date']) - np.timedelta64(1, 'D')
print(label.date.dtypes)
label['buy'] = 1
# De-duplicate while date and user are still ordinary columns: drop_duplicates() ignores
# the index, so de-duplicating after set_index(['date', 'user']) would collapse purchases
# of the same item by different users, or on different days, into a single row.
label = label.drop_duplicates().set_index(['date', 'user', 'category', 'item'])
label.head()

datetime64[ns]

				buy
date	user	category	item
2014-12-01	0	2	2	1
2014-12-13	0	9	13	1
2014-12-01	0	8	11	1
2014-12-01	0	27	59	1
2014-12-12	1	30	90	1

# label.to_csv("label.csv")
# del label

# read_label().head()

# Per-day action counts for every (date, user, category, item), one column per behavior_type.
d_action = actions.copy()
d_action['d']  = 1
d_action.date = pd.to_datetime(d_action.date)
# Select the counter column before aggregating: a bare .sum() also tries to add up the
# timestamp and geohash columns, which is wasted work and fails outright on pandas versions
# that no longer drop non-summable columns silently.
d_action = d_action.groupby([ 'date', 'user', 'category', 'item', 'behavior_type'])[['d']].sum()
# Scale by (mean + 3 * std) of the counts; this happens before unstacking, so a single
# scale is shared by all behaviour types.
d_action = d_action / (d_action.mean() + d_action.std() * 3)
d_action = d_action.unstack().fillna(0).astype(np.float32)
d_action.columns = d_action.columns.droplevel(0)
# Assumes all four behaviour types 1-4 are present, giving exactly four columns.
d_action.columns = ['d_t{}'.format(i) for i in range(1, 5, 1)]
d_action.head()

				d_t1	d_t2	d_t3	d_t4
date	user	category	item
2014-11-18	1	46	129	0.342359	0.0	0.0	0.0
		49	163	0.342359	0.0	0.0	0.0
		58	176	0.342359	0.0	0.0	0.0
	3	5	478	0.342359	0.0	0.0	0.0
	3	139	445	0.513539	0.0	0.0	0.0

# d_action.to_csv('d_action.csv')

# pd.read_csv('d_action.csv', index_col=0).dtypes

# Per-hour action counts of each user in the last three hours of the day.
# The full table is too large, so only hours 21-23 are kept; filtering before copying
# avoids duplicating the rows that are discarded anyway.
x_action = actions.loc[actions.hour.isin([23, 22, 21])].copy()
x_action['c'] = 1
# One conversion suffices: the result is a datetime64 value at midnight of each day.
x_action['date'] = pd.to_datetime(x_action['date'])
# Aggregate only the counter column; a bare .sum() would also try to add up the timestamp
# and geohash columns.
x_action = x_action.groupby([ 'date', 'user', 'category', 'item', 'hour', 'behavior_type'])[['c']].sum()
# Spread behavior_type into columns so each type is scaled by its own (mean + 3 * std),
# then fold it back into the row index.
x_action = x_action.unstack()
x_action = x_action / (x_action.mean() + x_action.std() * 3)
x_action = x_action.stack().astype(np.float32)
x_action.head()

						c
date	user	category	item	hour	behavior_type
2014-11-18	3	139	445	21	1	0.559228
			487	21	1	0.186409
			643	21	1	0.186409
	5	20	1043	22	1	0.372819
	5	20	1070	22	1	0.372819

# Spread (hour, behavior_type) into columns.
x_action = x_action.unstack(['hour', 'behavior_type'], fill_value=0)
x_action.columns = x_action.columns.droplevel(0)
# Reindex against the full hour x behaviour-type grid so the frame always has the same
# 12 columns in hour-major order, even when some combination never occurs in the data.
# Rebuilding the frame from .values with a fixed label list would instead raise on a
# missing combination.
hour_type_grid = pd.MultiIndex.from_product(
    [range(21, 24, 1), range(1, 5, 1)], names=['hour', 'behavior_type'])
x_action = x_action.reindex(columns=hour_type_grid, fill_value=0).fillna(0).astype(np.float32)
# Flatten to h<hour>_<behavior_type>, e.g. h21_1.
x_action.columns = ["h{}_{}".format(h, t) for h, t in x_action.columns]
x_action.head()

				h21_1	h21_2	h21_3	h21_4	h22_1	h22_2	h22_3	h22_4	h23_1	h23_2	h23_3	h23_4
date	user	category	item
2014-11-18	3	139	445	0.559228	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0
			487	0.186409	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0
			643	0.186409	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0
	5	20	1043	0.000000	0.0	0.0	0.0	0.372819	0.0	0.0	0.0	0.0	0.0	0.0	0.0
	5	20	1070	0.000000	0.0	0.0	0.0	0.372819	0.0	0.0	0.0	0.0	0.0	0.0	0.0

# Attach the late-evening hourly features to the daily counts; rows with no activity in
# hours 21-23 get zeros.
x_action = d_action.join(x_action, how='left').fillna(0)
x_action.head()

				d_t1	d_t2	d_t3	d_t4	h21_1	h21_2	h21_3	h21_4	h22_1	h22_2	h22_3	h22_4	h23_1	h23_2	h23_3	h23_4
date	user	category	item
2014-11-18	1	46	129	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
		49	163	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
		58	176	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
	3	5	478	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
	3	139	445	0.513539	0.0	0.0	0.0	0.559228	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

# Join features (x) with the label (y). The left join drops purchases that had no action
# on the preceding day. It therefore also drops the case where a user browsed several
# cheap items and then bought a different best-seller from the same category.
# TODO: handle that case later.
x_action = x_action.merge(label, how='left', left_index=True, right_index=True).fillna(0)
x_action['buy'] = x_action['buy'].astype(np.int8)
x_action.head()

				d_t1	d_t2	d_t3	d_t4	h21_1	h21_2	h21_3	h21_4	h22_1	h22_2	h22_3	h22_4	h23_1	h23_2	h23_3	h23_4	buy
date	user	category	item
2014-11-18	1	46	129	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
		49	163	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
		58	176	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
	3	5	478	0.342359	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
	3	139	445	0.513539	0.0	0.0	0.0	0.559228	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0

# Attach the user, item and category feature tables to every action row.
x_action = x_action.reset_index()
# Each feature table sits on the left of its merge, so the most recently merged table's
# columns end up first.
for feature_table, key in ((user, 'user'), (good, 'item'), (cat, 'category')):
    x_action = feature_table.merge(x_action, left_index=True, right_on=key, how='right')
del feature_table, key
x_action = x_action.set_index(['date', 'user', 'category', 'item'])
x_action.head()

				c0	c1	c2	c3	c4	c5	c6	c7	c8	c9	...	h21_4	h22_1	h22_2	h22_3	h22_4	h23_1	h23_2	h23_3	h23_4	buy
date	user	category	item
2014-11-18	1	46	129	0.218716	0.149038	0.156811	0.300727	1.252132	0.437869	0.349117	0.424416	0.116091	0.110225	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
		49	163	2.285206	2.003978	2.334519	3.026985	3.575691	3.059086	3.169614	3.962542	2.383990	2.098980	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
		58	176	0.149577	0.097898	0.171746	0.340166	0.492316	0.209937	0.264134	0.456149	0.072846	0.092325	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
	3	5	478	9.422435	9.349217	9.173796	5.410612	4.985289	7.713696	7.315377	5.906132	9.139068	9.315400	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0
	3	139	445	2.903059	2.843413	1.963195	1.099377	2.688290	2.945120	1.952298	0.995594	2.503222	2.730181	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0

5 rows × 53 columns

#x_action.to_csv("x_action.csv") #数据量太大， 写入非常的慢， 如何破这个问题呢？

#pd.read_csv("x_action.csv").head()

# 3.8 优化方向
# 3.8.1 以后可能考虑加入噪音层， 不然， 某个用户可能存在只是查看了一次， 买了一次， 就被网络记忆成必买的用户

# 3.8.2 如何按组来训练， 毕竟用户一般是看一类商品， 然后选择其中一个商品来购买

# 3.8.3 目前采用的"归一化"(每列除以 mean + 3*std)是否合理， 是否有更好或者更加通用的数据处理方式， 或者直接用标准的normalization(如z-score)是否更好