1. Learn to write and debug Python code with Jupyter Notebook in Anaconda
Start Jupyter Notebook from the terminal and open the link: http://localhost:8888/tree#
Create a new Python 3 notebook and write and debug the code cell by cell, remembering to save as you go.
2. Learn machine learning on Alibaba Cloud Tianchi
Study machine learning topics there, take part in the forums and challenge competitions, and learn how data-mining contests are done by running and debugging other people's code.
Tips: find resources in the forums, learn from videos, practice hands-on, read, think and ask questions often; knowing how to search effectively (Baidu, plus a VPN when needed) matters a lot; keep trying, don't give up, and dare to take on challenges.
3. Hands-on: <Chapter 1: O2O Coupon Usage Beginner Competition - First Attempt>
I got my first small data-mining project running end to end, gained a rough idea of the workflow, methods, and thinking behind data-mining competitions, and learned about some commonly used packages.
Following the tutorial 《生活大实惠:O2O优惠券使用预测》, the code was debugged successfully.
The code is as follows:
# coding: utf-8
# In[1]:
# import libraries necessary for this project
import os, sys, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
# display for this notebook
get_ipython().magic('matplotlib inline')
get_ipython().magic("config InlineBackend.figure_format = 'retina'")
# In[2]:
# raw strings avoid backslash-escape issues in the Windows paths;
# keep_default_na=False keeps the literal 'null' strings that the code below compares against
dfoff = pd.read_csv(r'G:\code\o2o\ccf_offline_stage1_train.csv', keep_default_na=False)
dftest = pd.read_csv(r'G:\code\o2o\ccf_offline_stage1_test_revised.csv', keep_default_na=False)
dfon = pd.read_csv(r'G:\code\o2o\ccf_online_stage1_train.csv', keep_default_na=False)
dfoff.head(5)
# In[3]:
dfoff.info()
# In[4]:
print('Coupon received, purchase made:', dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] != 'null')].shape[0])
print('No coupon, purchase made:', dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] != 'null')].shape[0])
print('Coupon received, no purchase:', dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] == 'null')].shape[0])
print('No coupon, no purchase:', dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] == 'null')].shape[0])
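# The counts above can also be turned into an overall redemption rate.
# Minimal sketch; the intermediate variable names here are illustrative only.
received = dfoff[dfoff['Date_received'] != 'null'].shape[0]
used = dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] != 'null')].shape[0]
print('overall coupon redemption rate: %.4f' % (used / received))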
# In[5]:
# users that appear in the test set but not in the training set
print('1. User_id in test set but not in training set:', set(dftest['User_id']) - set(dfoff['User_id']))
# merchants that appear in the test set but not in the training set
print('2. Merchant_id in test set but not in training set:', set(dftest['Merchant_id']) - set(dfoff['Merchant_id']))
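# Printing the full sets can flood the output when there are many unseen ids;
# a small sketch that only reports how many there are.
print('new users in test set:', len(set(dftest['User_id']) - set(dfoff['User_id'])))
print('new merchants in test set:', len(set(dftest['Merchant_id']) - set(dfoff['Merchant_id'])))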
# In[6]:
print('Discount_rate unique values:', dfoff['Discount_rate'].unique())
print('Distance unique values:', dfoff['Distance'].unique())
# In[7]:
# convert Discount_rate and Distance
def getDiscountType(row):
    # 'null' if no coupon, 1 for a "man:jian" (spend x, get y off) coupon, 0 for a direct rate
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1]) / float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    # spending threshold x of an "x:y" coupon, 0 otherwise
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    # reduction y of an "x:y" coupon, 0 otherwise
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    # convert discount_rate
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    print(df['discount_rate'].unique())
    # convert distance
    df['distance'] = df['Distance'].replace('null', -1).astype(int)
    print(df['distance'].unique())
    return df
dfoff = processData(dfoff)
dftest = processData(dftest)
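# Quick sanity check of the conversion helpers on a "man:jian" style value.
# Minimal sketch; '150:20' is just an illustrative example string.
print(convertRate('150:20'))                                   # 1 - 20/150 ≈ 0.867
print(getDiscountMan('150:20'), getDiscountJian('150:20'))     # 150 20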
# In[8]:
dfoff.head(2)
# In[9]:
dftest.head(2)
# In[10]:
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[date_received != 'null'])
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[date_buy != 'null'])
print('Coupons were received from', date_received[0], 'to', date_received[-1])
print('Purchases were made from', date_buy[0], 'to', date_buy[-1])
# In[11]:
couponbydate = dfoff[dfoff['Date_received'] != 'null'][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received','count']
buybydate = dfoff[(dfoff['Date'] != 'null') & (dfoff['Date_received'] != 'null')][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received','count']
# In[13]:
sns.set_style('ticks')
sns.set_context("notebook", font_scale= 1.4)
plt.figure(figsize = (12,8))
date_received_dt = pd.to_datetime(date_received, format='%Y%m%d')
plt.subplot(211)
plt.bar(date_received_dt, couponbydate['count'], label = 'number of coupon received' )
plt.bar(date_received_dt, buybydate['count'], label = 'number of coupon used')
plt.yscale('log')
plt.ylabel('Count')
plt.legend()
plt.subplot(212)
plt.bar(date_received_dt, buybydate['count']/couponbydate['count'])
plt.ylabel('Ratio(coupon used/coupon received)')
plt.tight_layout()
# In[14]:
def getWeekday(row):
    # return the weekday (1 = Monday, ..., 7 = Sunday) of a 'yyyymmdd' string, or 'null'
    if row == 'null':
        return row
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1

dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

# weekday_type: 1 for Saturday/Sunday, 0 otherwise
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
# In[15]:
# change weekday to one-hot encoding
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)
tmpdf = pd.get_dummies(dfoff['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf
tmpdf = pd.get_dummies(dftest['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf
# In[16]:
def label(row):
    # -1: no coupon received; 1: coupon used within 15 days of receipt; 0: otherwise
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        td = pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff['label'] = dfoff.apply(label, axis=1)
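# Worked example of the 15-day rule on two hand-made rows (for illustration only):
# coupon received 2016-01-01 and used 2016-01-10 -> positive; used 2016-02-01 -> negative.
print(label({'Date_received': '20160101', 'Date': '20160110'}))  # 1
print(label({'Date_received': '20160101', 'Date': '20160201'}))  # 0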
# In[17]:
print(dfoff['label'].value_counts())
# In[18]:
print('Existing columns:', dfoff.columns.tolist())
# In[19]:
dfoff.head(2)
# In[20]:
# data split
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < '20160516')].copy()
valid = df[(df['Date_received'] >= '20160516') & (df['Date_received'] <= '20160615')].copy()
print(train['label'].value_counts())
print(valid['label'].value_counts())
# In[21]:
# feature
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)
# In[22]:
# model1
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    classifier = lambda: SGDClassifier(
        loss='log',            # logistic-regression loss, so predict_proba is available
        penalty='elasticnet',
        fit_intercept=True,
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)

    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1)
    grid_search = grid_search.fit(data[predictors], data['label'])

    return grid_search

if not os.path.isfile('1_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('1_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('1_model.pkl', 'rb') as f:
        model = pickle.load(f)
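# Whether the model was just trained or loaded from the pickle, the selected
# hyperparameters and cross-validation score can be inspected via the fitted
# GridSearchCV attributes (a small convenience check, not part of the original flow).
print(model.best_params_)
print(model.best_score_)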
# In[24]:
# valid predict
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(2)
# In[25]:
# avgAUC calculation
vg = valid1.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1]
    # skip coupons whose validation rows are all-positive or all-negative
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
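# The per-coupon averaged AUC mirrors the competition's coupon-level evaluation metric;
# a small sketch wrapping the loop above into a reusable helper (the function name is my own).
def coupon_avg_auc(df, label_col='label', prob_col='pred_prob'):
    aucs = []
    for _, group in df.groupby('Coupon_id'):
        if len(group[label_col].unique()) != 2:
            continue
        fpr, tpr, _ = roc_curve(group[label_col], group[prob_col], pos_label=1)
        aucs.append(auc(fpr, tpr))
    return np.average(aucs)

print(coupon_avg_auc(valid1))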
# In[26]:
# test prediction for submission
y_test_pred = model.predict_proba(dftest[predictors])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit1.csv', index=False, header=False)
dftest1.head()
# In[27]:
feature = dfoff[(dfoff['Date'] < '20160516') | ((dfoff['Date'] == 'null') & (dfoff['Date_received'] < '20160516'))].copy()
data = dfoff[(dfoff['Date_received'] >= '20160516') & (dfoff['Date_received'] <= '20160615')].copy()
print(data['label'].value_counts())
# In[28]:
fdf = feature.copy()
# In[29]:
# key of user
u = fdf[['User_id']].copy().drop_duplicates()
# In[30]:
# u_coupon_count : num of coupon received by user
u1 = fdf[fdf['Date_received'] != 'null'][['User_id']].copy()
u1['u_coupon_count'] = 1
u1 = u1.groupby(['User_id'], as_index = False).count()
u1.head(2)
# In[31]:
# u_buy_count : times of user buy offline (with or without coupon)
u2 = fdf[fdf['Date'] != 'null'][['User_id']].copy()
u2['u_buy_count'] = 1
u2 = u2.groupby(['User_id'], as_index = False).count()
u2.head(2)
# In[32]:
# u_buy_with_coupon : times of user buy offline (with coupon)
u3 = fdf[((fdf['Date'] != 'null') & (fdf['Date_received'] != 'null'))][['User_id']].copy()
u3['u_buy_with_coupon'] = 1
u3 = u3.groupby(['User_id'], as_index = False).count()
u3.head(2)
# In[33]:
# u_merchant_count : num of merchant user bought from
u4 = fdf[fdf['Date'] != 'null'][['User_id', 'Merchant_id']].copy()
u4.drop_duplicates(inplace = True)
u4 = u4.groupby(['User_id'], as_index = False).count()
u4.rename(columns = {'Merchant_id':'u_merchant_count'}, inplace = True)
u4.head(2)
# In[34]:
# u_min_distance
utmp = fdf[(fdf['Date'] != 'null') & (fdf['Date_received'] != 'null')][['User_id', 'distance']].copy()
utmp.replace(-1, np.nan, inplace = True)
u5 = utmp.groupby(['User_id'], as_index = False).min()
u5.rename(columns = {'distance':'u_min_distance'}, inplace = True)
u6 = utmp.groupby(['User_id'], as_index = False).max()
u6.rename(columns = {'distance':'u_max_distance'}, inplace = True)
u7 = utmp.groupby(['User_id'], as_index = False).mean()
u7.rename(columns = {'distance':'u_mean_distance'}, inplace = True)
u8 = utmp.groupby(['User_id'], as_index = False).median()
u8.rename(columns = {'distance':'u_median_distance'}, inplace = True)
u8.head(2)
# In[35]:
u.shape, u1.shape, u2.shape, u3.shape, u4.shape, u5.shape, u6.shape, u7.shape, u8.shape
# In[36]:
# merge all the features on key User_id
user_feature = pd.merge(u, u1, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u2, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u3, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u4, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u5, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u6, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u7, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u8, on = 'User_id', how = 'left')