Notes from my first competition

Baseline code for the Tianchi beginner contest (O2O coupon usage prediction)

This is my first competition; the baseline uses a random forest, and an improved version will follow.
The main gaps: class imbalance is not handled yet, and the data preprocessing is still rough.
Biggest question: why is the offline AUC as high as 0.998? (Most likely label leakage: the is_used feature is built from the same Date column as the label; see the note after the evaluation below.)
If that number were real I'd be heading straight for No. 1... dream on.

Takeaway: what you learn on paper always feels shallow; it takes hands-on practice.

The data-processing part follows another blog post (click here).

import pandas as pd
import numpy as np
%matplotlib inline
train_online = pd.read_csv('G:\毕业设计\TianChio2o\ccf_online_stage1_train.csv',engine="python")#the path contains Chinese characters, so use engine="python" (the default C engine can't handle it)
train_offline = pd.read_csv('G:\毕业设计\TianChio2o\ccf_offline_stage1_train.csv',engine="python")
test = pd.read_csv('G:\毕业设计\TianChio2o\ccf_offline_stage1_test_revised.csv',engine="python")
train_online.head(10)#take a look at the data
   User_id   Merchant_id  Action  Coupon_id  Discount_rate  Date_received        Date
0  13740231  18907        2       100017492  500:50         20160513.0     NaN
1  13740231  34805        1       NaN        NaN            NaN            20160321.0
2  14336199  18907        0       NaN        NaN            NaN            20160618.0
3  14336199  18907        0       NaN        NaN            NaN            20160618.0
4  14336199  18907        0       NaN        NaN            NaN            20160618.0
5  14336199  18907        0       NaN        NaN            NaN            20160618.0
6  14336199  18907        0       NaN        NaN            NaN            20160618.0
7  14336199  18907        0       NaN        NaN            NaN            20160618.0
8  14336199  18907        0       NaN        NaN            NaN            20160618.0
9  14336199  18907        0       NaN        NaN            NaN            20160618.0
train_offline.head(10)
   User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received        Date
0  1439408  2632         NaN        NaN            0.0       NaN            20160217.0
1  1439408  4663         11002.0    150:20         1.0       20160528.0     NaN
2  1439408  2632         8591.0     20:1           0.0       20160217.0     NaN
3  1439408  2632         1078.0     20:1           0.0       20160319.0     NaN
4  1439408  2632         8591.0     20:1           0.0       20160613.0     NaN
5  1439408  2632         NaN        NaN            0.0       NaN            20160516.0
6  1439408  2632         8591.0     20:1           0.0       20160516.0     20160613.0
7  1832624  3381         7610.0     200:20         0.0       20160429.0     NaN
8  2029232  3381         11951.0    200:20         1.0       20160129.0     NaN
9  2029232  450          1532.0     30:5           0.0       20160530.0     NaN
test.head()
   User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received
0  4129537  450          9983       30:5           1.0       20160712
1  6949378  1300         3429       30:5           NaN       20160706
2  2166529  7113         6928       200:20         5.0       20160727
3  2166529  7113         1808       100:10         5.0       20160727
4  6172162  7605         6500       30:1           2.0       20160708
train_offline.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    float64
Date             float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113640 entries, 0 to 113639
Data columns (total 6 columns):
User_id          113640 non-null int64
Merchant_id      113640 non-null int64
Coupon_id        113640 non-null int64
Discount_rate    113640 non-null object
Distance         101576 non-null float64
Date_received    113640 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 5.2+ MB
train_online.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11429826 entries, 0 to 11429825
Data columns (total 7 columns):
User_id          int64
Merchant_id      int64
Action           int64
Coupon_id        object
Discount_rate    object
Date_received    float64
Date             float64
dtypes: float64(2), int64(3), object(2)
memory usage: 610.4+ MB
#concatenate train and test so they can be processed together
all_offline = pd.concat([train_offline,test],sort=False)
all_offline.head()
   User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received        Date
0  1439408  2632         NaN        NaN            0.0       NaN            20160217.0
1  1439408  4663         11002.0    150:20         1.0       20160528.0     NaN
2  1439408  2632         8591.0     20:1           0.0       20160217.0     NaN
3  1439408  2632         1078.0     20:1           0.0       20160319.0     NaN
4  1439408  2632         8591.0     20:1           0.0       20160613.0     NaN
#count the missing values in each column
all_offline.apply(lambda x :sum(x.isnull()))
User_id                0
Merchant_id            0
Coupon_id         701602
Discount_rate     701602
Distance          118067
Date_received     701602
Date             1091540
dtype: int64
#unify the missing-value format: fill NaNs in Date with the string "null"
all_offline['Date']=all_offline["Date"].fillna("null")
#merge the online and offline data
#(note: the result of the merge below is never assigned, so the online data is effectively unused in this baseline)
pd.merge(all_offline,train_online,on=['User_id','Merchant_id'])
all_offline.head()
   User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received         Date
0  1439408  2632         NaN        NaN            0.0       NaN            2.01602e+07
1  1439408  4663         11002.0    150:20         1.0       20160528.0     null
2  1439408  2632         8591.0     20:1           0.0       20160217.0     null
3  1439408  2632         1078.0     20:1           0.0       20160319.0     null
4  1439408  2632         8591.0     20:1           0.0       20160613.0     null
all_offline = all_offline.fillna("null")#replace every remaining NaN with the string "null"
#label positive and negative samples as hinted in the problem description
# all_offline.drop(columns="is_used",inplace=True)
def is_used(column):
    #coupon received and used -> 1; received but not used -> -1; no coupon received -> 0
    if column["Coupon_id"] != 'null' and column["Date"] != 'null':
        return 1
    elif column["Coupon_id"] != 'null' and column["Date"] == 'null':
        return -1
    else :
        return 0
all_offline["is_used"] = all_offline.apply(is_used,axis=1)
all_offline.head(10)
    
   User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received         Date  is_used
0  1439408  2632         null       null           0         null           2.01602e+07        0
1  1439408  4663         11002      150:20         1         2.01605e+07    null              -1
2  1439408  2632         8591       20:1           0         2.01602e+07    null              -1
3  1439408  2632         1078       20:1           0         2.01603e+07    null              -1
4  1439408  2632         8591       20:1           0         2.01606e+07    null              -1
5  1439408  2632         null       null           0         null           2.01605e+07        0
6  1439408  2632         8591       20:1           0         2.01605e+07    2.01606e+07        1
7  1832624  3381         7610       200:20         0         2.01604e+07    null              -1
8  2029232  3381         11951      200:20         1         2.01601e+07    null              -1
9  2029232  450          1532       30:5           0         2.01605e+07    null              -1
#the raw Coupon_id value carries no meaning here, so convert it into a has-coupon flag
all_offline["has_coupopn"] = all_offline.apply(lambda x:1 if x["Coupon_id"] != "null" else 0,axis=1)#1 = coupon received, 0 = no coupon
#convert Discount_rate into a numeric value the model can use
import re
regex = re.compile(r'^\d+:\d+$')#matches "threshold:discount" strings such as "150:20"
def percent(column):
    if column["Coupon_id"]=="null" or column["Discount_rate"]== 'null':
        return "null"
    elif re.match(regex,column["Discount_rate"]):
        num_max ,num_min = column["Discount_rate"].split(":")#e.g. "150:20" -> 20 off when spending 150
        return float(num_min)/float(num_max)
    else:
        return column["Discount_rate"]#already a plain rate such as "0.95"
all_offline["discount_percent"] = all_offline.apply(percent,axis=1)
# all_offline.head(10)
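A quick sanity check of the parser (a plain dict stands in for a DataFrame row here; purely illustrative):
print(percent({"Coupon_id": "11002", "Discount_rate": "150:20"}))   # -> 0.1333... (20/150)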
#a coupon only applies after spending at least N; extract that threshold N, since it may influence whether the coupon gets used
def discount_limit(column):
    if column["Coupon_id"] == 'null' or column["Discount_rate"] == 'null':
        return "null"
    elif re.match(regex,column["Discount_rate"]):
        num_max,num_min = column["Discount_rate"].split(":")
        return num_max
    else :
        return 0#plain-rate coupons have no spending threshold
all_offline["discount_limit"] = all_offline.apply(discount_limit,axis = 1)
all_offline.head(10)
   User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received         Date  is_used  has_coupopn  discount_percent  discount_limit
0  1439408  2632         null       null           0         null           2.01602e+07        0            0  null              null
1  1439408  4663         11002      150:20         1         2.01605e+07    null              -1            1  0.133333          150
2  1439408  2632         8591       20:1           0         2.01602e+07    null              -1            1  0.05              20
3  1439408  2632         1078       20:1           0         2.01603e+07    null              -1            1  0.05              20
4  1439408  2632         8591       20:1           0         2.01606e+07    null              -1            1  0.05              20
5  1439408  2632         null       null           0         null           2.01605e+07        0            0  null              null
6  1439408  2632         8591       20:1           0         2.01605e+07    2.01606e+07        1            1  0.05              20
7  1832624  3381         7610       200:20         0         2.01604e+07    null              -1            1  0.1               200
8  2029232  3381         11951      200:20         1         2.01601e+07    null              -1            1  0.1               200
9  2029232  450          1532       30:5           0         2.01605e+07    null              -1            1  0.166667          30
#the task is to predict coupon usage within 15 days of receipt
#so on top of is_used, compare the receive date (Date_received) with the purchase date (Date) to see whether usage happened within 15 days
'''
datetime.datetime.strptime(column["Date"],'%Y%m%d'): the first argument is the date string, the second is the format used to parse it
'''
import datetime
#flag coupons used within 15 days of being received
def used_in_15day(column):
    if column["is_used"] == 1 and column["Date_received"] !="null" and column["Date"] != "null":
        #format must be '%Y%m%d' (lower-case m = month); '%Y%M%d' would parse minutes instead
        days =(datetime.datetime.strptime(column["Date"],'%Y%m%d') - datetime.datetime.strptime(column["Date_received"],'%Y%m%d')).days#.days is an int
        if days < 15:
            return 1
        else:
            return 0
    else:
        return "null"

all_offline["Date"]=all_offline["Date"].apply(lambda x: str(int(x)) if x!="null" else x)


all_offline["Date_received"]=all_offline["Date_received"].apply(lambda x: str(int(x)) if x!="null" else x)


all_offline["is_used_in_15day"] = all_offline.apply(used_in_15day,axis=1)

#is_used_in_15day is the label: 1 = coupon used within 15 days, 0 = coupon used but after 15 days,
#null = no coupon received or coupon never used
#null still needs handling, so map it to 0 for now
'''
null    1793142
1         71875
0          3507
one could also keep null as a third class; stick with binary classification for now
'''
all_offline["is_used_in_15day"] = all_offline["is_used_in_15day"].apply(lambda x : 0 if x=="null" else x)
all_offline["is_used_in_15day"].value_counts()
0    1796649
1      71875
Name: is_used_in_15day, dtype: int64

all_offline["is_used_in_15day"].value_counts()
0    1796649
1      71875
Name: is_used_in_15day, dtype: int64
all_offline.head(20)
    User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received      Date  is_used  has_coupopn  discount_percent  discount_limit  is_used_in_15day
0   1439408  2632         null       null           0         null           20160217        0            0  null              null            0
1   1439408  4663         11002      150:20         1         20160528       null           -1            1  0.133333          150             0
2   1439408  2632         8591       20:1           0         20160217       null           -1            1  0.05              20              0
3   1439408  2632         1078       20:1           0         20160319       null           -1            1  0.05              20              0
4   1439408  2632         8591       20:1           0         20160613       null           -1            1  0.05              20              0
5   1439408  2632         null       null           0         null           20160516        0            0  null              null            0
6   1439408  2632         8591       20:1           0         20160516       20160613        1            1  0.05              20              1
7   1832624  3381         7610       200:20         0         20160429       null           -1            1  0.1               200             0
8   2029232  3381         11951      200:20         1         20160129       null           -1            1  0.1               200             0
9   2029232  450          1532       30:5           0         20160530       null           -1            1  0.166667          30              0
10  2029232  6459         12737      20:1           0         20160519       null           -1            1  0.05              20              0
11  2029232  6459         null       null           0         null           20160626        0            0  null              null            0
12  2029232  6459         null       null           0         null           20160519        0            0  null              null            0
13  2747744  6901         1097       50:10          null      20160606       null           -1            1  0.2               50              0
14  196342   1579         null       null           1         null           20160606        0            0  null              null            0
15  196342   1579         10698      20:1           1         20160606       null           -1            1  0.05              20              0
16  2223968  3381         9776       10:5           2         20160129       null           -1            1  0.5               10              0
17  73611209912034 (ids run together*)  100:10      null      20160207       null           -1            1  0.1               100             0
18  16360615695054 (ids run together*)  200:30      10        20160421       null           -1            1  0.15              200             0
19  3273056  4833         7802       200:20         10        20160130       null           -1            1  0.1               200             0
(* in these two rows the User_id/Merchant_id/Coupon_id digits are run together in the original dump and the split is ambiguous)
#look at the distribution
all_offline["discount_percent"].value_counts()
null                   701602
0.1                    403589
0.16666666666666666    330255
0.25                   103748
0.05                    69099
0.2                     56084
0.3                     38425
0.15                    29585
0.5                     28785
0.03333333333333333     22195
0.95                    21559
0.13333333333333333     17685
0.3333333333333333      13497
0.9                      8912
0.4                      8300
0.06666666666666667      5452
0.8                      4176
0.02                     3693
0.85                      650
0.01                      551
0.5                       196
0.75                      121
0.2                       110
0.025                      75
0.6                        59
0.6666666666666666         56
0.7                        55
0.6                         9
0.06                        1
Name: discount_percent, dtype: int64
#discount_percent contains a lot of "null"s; imputing a median or similar would be wrong, because "null" means no coupon was received
#so keep "null" as its own category and bin the remaining values
#(some rates show up twice above because the column mixes floats computed from "N:M" strings with plain rates kept as strings)

#bin discount_percent
def discount_percent_layer(column):
    if column == "null":
        return "null"
    column =float(column)
    if column <=0.1:
        return 0.1
    elif column <=0.2:
        return 0.2
    elif column <=0.3:
        return 0.3
    elif column  <= 0.4:
        return 0.4
    else :
        return 0.5
all_offline["discount_percent_layer"] = all_offline["discount_percent"].apply(discount_percent_layer)
all_offline["discount_percent_layer"].value_counts()
null    701602
0.1     504655
0.2     433719
0.3     142173
0.5      64578
0.4      21797
Name: discount_percent_layer, dtype: int64
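Roughly the same binning of the numeric values can also be done with pd.cut — a hedged sketch (vals and binned are throwaway names; the "null" category still has to be handled separately):
vals = pd.to_numeric(all_offline["discount_percent"], errors="coerce")   # "null" -> NaN
binned = pd.cut(vals, bins=[0, 0.1, 0.2, 0.3, 0.4, 1.0], labels=[0.1, 0.2, 0.3, 0.4, 0.5])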
#bin discount_limit
def discount_limit_layer(column):
    if column == "null":
        return "null"
    column = float(column)
    if column <=10:
        return 10
    elif column <=20:
        return 20
    elif column <=30:
        return 30
    elif column <50:#note the strict '<': a threshold of exactly 50 falls into the 100 bin below
        return 50
    elif column <=100:
        return 100
    elif column <= 200:
        return 200
    else :
        return 300
all_offline["discount_limit_layer"] = all_offline["discount_limit"].apply(discount_limit_layer)
all_offline["discount_limit_layer"].value_counts()
    
null    701602
30      364855
100     347253
200     177333
20      161645
10       86406
300      29430
Name: discount_limit_layer, dtype: int64
#inspect the processed data
all_offline.head(10)
   User_id  Merchant_id  Coupon_id  Discount_rate  Distance  Date_received      Date  is_used  has_coupopn  discount_percent  discount_limit  is_used_in_15day  discount_percent_layer  discount_limit_layer
0  1439408  2632         null       null           0         null           20160217        0            0  null              null            0                 null                    null
1  1439408  4663         11002      150:20         1         20160528       null           -1            1  0.133333          150             0                 0.2                     200
2  1439408  2632         8591       20:1           0         20160217       null           -1            1  0.05              20              0                 0.1                     20
3  1439408  2632         1078       20:1           0         20160319       null           -1            1  0.05              20              0                 0.1                     20
4  1439408  2632         8591       20:1           0         20160613       null           -1            1  0.05              20              0                 0.1                     20
5  1439408  2632         null       null           0         null           20160516        0            0  null              null            0                 null                    null
6  1439408  2632         8591       20:1           0         20160516       20160613        1            1  0.05              20              1                 0.1                     20
7  1832624  3381         7610       200:20         0         20160429       null           -1            1  0.1               200             0                 0.1                     200
8  2029232  3381         11951      200:20         1         20160129       null           -1            1  0.1               200             0                 0.1                     200
9  2029232  450          1532       30:5           0         20160530       null           -1            1  0.166667          30              0                 0.2                     30
'''
Feature engineering so far:
Coupon_id            -> has_coupopn (1 = coupon received)
Date, Date_received  -> is_used_in_15day (1 = used within 15 days)
Discount_rate        -> discount_percent and discount_limit
Merchant_id, User_id  are just unique identifiers and are left as-is
what remains is Distance
'''
#look at the Distance distribution
all_offline["Distance"].value_counts()
0.0     869937
1.0     245695
10.0    216837
2.0     127936
null    118067
3.0      82532
4.0      59091
5.0      44429
6.0      34662
7.0      27366
8.0      22795
9.0      19177
Name: Distance, dtype: int64
#Distance still has some "null"s (unknown distance); they could be imputed with the mean/mode/median, or kept as their own category — here they are left as-is and one-hot encoded later
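If one preferred imputing instead, a minimal sketch (dist_filled is an illustrative variable and is not attached to all_offline in this baseline):
dist_numeric = pd.to_numeric(all_offline["Distance"], errors="coerce")   # "null" -> NaN
dist_filled = dist_numeric.fillna(dist_numeric.median())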
#save the data; remember to split the training and test parts back apart
#encoding: utf-8
train_finally,test_finally = all_offline[:train_offline.shape[0]],all_offline[train_offline.shape[0]:]
file1 = open("G:\TianChio2o\\all_offline.csv","w")#note the double backslash before all_offline.csv: with a single one, "\a" becomes an escape character and the open fails (took a while to find...)
all_offline.to_csv(file1)
file2 = open("G:\TianChio2o\\train_finally.csv","w")
train_finally.to_csv(file2)
file3 = open("G:\TianChio2o\\test_finally.csv","w")
test_finally.to_csv(file3)
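Side note on the backslash issue above: raw strings (or forward slashes) avoid it, and to_csv also accepts a path directly, so the open() calls are not strictly needed — a hedged alternative:
# all_offline.to_csv(r"G:\TianChio2o\all_offline.csv")
# train_finally.to_csv(r"G:\TianChio2o\train_finally.csv")
# test_finally.to_csv(r"G:\TianChio2o\test_finally.csv")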
#one-hot encoding
print(all_offline.columns)
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'is_used', 'has_coupopn', 'discount_percent',
       'discount_limit', 'is_used_in_15day', 'discount_percent_layer',
       'discount_limit_layer'],
      dtype='object')
#one-hot encoding: before encoding, pull out only the columns we need
print(all_offline.columns)
all_offline_new = all_offline[['Distance','is_used','has_coupopn','is_used_in_15day','discount_percent_layer','discount_limit_layer']]
all_offline_new =pd.get_dummies(all_offline_new,dummy_na=True)#dummy_na=True: NaN also becomes its own dummy column
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'is_used', 'has_coupopn', 'discount_percent',
       'discount_limit', 'is_used_in_15day', 'discount_percent_layer',
       'discount_limit_layer'],
      dtype='object')
all_offline_new.head()
(columns, in order: is_used, has_coupopn, is_used_in_15day, Distance_0.0 ... Distance_6.0, ..., discount_percent_layer_null, discount_percent_layer_nan, discount_limit_layer_10, _20, _30, _100, _200, _300, _null, _nan)
0   0  0  0  1  0  0  0  0  0  0  ...  1  0  0  0  0  0  0  0  1  0
1  -1  1  0  0  1  0  0  0  0  0  ...  0  0  0  0  0  0  1  0  0  0
2  -1  1  0  1  0  0  0  0  0  0  ...  0  0  0  1  0  0  0  0  0  0
3  -1  1  0  1  0  0  0  0  0  0  ...  0  0  0  1  0  0  0  0  0  0
4  -1  1  0  1  0  0  0  0  0  0  ...  0  0  0  1  0  0  0  0  0  0

5 rows × 31 columns

#split back into the training part and the part to predict for submission
train_,test_ = all_offline_new[:train_offline.shape[0]],all_offline_new[train_offline.shape[0]:]
#the target is the probability of usage within 15 days of receiving a coupon
#so drop training rows where no coupon was received
train_ = train_[train_["has_coupopn"]==1]
#has_coupopn is now 1 everywhere in both train and test, so drop the constant column
train_=train_.drop(["has_coupopn"],axis=1)#data used to train the model
test_=test_.drop(["has_coupopn"],axis=1)#rows we need to predict for the submission
x_train=train_.drop(["is_used_in_15day"],axis=1)
y_train=pd.DataFrame({"is_used_in_15day":train_["is_used_in_15day"]})
x_test=test_.drop(["is_used_in_15day"],axis=1)
print(x_test.columns)
Index(['is_used', 'Distance_0.0', 'Distance_1.0', 'Distance_2.0',
       'Distance_3.0', 'Distance_4.0', 'Distance_5.0', 'Distance_6.0',
       'Distance_7.0', 'Distance_8.0', 'Distance_9.0', 'Distance_10.0',
       'Distance_null', 'Distance_nan', 'discount_percent_layer_0.1',
       'discount_percent_layer_0.2', 'discount_percent_layer_0.3',
       'discount_percent_layer_0.4', 'discount_percent_layer_0.5',
       'discount_percent_layer_null', 'discount_percent_layer_nan',
       'discount_limit_layer_10', 'discount_limit_layer_20',
       'discount_limit_layer_30', 'discount_limit_layer_100',
       'discount_limit_layer_200', 'discount_limit_layer_300',
       'discount_limit_layer_null', 'discount_limit_layer_nan'],
      dtype='object')
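The class imbalance flagged at the top is left untouched in this baseline. One simple option would be to downsample the negatives before building x_train / y_train — a hedged sketch (train_balanced and the 3:1 ratio are arbitrary and not used below):
pos = train_[train_["is_used_in_15day"] == 1]
neg = train_[train_["is_used_in_15day"] == 0].sample(n=3 * len(pos), random_state=0)
train_balanced = pd.concat([pos, neg]).sample(frac=1, random_state=0)   # shuffle the rows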
#the competition metric needs predicted probabilities, not hard labels
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer#Imputer was removed from sklearn.preprocessing in newer releases (not used below)
from sklearn.linear_model import LogisticRegression
from sklearn import tree   
import warnings
warnings.filterwarnings('ignore') 
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.model_selection import train_test_split#sklearn.cross_validation no longer exists in newer releases
# x_train/y_train: all labelled data;  X_train/Y_train: the training split;  X_test/Y_test: the held-out validation split
X_train, X_test, Y_train, Y_test = train_test_split(x_train,y_train, test_size=0.3, random_state=0) #70% train : 30% validation
def computeAUC(y_true,y_score):
    auc = roc_auc_score(y_true,y_score)
    print("auc=",auc)
    return auc
rf = RandomForestClassifier(n_estimators=100,
                                max_features=4,                               
                                oob_score= True,#use out-of-bag samples to evaluate the model; reflects generalization better
                                min_samples_split=4,
                                min_samples_leaf=50,
                                n_jobs=-1,
                                class_weight='balanced_subsample',
                                bootstrap=True)
#b. tune the parameters with grid search + cross-validation
param_grid = {"n_estimators": [100], "min_samples_split":[4]}
#note: cv=1 and iid=False only run on older scikit-learn versions; current releases require cv >= 2 and no longer accept iid
grid_search = GridSearchCV(rf, cv=1, scoring='roc_auc', param_grid=param_grid, iid=False)
#c. fit on the training split, print the best parameters, then predict
#the best estimator is refit on X_train with those parameters
grid_search.fit(X_train, Y_train)
print("the best parameter:", grid_search.best_params_)
print("the best score:", grid_search.best_score_)
the best parameter: {'min_samples_split': 4, 'n_estimators': 100}
the best score: 0.9983629720690491
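For reference, a rough equivalent of the search on a current scikit-learn release (StratifiedKFold is already imported above; cv5 and grid_search_cv5 are just illustrative names):
cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search_cv5 = GridSearchCV(rf, param_grid=param_grid, scoring='roc_auc', cv=cv5, n_jobs=-1)
grid_search_cv5.fit(X_train, Y_train.values.ravel())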

#evaluate the fitted model on the training split
predicted_probs_train = grid_search.predict_proba(X_train)
predicted_probs_train = [x[1] for  x in predicted_probs_train]
computeAUC(Y_train, predicted_probs_train)
#and on the held-out validation split
predicted_probs_test_new = grid_search.predict_proba(X_test)
predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
computeAUC(Y_test, predicted_probs_test_new)

auc= 0.998383275512854
auc= 0.998349975332741





0.998349975332741
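Back to the question at the top: an offline AUC of ~0.998 here is almost certainly label leakage rather than a strong model — the is_used feature is derived from the same Date column that defines is_used_in_15day, so it nearly reveals the label. A quick hedged check (rf_noleak and the *_nl names exist only for this sketch):
X_tr_nl = X_train.drop(["is_used"], axis=1)
X_te_nl = X_test.drop(["is_used"], axis=1)
rf_noleak = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced_subsample')
rf_noleak.fit(X_tr_nl, Y_train.values.ravel())
computeAUC(Y_test, rf_noleak.predict_proba(X_te_nl)[:, 1])   # expect a much lower, more honest AUC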
# result=grid_search.predict_proba(x_test)
rf.fit(x_train,y_train)#rf has to be fitted itself before predict_proba (GridSearchCV only fits its own clones)
# train_pre=rf.predict_proba(x_train)
result=rf.predict_proba(x_test)[:,1]
# result= [x[1] for x in result]
predict=result
result=pd.read_csv("G:\TianChio2o\\ccf_offline_stage1_test_revised.csv")
result["Probability"]=predict
result=result.drop(["Merchant_id","Discount_rate","Distance"],axis=1)
#clip negative values just in case (predict_proba already returns values in [0, 1], so this is only a safeguard)
result["Probability"]=result["Probability"].apply(lambda x:0 if x<0 else x)
file= open("G:\TianChio2o\sample_submission.csv","w")#writing failed for me when the file already existed
result.to_csv(file)
result.head()
   User_id  Coupon_id  Date_received  Probability
0  4129537  9983       20160712       0.027536
1  6949378  3429       20160706       0.024217
2  2166529  6928       20160727       0.025548
3  2166529  1808       20160727       0.000000
4  6172162  6500       20160708       0.028085