xgboost进行分类具体实现

简单介绍:
赛题来源是天池大数据的 “商场中精确定位用户所在店铺”。原数据有114万条,计算起来非常困难。为了让初学者有一个更好的学习体验,也更加基础,我将数据集缩小了之后放在这里,密码:ndfd。供大家下载。

import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
 
 
# Load the down-sampled training and test sets from the working directory.
train, tests = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
train
user_idshop_idtime_stamplongitudelatitudewifi_id1wifi_strong1con_sta1wifi_id2wifi_strong2...con_sta7wifi_id8wifi_strong8con_sta8wifi_id9wifi_strong9con_sta9wifi_id10wifi_strong10con_sta10
0u_376s_28717182017/8/6 21:20122.30829132.088040b_6396480-67Falseb_41124514-86...FALSEb_56326644-89.0FALSEb_56328155-77.0FALSEb_5857369-55.0false\n
1u_376s_28717182017/8/6 21:20122.30816232.087970b_6396480-67Falseb_56328155-73...FALSEb_6396479-57.0FALSEb_31100514-89.0FALSEb_5857369-57.0false\n
2u_1041s_1816372017/8/2 13:10117.36525540.638214b_8006367-78Falseb_2485110-52...FALSEb_8006521-74.0FALSEb_35013153-56.0FALSEb_37608251-84.0false\n
3u_1158s_6094702017/8/13 12:30121.13445131.197416b_26250579-73Falseb_26250580-64...FALSEb_30424471-60.0FALSEb_26250578-72.0FALSEb_29510856-80.0false\n
4u_1654s_38167662017/8/25 19:50122.25586731.351320b_39004150-66Falseb_39004148-58...FALSEb_6805211-80.0FALSEb_1845687-72.0FALSEb_21685901-91.0false\n
..................................................................
495u_83642s_3980212017/8/24 20:10121.73109132.602940b_40778712-36Trueb_40778713-53...FALSEb_30772238-79.0FALSEb_19291072-77.0FALSEb_52688309-63.0false\n
496u_84447s_3863822017/8/3 18:10111.34136431.216452b_13303539-62Falseb_47973407-70...FALSEb_13299121-62.0FALSEb_47973408-66.0FALSEb_56326651-49.0false\n
497u_84524s_3224712017/8/12 18:20122.59603631.581866b_54461743-46Falseb_38143992-73...FALSEb_54461973-45.0FALSEb_2837595-58.0FALSEb_35405625-73.0false\n
498u_84860s_3900532017/8/6 21:00121.36575232.316147b_7962419-51Falseb_46165431-65...FALSEb_26725258-80.0FALSEb_30465621-82.0FALSEb_22564180-53.0true\n
499u_83642s_3980212017/8/24 20:10121.73109132.602940b_40778712-36Trueb_40778713-53...FALSEb_30772238-79.0FALSEb_19291072-77.0FALSEb_52688309-63.0false\n

500 rows × 35 columns

tests
row_idshop_iduser_idtime_stamplongitudelatitudewifi_id1wifi_strong1con_sta1wifi_id2...con_sta7wifi_id8wifi_strong8con_sta8wifi_id9wifi_strong9con_sta9wifi_id10wifi_strong10con_sta10
0118742NaNu_300971422017/9/5 13:00122.14101139.818847b_34366982-82Falseb_37756289...FALSEb_28978909-62.0FALSEb_21518966-68.0FALSEb_13748229-72.0false\n
1118743NaNu_300978032017/9/6 13:10118.19190732.855858b_36722251-81Falseb_10537579...FALSEb_21694478-80.0FALSEb_44551973-72.0FALSEb_21694477-85.0false\n
2118744NaNu_300978892017/9/6 17:40119.19211032.424667b_30026291-74Falseb_30026290...FALSEb_50235613-75.0FALSEb_17955238-85.0FALSEb_40924464-54.0false\n
3118745NaNu_300989962017/9/3 12:10120.61220134.055249b_33412374-77Falseb_22084893...FALSEb_21282193-87.0FALSEb_33334040-71.0FALSEb_29623262-68.0false\n
4118746NaNu_300991702017/9/2 20:40116.86198940.326858b_19882704-77Falseb_2241462...FALSEb_585687-57.0FALSEb_37967785-62.0FALSEb_29284311-42.0false\n
..................................................................
495119237NaNu_302573492017/9/5 17:50120.74549430.815596b_19907372-91Falseb_40767122...FALSEb_56692079-69.0FALSEb_2069544-89.0FALSEb_36484904-59.0false\n
496119238NaNu_302573712017/9/2 16:50120.69446331.953709b_39339718-46Falseb_52367573...FALSEb_21638417-63.0FALSEb_21638416-61.0FALSEb_19054839-62.0false\n
497119239NaNu_302578342017/9/6 16:00119.19283532.424525b_32449092-36Falseb_28588685...FALSEb_6899715-69.0FALSEb_31951717-67.0FALSEb_40924426-75.0false\n
498119240NaNu_302578342017/9/6 16:00119.19279632.424623b_49195203-69Falseb_57271624...FALSEb_17365028-60.0FALSEb_28588685-54.0FALSEb_28870484-56.0false\n
499119241NaNu_302581382017/9/1 17:50114.47413831.080863b_4337554-85Falseb_12683769...FALSEb_25093262-73.0FALSEb_11542050-86.0FALSEb_36907324-83.0false\n

500 rows × 36 columns

将时间的string转化成python datetime

使用pandas的自带api pd.to_datetime()

# Parse the raw timestamp strings of both frames into pandas datetimes.
for frame in (train, tests):
    frame['time_stamp'] = pd.to_datetime(frame['time_stamp'])
 
train['time_stamp']
0     2017-08-06 21:20:00
1     2017-08-06 21:20:00
2     2017-08-02 13:10:00
3     2017-08-13 12:30:00
4     2017-08-25 19:50:00
              ...        
495   2017-08-24 20:10:00
496   2017-08-03 18:10:00
497   2017-08-12 18:20:00
498   2017-08-06 21:00:00
499   2017-08-24 20:10:00
Name: time_stamp, Length: 500, dtype: datetime64[ns]
print(train.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 35 columns):
user_id          500 non-null object
shop_id          500 non-null object
time_stamp       500 non-null datetime64[ns]
longitude        500 non-null float64
latitude         500 non-null float64
wifi_id1         500 non-null object
wifi_strong1     500 non-null int64
con_sta1         500 non-null bool
wifi_id2         500 non-null object
wifi_strong2     500 non-null int64
con_sta2         500 non-null object
wifi_id3         499 non-null object
wifi_strong3     499 non-null float64
con_sta3         499 non-null object
wifi_id4         497 non-null object
wifi_strong4     497 non-null float64
con_sta4         497 non-null object
wifi_id5         496 non-null object
wifi_strong5     496 non-null float64
con_sta5         496 non-null object
wifi_id6         495 non-null object
wifi_strong6     495 non-null float64
con_sta6         495 non-null object
wifi_id7         494 non-null object
wifi_strong7     494 non-null float64
con_sta7         494 non-null object
wifi_id8         486 non-null object
wifi_strong8     486 non-null float64
con_sta8         486 non-null object
wifi_id9         478 non-null object
wifi_strong9     478 non-null float64
con_sta9         478 non-null object
wifi_id10        467 non-null object
wifi_strong10    467 non-null float64
con_sta10        467 non-null object
dtypes: bool(1), datetime64[ns](1), float64(10), int64(2), object(21)
memory usage: 133.4+ KB
None

将时间datetime细分为year,month,weekday,time

# Split each timestamp into calendar and time-of-day features.
# Both frames go through the same vectorised `.dt` accessors, so train and
# tests are derived by exactly the same code path and no per-row Python
# lambda is needed.
for frame in (train, tests):
    stamps = frame['time_stamp']
    frame['Year'] = stamps.dt.year
    frame['Month'] = stamps.dt.month
    frame['weekday'] = stamps.dt.dayofweek  # Monday=0 ... Sunday=6
    frame['time'] = stamps.dt.time          # datetime.time objects (object dtype)
train['Year']
0      2017
1      2017
2      2017
3      2017
4      2017
       ... 
495    2017
496    2017
497    2017
498    2017
499    2017
Name: Year, Length: 500, dtype: int64
train['Month']
0      8
1      8
2      8
3      8
4      8
      ..
495    8
496    8
497    8
498    8
499    8
Name: Month, Length: 500, dtype: int64
train['weekday']
0      6
1      6
2      2
3      6
4      4
      ..
495    3
496    3
497    5
498    6
499    3
Name: weekday, Length: 500, dtype: int64
train['time']
0      21:20:00
1      21:20:00
2      13:10:00
3      12:30:00
4      19:50:00
         ...   
495    20:10:00
496    18:10:00
497    18:20:00
498    21:00:00
499    20:10:00
Name: time, Length: 500, dtype: object

删除 'time_stamp' 列以节约内存,并删除训练集中含有缺失值的行

# The raw timestamp is no longer needed once its parts have been extracted.
train = train.drop('time_stamp', axis=1)
# Training rows containing any missing value are discarded.
train = train.dropna(axis=0)
tests = tests.drop('time_stamp', axis=1)
# Test rows must all be kept (each row_id ends up in the submission), so
# missing values are forward-filled from the previous row instead of dropped.
# Without this step the label encoding of tests would receive NaN mixed with
# strings.
tests = tests.ffill()

测试集的每一行都需要给出预测结果,不能像训练集那样直接删除含缺失值的行,因此改用 fillna 填充缺失值。fillna 的 method 参数含义如下:

pad/ffill:用前一个非缺失值去填充该缺失值
backfill/bfill:用下一个非缺失值填充该缺失值

None:不按前后值填充,而是用 value 参数指定的值去替换缺失值

将类别信息用 LabelEncoder 编码为整数(标签编码,注意这并不是 one-hot 编码)

# Integer-encode every categorical (object-dtype) feature column of train.
# The encoder for a column is fitted on the values of BOTH train and tests and
# then applied to both frames, so a given category (e.g. one wifi id) receives
# the same integer code everywhere. Encoders fitted separately per frame would
# assign unrelated codes and the model would be fed inconsistent features at
# prediction time.
# 'shop_id' is the prediction target and is left untouched.
for f in train.columns:
    if f == 'shop_id' or train[f].dtype != 'object':
        continue
    print(f)
    lbl = preprocessing.LabelEncoder()
    # astype(str) keeps the encoder input uniformly typed, even when a column
    # still contains NaN or mixes bool and str values.
    train_values = train[f].astype(str)
    if f in tests.columns:
        tests_values = tests[f].astype(str)
        lbl.fit(pd.concat([train_values, tests_values]))
        tests[f] = lbl.transform(tests_values)
    else:
        lbl.fit(train_values)
    train[f] = lbl.transform(train_values)
user_id
wifi_id1
wifi_id2
con_sta2
wifi_id3
con_sta3
wifi_id4
con_sta4
wifi_id5
con_sta5
wifi_id6
con_sta6
wifi_id7
con_sta7
wifi_id8
con_sta8
wifi_id9
con_sta9
wifi_id10
con_sta10
time
train
user_idshop_idlongitudelatitudewifi_id1wifi_strong1con_sta1wifi_id2wifi_strong2con_sta2...wifi_id9wifi_strong9con_sta9wifi_id10wifi_strong10con_sta10YearMonthweekdaytime
0108s_2871718122.30829132.088040411-67False272-860...385-77.00402-55.0020178673
1108s_2871718122.30816232.087970411-67False374-730...195-89.00402-57.0020178673
22s_181637117.36525540.638214434-78False128-520...232-56.00253-84.0020178224
34s_609470121.13445131.197416143-73False147-640...144-72.00176-80.0020178620
423s_3816766122.25586731.351320259-66False250-580...91-72.0099-91.0020178464
..................................................................
495306s_398021121.73109132.602940275-36True268-530...98-77.00367-63.0020178366
496307s_386382111.34136431.21645231-62False316-700...323-66.00391-49.0020178354
497308s_322471122.59603631.581866382-46False248-730...171-58.00241-73.0020178555
498309s_390053121.36575232.316147433-51False298-650...188-82.00111-53.0120178671
499306s_398021121.73109132.602940275-36True268-530...98-77.00367-63.0020178366

467 rows × 38 columns

对测试数据集应用同样的方法

# Integer-encode whatever object-dtype columns the test frame holds.
# NOTE(review): each encoder here is fitted on the test values only, so the
# integer codes it assigns are not tied to the codes used for the same column
# in train.
object_columns = [name for name in tests.columns
                  if tests[name].dtype == 'object']
for name in object_columns:
    print(name)
    encoder = preprocessing.LabelEncoder()
    tests[name] = encoder.fit_transform(list(tests[name].values))

选取需要的特征

# Model inputs: the calendar/time fields and coordinates, followed by the
# wifi_id / wifi_strong / con_sta columns of each of the ten wifi slots,
# in slot order.
feature_columns_to_use = ['Year', 'Month', 'weekday',
                          'time', 'longitude', 'latitude']
for slot in range(1, 11):
    for prefix in ('wifi_id', 'wifi_strong', 'con_sta'):
        feature_columns_to_use.append(prefix + str(slot))

# Restrict both frames to those columns and convert them to NumPy arrays.
big_train = train[feature_columns_to_use]
train_X = big_train.to_numpy()
big_test = tests[feature_columns_to_use]
test_X = big_test.to_numpy()
big_train
YearMonthweekdaytimelongitudelatitudewifi_id1wifi_strong1con_sta1wifi_id2...con_sta7wifi_id8wifi_strong8con_sta8wifi_id9wifi_strong9con_sta9wifi_id10wifi_strong10con_sta10
020178673122.30829132.088040411-67False272...0386-89.00385-77.00402-55.00
120178673122.30816232.087970411-67False374...0405-57.00195-89.00402-57.00
220178224117.36525540.638214434-78False128...0433-74.00232-56.00253-84.00
320178620121.13445131.197416143-73False147...0184-60.00144-72.00176-80.00
420178464122.25586731.351320259-66False250...0408-80.0091-72.0099-91.00
..................................................................
49520178366121.73109132.602940275-36True268...0189-79.0098-77.00367-63.00
49620178354111.34136431.21645231-62False316...026-62.00323-66.00391-49.00
49720178555122.59603631.581866382-46False248...0366-45.00171-58.00241-73.00
49820178671121.36575232.316147433-51False298...0151-80.00188-82.00111-53.01
49920178366121.73109132.602940275-36True268...0189-79.0098-77.00367-63.00

467 rows × 36 columns

train_X[0]
array([2017, 8, 6, 73, 122.308291, 32.08804, 411, -67, False, 272, -86, 0,
       160, -90.0, 0, 403, -55.0, 0, 446, -90.0, 0, 208, -74.0, 0, 405,
       -68.0, 0, 386, -89.0, 0, 385, -77.0, 0, 402, -55.0, 0],
      dtype=object)
# Target: the shop each training row belongs to.
train_y = train['shop_id']

# Recent XGBoost releases require class labels to be the integers
# 0..n_classes-1, so the shop-id strings are encoded for fitting and the
# predicted class indices are decoded back to shop ids afterwards.
shop_encoder = preprocessing.LabelEncoder()
train_y_encoded = shop_encoder.fit_transform(train_y)

# `verbosity=0` replaces the deprecated `silent=1` flag.
gbm = xgb.XGBClassifier(verbosity=0, max_depth=10,
                        n_estimators=1000, learning_rate=0.05)
gbm.fit(train_X, train_y_encoded)
predictions = shop_encoder.inverse_transform(
    gbm.predict(test_X).astype(int))

提交预测

# Pair every test row id with its predicted shop, show the table and save it.
submission_columns = {'row_id': tests['row_id'], 'shop_id': predictions}
submission = pd.DataFrame(submission_columns)
print(submission)
submission.to_csv("submission.csv", index=False)

完整的代码是这样。

import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
 
 
# End-to-end script: load the data, build features, train an XGBoost
# classifier on shop_id and write the predictions to submission.csv.

train = pd.read_csv('train.csv')
tests = pd.read_csv('test.csv')

# Parse the raw timestamp strings into pandas datetimes.
train['time_stamp'] = pd.to_datetime(train['time_stamp'])
tests['time_stamp'] = pd.to_datetime(tests['time_stamp'])

print(train.info())

# Split each timestamp into calendar and time-of-day features; both frames
# go through the same vectorised `.dt` accessors.
for frame in (train, tests):
    stamps = frame['time_stamp']
    frame['Year'] = stamps.dt.year
    frame['Month'] = stamps.dt.month
    frame['weekday'] = stamps.dt.dayofweek  # Monday=0 ... Sunday=6
    frame['time'] = stamps.dt.time          # datetime.time objects

# The raw timestamp is no longer needed. Incomplete training rows are dropped;
# test rows must all be kept (each row_id goes into the submission), so their
# gaps are forward-filled. `ffill()` replaces the deprecated
# `fillna(method='pad')`.
train = train.drop('time_stamp', axis=1)
train = train.dropna(axis=0)
tests = tests.drop('time_stamp', axis=1)
tests = tests.ffill()

# Integer-encode the categorical feature columns. One encoder per column is
# fitted on the union of train and tests values and applied to both frames, so
# the same category always maps to the same integer code. Encoders fitted
# separately per frame would give train and tests unrelated codes.
# 'shop_id' is the target and is handled separately below.
for f in train.columns:
    if f == 'shop_id' or f not in tests.columns:
        continue
    if train[f].dtype != 'object' and tests[f].dtype != 'object':
        continue
    print(f)
    lbl = preprocessing.LabelEncoder()
    # astype(str) keeps the encoder input uniformly typed even when a column
    # still holds NaN (e.g. a leading gap ffill cannot fill) or mixes bool/str.
    train_values = train[f].astype(str)
    tests_values = tests[f].astype(str)
    lbl.fit(pd.concat([train_values, tests_values]))
    train[f] = lbl.transform(train_values)
    tests[f] = lbl.transform(tests_values)

# Model inputs: calendar/time fields and coordinates, then the
# wifi_id / wifi_strong / con_sta columns of each of the ten wifi slots.
feature_columns_to_use = ['Year', 'Month', 'weekday',
                          'time', 'longitude', 'latitude']
for slot in range(1, 11):
    for prefix in ('wifi_id', 'wifi_strong', 'con_sta'):
        feature_columns_to_use.append(prefix + str(slot))

big_train = train[feature_columns_to_use]
big_test = tests[feature_columns_to_use]
# An explicit float dtype makes any column that is still non-numeric fail
# here with a clear error instead of inside XGBoost.
train_X = big_train.to_numpy(dtype=float)
test_X = big_test.to_numpy(dtype=float)

# Recent XGBoost releases require class labels to be the integers
# 0..n_classes-1, so shop ids are encoded for fitting and decoded afterwards.
train_y = train['shop_id']
shop_encoder = preprocessing.LabelEncoder()
train_y_encoded = shop_encoder.fit_transform(train_y)

# `verbosity=0` replaces the deprecated `silent=1` flag.
gbm = xgb.XGBClassifier(verbosity=0, max_depth=10,
                        n_estimators=1000, learning_rate=0.05)
gbm.fit(train_X, train_y_encoded)
predictions = shop_encoder.inverse_transform(
    gbm.predict(test_X).astype(int))

# Pair every test row id with its predicted shop, show the table and save it.
submission = pd.DataFrame({'row_id': tests['row_id'],
                           'shop_id': predictions})
print(submission)
submission.to_csv("submission.csv", index=False)
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值