调用 xgboost 原生接口（xgb.train）实现二分类的 demo 如下，包含模型保存和导出。
import csv
import os
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Record program start time so total wall-clock cost can be reported.
start_time = time.time()

# ---------------------------------------------------------------------------
# Load data: train on the first 16 000 rows and validate on the last 4 000
# rows of the same 20 000-row feature/label files; score a separate
# 100 000-row test feature file.
# NOTE(review): the original read the training features from
# "/Users/XXX/filter_feature_20000.csv" (without "Downloads/") while the
# validation split read the same file from "Downloads/" — assumed to be a
# typo and normalised here; confirm against the actual file layout.
# ---------------------------------------------------------------------------
feature_df = pd.read_csv("/Users/XXX/Downloads/filter_feature_20000.csv", header=0)
label_df = pd.read_csv("/Users/XXX/Downloads/label_new_20000.csv", header=0)
features = feature_df.columns  # feature names, reused below for the feature map
X = feature_df.head(16000).values
y = label_df.head(16000).values
val_X = feature_df.tail(4000).values
val_y = label_df.tail(4000).values
test_X = pd.read_csv("/Users/XXX/Downloads/filter_feature_100000.csv", header=0).values

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'gamma': 0,              # min loss reduction to split; larger = more conservative
    'max_depth': 6,          # tree depth; deeper trees overfit more easily
    'lambda': 1,             # L2 regularisation on leaf weights; larger = less overfitting
    'subsample': 1,          # row subsampling ratio per tree
    'colsample_bytree': 1,   # column subsampling ratio per tree
    'min_child_weight': 1,
    'silent': 0,             # 0 = print messages (deprecated in newer xgboost; use 'verbosity')
    'eta': 0.2,              # learning rate (shrinkage)
    'seed': 1000,
    'eval_metric': 'auc',
    'nthread': 3,            # CPU threads
}
plst = list(params.items())
num_rounds = 500  # maximum number of boosting rounds

xgb_train = xgb.DMatrix(X, label=y)
xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_test = xgb.DMatrix(test_X)

# Evaluate on both sets every round. early_stopping_rounds stops training if
# the validation metric (AUC) has not improved for 100 consecutive rounds.
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=100)
model.save_model('./xgb.model')  # persist the trained booster for later reuse
# (Removed the Python-2-only `print model.dump_model` line — it was a
# SyntaxError under Python 3 and only printed a bound-method repr anyway.)
print("best best_ntree_limit", model.best_ntree_limit)

# ---------------------------------------------------------------------------
# Build the feature map file used by dump_model.
# Each line: <feature id>\t<feature name>\t<type>
#   i   = binary indicator feature
#   q   = quantitative feature (age, time, ...)
#   int = integer feature (decision boundaries snap to integers)
# ---------------------------------------------------------------------------
file_name = 'feature_map.txt'
with open(file_name, 'w') as outfile:  # context manager guarantees the close
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))  # treat all as quantitative

# Dump a human-readable model description (JSON, with per-node statistics).
model.dump_model('xgboost_model', file_name, with_stats=True, dump_format='json')

# Report elapsed time so far (training + dump).
cost_time = time.time() - start_time
print("xgboost success!", '\n', "cost time:", cost_time, "(s)")

# Predict probabilities for the test set using the best iteration found by
# early stopping.
preds = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
preds_list_ = [[p] for p in preds.tolist()]  # one-column rows for csv.writerows

# The ID column is an integer but the prediction is a probability, so each
# column needs its own format (the original single fmt='%d' truncated every
# probability to 0; it also referenced an undefined name `tests`).
np.savetxt('xgb_submission.csv', np.c_[range(1, len(test_X) + 1), preds],
           delimiter=',', header='ID,y', comments='', fmt=['%d', '%f'])

# Also write the raw predictions next to this script as result.csv.
current_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(current_dir, 'result.csv'), 'w', encoding='utf8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["pre_y"])     # header row
    writer.writerows(preds_list_)  # one prediction per row

# Total elapsed time including prediction and file output.
cost_time = time.time() - start_time
print("xgboost success!", '\n', "cost time:", cost_time, "(s)")
模型导出结果为txt如下:
booster[0]:
0:[c33<0.5] yes=1,no=2,missing=1,gain=659.56543,cover=4000
1:[c1272<0.5] yes=3,no=4,missing=3,gain=81.151062,cover=1080.5
3:[c3<0.5] yes=7,no=8,missing=7,gain=41.0314331,cover=865
7:[c1712<0.5] yes=15,no=16,missing=15,gain=21.5354309,cover=731
15:[c2133<0.5] yes=31,no=32,missing=31,gain=17.3103333,cover=553.75
31:[c1768<0.5] yes=59,no=60,missing=59,gain=10.9403687,cover=465
59:leaf=0.140259743,cover=422.5
60:leaf=0.248275861,cover=42.5
32:[c1449<0.5] yes=61,no=62,missing=61,gain=5.55184937,cover=88.75
61:leaf=0.271571904,cover=73.75
62:leaf=0.125,cover=15
16:[c2038<0.5] yes=33,no=34,missing=33,gain=22.4194565,cover=177.25
33:[c1354<0.5] yes=63,no=64,missing=63,gain=13.6695414,cover=94.5
63:leaf=-0.0195439737,cover=75.75
64:leaf=0.167088613,cover=18.75
34:[c29<0.5] yes=65,no=66,missing=65,gain=16.6235809,cover=82.75
65:leaf=0.0774193555,cover=45.5
66:leaf=0.258823544,cover=37.25
8:[c1506<0.5] yes=17,no=18,missing=17,gain=12.3612061,cover=134
17:[c1271<0.5] yes=35,no=36,missing=35,gain=8.44068623,cover=114.25
35:[c1716<0.5] yes=67,no=68,missing=67,gain=6.99994373,cover=69.5
67:leaf=-0.0120171672,cover=57.25
68:leaf=-0.173584908,cover=12.25
36:[c2248<0.5] yes=69,no=70,missing=69,gain=9.48196793,cover=44.75
69:leaf=0.0993939415,cover=40.25
70:leaf=-0.181818187,cover=4.5
18:[c1140<0.5] yes=37,no=38,missing=37,gain=3.07007694,cover=19.75
37:[c2268<0.5] yes=71,no=72,missing=71,gain=2.29551125,cover=18.75
71:leaf=0.205405399,cover=17.5
72:leaf=-0.0444444455,cover=1.25
38:leaf=-0.100000001,cover=1
4:[c697<0.5] yes=9,no=10,missing=9,gain=6.76394653,cover=215.5
9:[c779<0.5] yes=19,no=20,missing=19,gain=5.40783691,cover=197.5
19:[c1938<0.5] yes=39,no=40,missing=39,gain=4.21405029,cover=190
39:[c1679<0.5] yes=73,no=74,missing=73,gain=5.22348022,cover=185.5
73:leaf=0.291745603,cover=183.75
74:leaf=-0.0363636389,cover=1.75
40:[c13<0.5] yes=75,no=76,missing=75,gain=4.87272739,cover=4.5
75:leaf=0.200000003,cover=3
76:leaf=-0.160000011,cover=1.5
20:[c1118<0.5] yes=41,no=42,missing=41,gain=7.13214016,cover=7.5
41:[c34<0.5] yes=77,no=78,missing=77,gain=3.28787899,cover=2.75
77:leaf=0.100000001,cover=1
78:leaf=-0.25454545,cover=1.75
42:[c1329<0.5] yes=79,no=80,missing=79,gain=1.743083,cover=4.75
79:leaf=0.300000012,cover=3
80:leaf=0.0363636389,cover=1.75
10:[c1603<0.5] yes=21,no=22,missing=21,gain=7.41825104,cover=18
21:[c1169<0.5] yes=43,no=44,missing=43,gain=5.17241478,cover=16.75
43:[c91<0.5] yes=81,no=82,missing=81,gain=5.05558014,cover=13.75
81:leaf=0.264150947,cover=12.25
82:leaf=-0.0800000057,cover=1.5
44:[c1354<0.5] yes=83,no=84,missing=83,gain=3.34595942,cover=3
83:leaf=0.109090917,cover=1.75
84:leaf=-0.222222239,cover=1.25
22:leaf=-0.222222239,cover=1.25
2:[c1272<0.5] yes=5,no=6,missing=5,gain=277.877716,cover=2919.5
5:[c2196<0.5] yes=11,no=12,missing=11,gain=126.877335,cover=2655.75
11:[c1712<0.5] yes=23,no=24,missing=23,gain=37.1381226,cover=1669.5
23:[c3<0.5] yes=45,no=46,missing=45,gain=33.6924438,cover=1266.75
45:[c927<0.5] yes=85,no=86,missing=85,gain=23.6303349,cover=653.25
85:leaf=-0.0463382155,cover=562.25
86:leaf=0.0630434826,cover=91
46:[c1273<0.5] yes=87,no=88,missing=87,gain=27.3742218,cover=613.5
87:leaf=-0.107198611,cover=575.5
88:leaf=0.0666666701,cover=38
24:[c1290<0.5] yes=47,no=48,missing=47,gain=13.1829071,cover=402.75
47:[c7<0.5] yes=89,no=90,missing=89,gain=10.1228333,cover=394.75
89:leaf=-0.152526796,cover=325.5
90:leaf=-0.0669039115,cover=69.25
48:[c26<0.5] yes=91,no=92,missing=91,gain=6.54553127,cover=8
91:leaf=0.273684233,cover=3.75
92:leaf=-0.0571428612,cover=4.25
12:[c927<0.5] yes=25,no=26,missing=25,gain=39.9882202,cover=986.25
25:[c12<0.5] yes=49,no=50,missing=49,gain=22.2378578,cover=754.25
49:[c1337<0.5] yes=93,no=94,missing=93,gain=15.014286,cover=141.25
93:leaf=0.0289361719,cover=116.5
94:leaf=0.198058248,cover=24.75
50:[c2023<0.5] yes=95,no=96,missing=95,gain=18.6097565,cover=613
95:leaf=-0.0151431207,cover=540.5
96:leaf=-0.122448981,cover=72.5
26:[c1719<0.5] yes=51,no=52,missing=51,gain=13.9255447,cover=232
51:[c2153<0.5] yes=97,no=98,missing=97,gain=9.67037582,cover=198.75
97:leaf=0.0775453299,cover=178.25
98:leaf=-0.0651162788,cover=20.5
52:[c1315<0.5] yes=99,no=100,missing=99,gain=13.5209045,cover=33.25
99:leaf=0.266666681,cover=26
100:leaf=-0.0363636389,cover=7.25
6:[c549<0.5] yes=13,no=14,missing=13,gain=17.6135254,cover=263.75
13:[c118<0.5] yes=27,no=28,missing=27,gain=8.44517517,cover=245.75
27:[c1290<0.5] yes=53,no=54,missing=53,gain=7.69490051,cover=244.5
53:[c2059<0.5] yes=101,no=102,missing=101,gain=6.32978821,cover=170
101:leaf=0.16711916,cover=164.75
102:leaf=-0.0480000004,cover=5.25
54:[c293<0.5] yes=103,no=104,missing=103,gain=6.55356598,cover=74.5
103:leaf=0.249158248,cover=73.25
104:leaf=-0.13333334,cover=1.25
28:leaf=-0.222222239,cover=1.25
14:[c2189<0.5] yes=29,no=30,missing=29,gain=4.91767883,cover=18
29:[c2091<0.5] yes=55,no=56,missing=55,gain=6.63676596,cover=15.25
55:leaf=0.24000001,cover=1.5
56:[c1302<0.5] yes=105,no=106,missing=105,gain=7.41748428,cover=13.75
105:leaf=-0.200000003,cover=10
106:leaf=0.105263166,cover=3.75
30:[c1290<0.5] yes=57,no=58,missing=57,gain=1.18787885,cover=2.75
57:leaf=0.25454545,cover=1.75
58:leaf=-0,cover=1
booster[1]:
0:[c33<0.5] yes=1,no=2,missing=1,gain=427.845093,cover=3982.50781
1:[c1272<0.5] yes=3,no=4,missing=3,gain=54.8223572,cover=1070.89758
3:[c2196<0.5] yes=7,no=8,missing=7,gain=30.242691,cover=859.639832
7:[c1271<0.5] yes=15,no=16,missing=15,gain=13.0224113,cover=281.554596
15:[c1475<0.5] yes=31,no=32,missing=31,gain=13.6810684,cover=146.456284
31:[c2076<0.5] yes=57,no=58,missing=57,gain=10.0567083,cover=96.4491196
57:leaf=0.0157024488,cover=59.3216629
58:leaf=-0.11572402,cover=37.1274529
32:[c1618<0.5] yes=59,no=60,missing=59,gain=4.72074223,cover=50.0071716
59:leaf=0.0694069788,cover=45.5354538
60:leaf=0.270091563,cover=4.4717207
16:[c1447<0.5] yes=33,no=34,missing=33,gain=7.08228302,cover=135.098312
33:[c1315<0.5] yes=61,no=62,missing=61,gain=6.59377289,cover=133.609009
61:leaf=0.121101692,cover=108.705887
62:leaf=0.00682065869,cover=24.9031239
34:leaf=-0.245720625,cover=1.48929155
8:[c1952<0.5] yes=17,no=18,missing=17,gain=13.0849762,cover=578.085205
17:[c481<0.5] yes=35,no=36,missing=35,gain=12.0555115,cover=435.25177
35:[c32<0.5] yes=63,no=64,missing=63,gain=11.2629852,cover=390.296692
63:leaf=0.10475228,cover=303.488831
64:leaf=0.187419161,cover=86.807869
36:[c668<0.5] yes=65,no=66,missing=65,gain=7.5410738,cover=44.9550896
65:leaf=-0.0259418767,cover=36.7734833
66:leaf=0.176206753,cover=8.18160725
18:[c2057<0.5] yes=37,no=38,missing=37,gain=6.20710754,cover=142.83345
37:[c63<0.5] yes=67,no=68,missing=67,gain=3.84857941,cover=44.1217384
67:leaf=0.257277936,cover=42.8795776
68:leaf=-0.0592717826,cover=1.24215901
38:[c85<0.5] yes=69,no=70,missing=69,gain=7.20003128,cover=98.7117157
69:leaf=0.161803916,cover=95.7366409
70:leaf=-0.123544648,cover=2.97508097
4:[c9<0.5] yes=9,no=10,missing=9,gain=5.99909973,cover=211.257812
9:[c744<0.5] yes=19,no=20,missing=19,gain=5.7590332,cover=208.557236
19:[c1680<0.5] yes=39,no=40,missing=39,gain=5.66558838,cover=200.946335
39:[c1938<0.5] yes=71,no=72,missing=71,gain=5.28115845,cover=197.488068
71:leaf=0.237334177,cover=193.272552
72:leaf=0.0088988198,cover=4.21552372
40:[c1992<0.5] yes=73,no=74,missing=73,gain=3.19810057,cover=3.45826054
73:leaf=-0.156537995,cover=1.97446072
74:leaf=0.151286513,cover=1.48379982
20:[c2267<0.5] yes=41,no=42,missing=41,gain=5.3044486,cover=7.61089611
41:leaf=-0.232033879,cover=1.22718644
42:[c2243<0.5] yes=75,no=76,missing=75,gain=4.38424873,cover=6.38370943
75:leaf=0.197442099,cover=5.14841366
76:leaf=-0.141550943,cover=1.23529589
10:[c2040<0.5] yes=21,no=22,missing=21,gain=2.46140671,cover=2.70058417
21:leaf=0.107783794,cover=1.22893667
22:leaf=-0.186112493,cover=1.47164738
2:[c1272<0.5] yes=5,no=6,missing=5,gain=179.928802,cover=2911.61011
5:[c2196<0.5] yes=11,no=12,missing=11,gain=81.6617813,cover=2650.31323
11:[c1273<0.5] yes=23,no=24,missing=23,gain=25.5287323,cover=1665.37952
23:[c1271<0.5] yes=43,no=44,missing=43,gain=39.3595123,cover=1561.60522
43:[c2057<0.5] yes=77,no=78,missing=77,gain=19.5219879,cover=763.56073
77:leaf=-0.0642930269,cover=315.744019
78:leaf=-0.129576445,cover=447.816742
44:[c2036<0.5] yes=79,no=80,missing=79,gain=31.8942204,cover=798.044434
79:leaf=-0.0159163754,cover=597.888855
80:leaf=-0.108017288,cover=200.155563
24:[c1274<0.5] yes=45,no=46,missing=45,gain=9.23001671,cover=103.774307
45:[c1712<0.5] yes=81,no=82,missing=81,gain=8.56442451,cover=97.0483093
81:leaf=0.0528198071,cover=68.9394989
82:leaf=-0.0763231069,cover=28.1088161
46:[c1394<0.5] yes=83,no=84,missing=83,gain=1.092803,cover=6.72599506
83:leaf=0.290282816,cover=4.7329421
84:leaf=0.0696449354,cover=1.9930532
12:[c29<0.5] yes=25,no=26,missing=25,gain=28.1107922,cover=984.933838
25:[c927<0.5] yes=47,no=48,missing=47,gain=15.08564,cover=708.926697
47:[c1315<0.5] yes=85,no=86,missing=85,gain=14.1605644,cover=555.629639
85:leaf=-0.0119863721,cover=448.730988
86:leaf=-0.0927194133,cover=106.898651
48:[c2252<0.5] yes=87,no=88,missing=87,gain=8.26338768,cover=153.297028
87:leaf=0.0316573568,cover=145.855804
88:leaf=0.235712335,cover=7.44123077
26:[c2023<0.5] yes=49,no=50,missing=49,gain=12.1558895,cover=276.007141
49:[c1938<0.5] yes=89,no=90,missing=89,gain=11.8624725,cover=247.600189
89:leaf=0.0864320472,cover=236.885422
90:leaf=-0.120999791,cover=10.7147703
50:[c1712<0.5] yes=91,no=92,missing=91,gain=5.94400311,cover=28.4069595
91:leaf=0.019198468,cover=16.4460449
92:leaf=-0.160894528,cover=11.9609146
6:[c544<0.5] yes=13,no=14,missing=13,gain=12.1526718,cover=261.296814
13:[c2032<0.5] yes=27,no=28,missing=27,gain=6.42970276,cover=246.43045
27:[c118<0.5] yes=51,no=52,missing=51,gain=6.42436218,cover=235.034409
51:[c1671<0.5] yes=93,no=94,missing=93,gain=6.46975708,cover=233.799713
93:leaf=0.166887745,cover=215.226166
94:leaf=0.0412377529,cover=18.5735512
52:leaf=-0.198985606,cover=1.234694
28:[c2196<0.5] yes=53,no=54,missing=53,gain=7.55905104,cover=11.3960485
53:leaf=-0.253086418,cover=2.48001337
54:[c2272<0.5] yes=95,no=96,missing=95,gain=5.57294369,cover=8.91603565
95:leaf=0.170312315,cover=6.93561363
96:leaf=-0.155639395,cover=1.98042178
14:[c1057<0.5] yes=29,no=30,missing=29,gain=5.63266706,cover=14.8663702
29:[c793<0.5] yes=55,no=56,missing=55,gain=5.00396013,cover=11.8914061
55:[c2034<0.5] yes=97,no=98,missing=97,gain=4.62844086,cover=8.66823292
97:leaf=-0.0177141353,cover=5.20225906
98:leaf=0.256727427,cover=3.46597409
56:[c606<0.5] yes=99,no=100,missing=99,gain=1.0299964,cover=3.22317266
99:leaf=-0.237570032,cover=1.48697209
100:leaf=-0.0221777,cover=1.73620045
30:leaf=-0.24320662,cover=2.97496462