XGBoost demo


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 19 13:19:26 2017

@author: luogan
"""

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import matplotlib.pyplot as plt

# Load the training set and drop the unnamed index column written by to_csv.
train = pd.read_csv('train.csv')
train = train.drop(train.columns[0], axis=1)

print('load train success')
train = shuffle(train)

# Columns excluded from the feature matrix; 'label' is the prediction target.
target = ['label', 'code', 'date']
#train = train.iloc[:100000]

# Load the test set and drop its own index column.
test = pd.read_csv('test.csv')
test = test.drop(test.columns[0], axis=1)

print('load test success')

# Feature matrices: everything except the label and identifier columns.
x_train = train.drop(target, axis=1).values
y_train = train['label'].values
x_test = test.drop(target, axis=1).values

# Optional z-score standardization (left unused; tree ensembles are
# insensitive to feature scaling):
'''
train1 = train.drop('label', axis=1)
test1 = test.drop('label', axis=1)
mean = train1.mean()
std = train1.std()
x_train_mt = (train1 - mean) / std
x_test_mt = (test1 - mean) / std
'''
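# The same standardization via scikit-learn's StandardScaler (a minimal
# sketch, kept commented out like the block above since scaling is not
# needed for xgboost; it fits the scaler on train only, then applies it
# to both splits to avoid test-set leakage):
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler().fit(train.drop(target, axis=1))
#x_train = scaler.transform(train.drop(target, axis=1))
#x_test = scaler.transform(test.drop(target, axis=1))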


#XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='binary:logistic',
#              booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, 
#              colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
#              base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)


clf = XGBClassifier(
#    n_estimators=10,     # number of boosting iterations
#    learning_rate=0.03,  # step size
#    max_depth=6,         # maximum tree depth

    n_estimators=300,     # number of boosted trees to fit
    learning_rate=0.01,   # boosting learning rate (xgb's "eta")
    max_depth=30,         # maximum tree depth for base learners

    min_child_weight=1,   # minimum sum of instance weight needed in a child
    silent=1,             # 1 suppresses running messages
    subsample=0.8,        # fraction of samples used per tree (row subsampling)
    colsample_bytree=0.4, # fraction of features sampled per tree (typical: 0.5-1)
    objective='multi:softmax',  # multi-class classification, predicts class labels
    num_class=5,
#    nthread=50,
    n_jobs=50,            # number of parallel threads
    max_delta_step=10,
    reg_lambda=1,         # L2 regularization weight
    reg_alpha=0,          # L1 regularization weight
    seed=27)
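# A common variant not used in this post (sketch): hold out an eval set and
# stop boosting when the validation metric plateaus. In the 2017-era xgboost
# sklearn API, early_stopping_rounds is a fit() argument; in xgboost >= 1.6
# it moved to the XGBClassifier constructor.
#clf.fit(x_train, y_train,
#        eval_set=[(x_test, test['label'].values)],
#        early_stopping_rounds=50,
#        verbose=True)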

print ("training...")
clf.fit(x_train, y_train,  verbose=True)

clf.save_model('tree300.model')

print('training is ok')
fit_pred = clf.predict(x_test)
print (fit_pred)

y_test = test['label'].values

# Fraction of test samples predicted correctly.
count = 0
for i in range(len(fit_pred)):
    if fit_pred[i] == y_test[i]:
        count += 1
print("accuracy:", count / len(y_test))

'''
# Reload the saved booster and score it by hand. Note: with
# objective='multi:softmax', Booster.predict returns class labels directly;
# the argmax below is only needed if the model was trained with
# 'multi:softprob', which returns per-class probabilities.
y_test = test['label'].values
tar = xgb.Booster(model_file='tree300.model')
x_test1 = xgb.DMatrix(x_test)
fit_pred1 = tar.predict(x_test1)
count = 0
for i in range(len(fit_pred1)):
    arg = np.argmax(fit_pred1[i])
    if arg == y_test[i]:
        count += 1
print("accuracy:", count / len(y_test))
'''
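# Reloading through the sklearn wrapper instead of a raw Booster (a sketch;
# XGBClassifier.load_model is only available in newer xgboost releases):
#clf2 = XGBClassifier()
#clf2.load_model('tree300.model')
#print("reloaded accuracy:", (clf2.predict(x_test) == y_test).mean())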

# Map importances back to column names for a readable ranking.
feature_names = list(train.drop(target, axis=1).columns)

ww = clf.feature_importances_
print(ww)
feat_imp = pd.Series(ww, index=feature_names).sort_values(ascending=False)

feat_imp.to_excel('feature_importance.xlsx')

#print(feat_imp)
#plt.gcf().set_size_inches(20, 10)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()
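# xgboost also ships its own importance plot, which reads the scores straight
# from the trained model (an alternative to the pandas bar chart above):
#xgb.plot_importance(clf, max_num_features=20)
#plt.show()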