Usage of np.split()
np.split(m, (3,), axis=1) splits m along axis 1 (by columns) at index 3, returning the first three columns and the remaining columns as two separate arrays; a short sketch follows.
Iris dataset overview: each row of iris.data holds four feature values followed by the class name, separated by commas.
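As a minimal sketch of the column split used below (the toy array here is a made-up stand-in, not the real iris data):

# Splitting a 2-D array into feature columns and a label column.
import numpy as np

m = np.array([[5.1, 3.5, 1.4, 0.2, 0],
              [7.0, 3.2, 4.7, 1.4, 1],
              [6.3, 3.3, 6.0, 2.5, 2]])
x, y = np.split(m, (4,), axis=1)   # x: columns 0-3 (features), y: column 4 (label)
print(x.shape, y.shape)            # (3, 4) (3, 1)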
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split  # formerly sklearn.cross_validation

def iris_type(s):
    # Map the byte-string class names in iris.data to integer labels 0/1/2.
    it = {b'Iris-setosa': 0,
          b'Iris-versicolor': 1,
          b'Iris-virginica': 2}
    return it[s]

if __name__ == "__main__":
    path = 'C:/Users/lb/Desktop/test/iris.data'  # path to the data file
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})

    # Columns 0-3 are the features, column 4 is the label.
    x, y = np.split(data, (4,), axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)

    # Evaluate on both the held-out set and the training set during boosting.
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # 'silent' is deprecated in newer XGBoost releases in favour of 'verbosity'.
    param = {'max_depth': 2, 'eta': 0.3, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}

    bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
    y_hat = bst.predict(data_test)
    result = y_test.reshape(1, -1) == y_hat
    print('accuracy:\t', float(np.sum(result)) / len(y_hat))
    print('END.....\n')
The 3 in the num_class parameter means there are three classes, because the dataset contains three categories (Iris-setosa, Iris-versicolor, Iris-virginica).
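As a minimal sketch of the same point (assuming the data_train/data_test DMatrix objects and y_train from the script above are still in scope): num_class can be derived from the labels, and the related 'multi:softprob' objective returns one probability per class instead of a hard label.

# num_class must equal the number of distinct labels (0, 1, 2 for iris).
num_class = len(np.unique(y_train))   # -> 3

# 'multi:softmax' predicts hard class indices; 'multi:softprob' predicts
# an (n_samples, num_class) array of per-class probabilities.
param_prob = {'max_depth': 2, 'eta': 0.3, 'objective': 'multi:softprob', 'num_class': num_class}
bst_prob = xgb.train(param_prob, data_train, num_boost_round=6)
prob = bst_prob.predict(data_test)    # shape: (n_test, 3)
y_hat_prob = prob.argmax(axis=1)      # collapse back to hard class labels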
Wine dataset overview: the first column is the class label (1/2/3) and the remaining 13 columns are the features.
Comparing Logistic Regression with XGBoost:
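A quick look at the file layout before running the comparison (a sketch that assumes the same './data/wine.data' path used in the script below):

import numpy as np

data = np.loadtxt('./data/wine.data', dtype=float, delimiter=',')
print(data.shape)             # (178, 14): one label column followed by 13 feature columns
print(np.unique(data[:, 0]))  # the three class labels: 1., 2., 3.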
#!/usr/bin/python
# -*- encoding:utf-8 -*-
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split  # formerly sklearn.cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings("ignore")

def show_accuracy(a, b, tip):
    # Element-wise comparison of predictions a against true labels b.
    acc = a.ravel() == b.ravel()
    print(acc)
    print(tip + 'accuracy:\t', float(acc.sum()) / a.size)

if __name__ == "__main__":
    data = np.loadtxt('./data/wine.data', dtype=float, delimiter=',')
    # (1,): the first column is the label; the remaining 13 columns are the features.
    y, x = np.split(data, (1,), axis=1)  # split by columns
    # x = StandardScaler().fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.5)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')  # L2-regularized LR
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic regression ')

    # XGBoost
    y_train[y_train == 3] = 0  # relabel class 3 as 0, since XGBoost expects labels in [0, num_class)
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # Multi-class objective
    param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
    y_hat = bst.predict(data_test)
    show_accuracy(y_hat, y_test, 'XGBoost ')
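For comparison only, the same XGBoost model can also be fit through the scikit-learn style wrapper; this is a sketch under the assumption that xgb.XGBClassifier is available in the installed version and that the labels have already been remapped to 0/1/2 as above.

# Sketch: scikit-learn-style interface instead of the native DMatrix/xgb.train API.
# Hyperparameters mirror the param dict above; the wrapper infers the number of
# classes from y, so num_class is not passed explicitly.
clf = xgb.XGBClassifier(max_depth=3, learning_rate=1.0, n_estimators=4,
                        objective='multi:softmax')
clf.fit(x_train, y_train.ravel())
y_hat_sk = clf.predict(x_test)
show_accuracy(y_hat_sk, y_test, 'XGBClassifier ')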