On Boosting: XGBoost

1. Boosting

Boosting is a machine learning technique that can be applied to both regression and classification problems. At each step it produces a weak prediction model and adds it, with a weight, to the overall model; if each weak model is generated along the (negative) gradient direction of the loss function, the method is called gradient boosting.
The theoretical significance of boosting: if a weak learner exists for a problem, it can be turned into a strong learner by boosting.

2. The Boosting Algorithm

When the sum of squared errors is used as the objective, the sample mean is the optimal initial prediction.
When the absolute error is used as the objective, the median is the optimal initial prediction (the median is the middle value when the samples are sorted). A small numerical check of both claims is sketched below.
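A minimal numerical check (my own sketch, not from the original text) of these two facts, using a brute-force search over constant predictions:

import numpy as np

# Toy samples; we look for the single constant prediction c that minimizes each loss.
y = np.array([1.0, 2.0, 2.0, 3.0, 10.0])
c = np.linspace(0.0, 10.0, 10001)                         # candidate constant predictions
sq_loss = ((y[:, None] - c[None, :]) ** 2).sum(axis=0)    # sum of squared errors for each c
abs_loss = np.abs(y[:, None] - c[None, :]).sum(axis=0)    # sum of absolute errors for each c

print('squared loss minimized at', c[sq_loss.argmin()], '; mean   =', y.mean())        # ~3.6
print('absolute loss minimized at', c[abs_loss.argmin()], '; median =', np.median(y))  # 2.0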

Compared with traditional GBDT, XGBoost uses second-order information (both the gradient and the Hessian of the loss), so it converges faster on the training set. Like other tree ensembles in the "random-forest family", it can still overfit to some degree. The implementation makes heavy use of parallel/multi-core computation, so training is fast; its native implementation is in C/C++. In general, XGBoost is faster and performs better than sklearn.ensemble.GradientBoostingClassifier.
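Concretely, "second-order information" means that at each boosting round the loss is expanded to second order around the current prediction, so growing the next tree only needs the per-sample gradient g_i and Hessian h_i; this is exactly what the log_reg function in the first script below returns for the logistic loss. A sketch of the standard expansion (written from the usual XGBoost derivation, not copied from the original text):

g_i = \frac{\partial\, l(y_i, \hat y_i^{(t-1)})}{\partial \hat y_i^{(t-1)}}, \qquad
h_i = \frac{\partial^2 l(y_i, \hat y_i^{(t-1)})}{\partial (\hat y_i^{(t-1)})^2}

\mathrm{Obj}^{(t)} \approx \sum_{i=1}^{n}\Big[\, l(y_i, \hat y_i^{(t-1)}) + g_i\, f_t(x_i) + \tfrac{1}{2}\, h_i\, f_t^2(x_i) \Big] + \Omega(f_t)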

The AdaBoost algorithm can be viewed as a boosting method that uses the exponential loss function.
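For reference, the exponential loss in question is (standard formula, added here):

L(y, f(x)) = \exp\big(-y\, f(x)\big), \qquad y \in \{-1, +1\}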

Bagging reduces variance and works well for ensembling learners such as unpruned decision trees and neural networks;
Boosting reduces bias and can build a strong learner out of learners with weak generalization ability. A small sketch contrasting the two follows.
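A minimal sketch of this contrast (my own addition; it uses scikit-learn's BaggingClassifier and AdaBoostClassifier on a synthetic dataset, none of which appear in the original scripts):

from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

x, y = make_classification(n_samples=1000, n_features=20, random_state=1)

# Bagging: average many unpruned (high-variance) trees to reduce variance.
bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=1)
# Boosting: combine many shallow (high-bias) stumps to reduce bias.
boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=1)

print('Bagging accuracy :', cross_val_score(bag, x, y, cv=5).mean())
print('Boosting accuracy:', cross_val_score(boost, x, y, cv=5).mean())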

Code: http://github.com/dmlc/xgboost/

3. Code

The results obtained here will look very good, but they are essentially the result of overfitting.
xgBoost_Intro.py

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np

# 1. Basic usage of xgBoost
# 2. Gradient and second derivative (Hessian) of a custom loss function
# 3. binary:logistic / binary:logitraw


# Define f: theta * x -- custom objective returning the gradient and Hessian of the logistic loss
def log_reg(y_hat, y):
    p = 1.0 / (1.0 + np.exp(-y_hat))  # sigmoid of the raw score
    g = p - y.get_label()             # gradient: predicted probability minus true label
    h = p * (1.0 - p)                 # Hessian: second derivative of the log loss
    return g, h


def error_rate(y_hat, y):
    # Custom evaluation metric: fraction of misclassified samples
    return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat)


if __name__ == "__main__":
    # Load the training and test data
    data_train = xgb.DMatrix('12.agaricus_train.txt')
    data_test = xgb.DMatrix('12.agaricus_test.txt')

    # Parameters: maximum tree depth, the shrinkage rate eta (helps prevent overfitting), and whether messages are printed during tree construction
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logitraw'}  # logitraw works on raw scores, which the custom objective needs
    # param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'reg:logistic'}
    watchlist = [(data_test, 'eval'), (data_train, 'train')]
    n_round = 3  # number of boosting rounds (trees)
    # bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)
    # Train with the custom objective (obj) and custom evaluation function (feval)
    bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg, feval=error_rate)

    # Compute the error rate on the test set
    y_hat = bst.predict(data_test)
    y = data_test.get_label()  # ground-truth labels
    print(y_hat)
    print(y)
    error = sum(y != (y_hat > 0))  # raw margin scores, so threshold at 0
    error_rate = float(error) / len(y_hat)
    print('Number of samples:\t', len(y_hat))
    print('Number of errors:\t%4d' % error)
    print('Error rate:\t%.5f%%' % (100 * error_rate))
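Because binary:logitraw (and the custom objective) produces raw margin scores rather than probabilities, the predictions above are thresholded at 0. A small follow-up sketch (my own addition, reusing bst, data_test and np from the script above) maps the raw scores to probabilities:

scores = bst.predict(data_test)         # raw margins under logitraw / the custom objective
proba = 1.0 / (1.0 + np.exp(-scores))   # sigmoid turns margins into probabilities
labels = (proba > 0.5).astype(int)      # same decisions as thresholding the margin at 0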

xgBoost_Predict.py

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split   # cross_validation


def iris_type(s):
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]


if __name__ == "__main__":
    path = u'..\\8.Regression\\8.iris.data'  # path to the data file
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    x, y = np.split(data, (4,), axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)

    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 3, 'eta': 1, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}  # parameters for 3-class softmax

    bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)  # 6 boosting rounds
    y_hat = bst.predict(data_test)
    result = y_test.reshape(1, -1) == y_hat  # a prediction is correct where the values match
    print('Accuracy:\t', float(np.sum(result)) / len(y_hat))
    print('END.....\n')
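multi:softmax returns the predicted class index directly; when class probabilities are needed, xgboost also offers the multi:softprob objective. A minimal sketch of that variant (my own addition, reusing param, data_train, data_test, watch_list and np from the script above):

param_prob = dict(param, objective='multi:softprob')   # same parameters, probability output
bst_prob = xgb.train(param_prob, data_train, num_boost_round=6, evals=watch_list)
prob = bst_prob.predict(data_test).reshape(-1, 3)      # one row of class probabilities per sample
y_hat_prob = np.argmax(prob, axis=1)                   # pick the most probable class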

xgBoost_Wine.py

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split   # cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    print(acc)
    print(tip + 'accuracy:\t', float(acc.sum()) / a.size)


if __name__ == "__main__":
    data = np.loadtxt('12.wine.data', dtype=float, delimiter=',')  # float values, comma-separated
    y, x = np.split(data, (1,), axis=1)  # split by column: the first column is the label y, the rest are features
    # x = StandardScaler().fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.5)  # split the data with the given test proportion

    # Method 1: Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic regression ')

    # Method 2: XGBoost; class labels must start from 0
    y_train[y_train == 3] = 0
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)  # wrap the data in a DMatrix
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}  # 3-class classification
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)  # 4 boosting rounds
    y_hat = bst.predict(data_test)
    show_accuracy(y_hat, y_test, 'XGBoost ')  # show the accuracy

Reading the data by hand, as in the script below, can be replaced by loading or wrapping the file directly (e.g. with DMatrix); a sketch of this alternative follows the script.
xgBoost_Readdata.py

#!/usr/bin/python
# -*- coding:utf-8 -*-

import xgboost as xgb
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


def read_data(path):  # the argument is the file path
    y = []
    row = []
    col = []
    values = []
    r = 0       # current row index
    for d in open(path):
        d = d.strip().split()      # split on whitespace
        y.append(int(d[0]))        # the first field is the label
        d = d[1:]
        for c in d:  # store the remaining key:value fields
            key, value = c.split(':')
            row.append(r)                 # row index
            col.append(int(key))          # column index
            values.append(float(value))   # feature value
        r += 1
    x = scipy.sparse.csr_matrix((values, (row, col))).toarray()  # build a sparse matrix, then convert it to a dense array
    y = np.array(y)  # labels as a numpy array
    return x, y


def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    print(acc)
    print(tip + 'accuracy:\t', float(acc.sum()) / a.size)


if __name__ == '__main__':
    x, y = read_data('12.agaricus_train.txt')
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    show_accuracy(y_hat, y_test, 'Logistic regression ')

    # XGBoost (the label remap below is carried over from the wine example;
    # agaricus labels are already 0/1, so it has no effect here)
    y_train[y_train == 3] = 0
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
    y_hat = bst.predict(data_test)
    show_accuracy(y_hat, y_test, 'XGBoost ')
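As noted before the script, the hand-written read_data parser is not strictly necessary: the agaricus files are in libsvm format, so they can be loaded directly. A possible alternative (my own sketch; it uses sklearn.datasets.load_svmlight_file, which the original script does not):

from sklearn.datasets import load_svmlight_file

x_sparse, y = load_svmlight_file('12.agaricus_train.txt')   # parse the libsvm-format file
x = x_sparse.toarray()                                       # dense array, like read_data()'s output
# Or let xgboost read the file itself, as in the first script:
# data_train = xgb.DMatrix('12.agaricus_train.txt')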

Titanic passenger survival prediction
During data cleaning, missing values can also be predicted with a random forest (a random forest can do both classification and regression).

Titanic.py

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv


def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    # print('%s accuracy: %.3f%%' % (tip, acc_rate))
    return acc_rate


def load_data(file_name, is_train):  # is_train distinguishes training data from test data
    data = pd.read_csv(file_name)  # read the csv data file with pandas
    # print(data.describe())  # prints summary statistics (mean, variance, etc.)

    # Sex: map the categorical feature to an integer
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fare: fill in missing fares; the feature is fairly important and few values are missing, so use the median fare of the corresponding passenger class
    if len(data.Fare[data.Fare.isnull()]) > 0:
        fare = np.zeros(3)
        for f in range(0, 3):  # drop missing values, then take the per-class median
            fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
        for f in range(0, 3):  # loop 0 to 2
            data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]

    # Age: the mean (or median) could replace missing values, but that is not very accurate
    # mean_age = data['Age'].dropna().mean()
    # data.loc[(data.Age.isnull()), 'Age'] = mean_age
    if is_train:
        # Age: predict the missing ages with a random forest
        print('Predicting missing ages with a random forest: --start--')
        data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]   # rows where the age is known
        age_null = data_for_age.loc[(data.Age.isnull())]     # rows where the age is missing
        # print(age_exist)
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)  # age is continuous, so use random forest regression
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])  # predicted ages
        # print age_hat
        data.loc[(data.Age.isnull()), 'Age'] = age_hat  # fill the missing ages with the predictions
        print('Predicting missing ages with a random forest: --over--')
    else:
        print('Predicting missing ages with a random forest (test set): --start--')
        data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]  # rows where the age is known
        age_null = data_for_age.loc[(data.Age.isnull())]
        # print age_exist
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        # print age_hat
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print('Predicting missing ages with a random forest (test set): --over--')

    # Embarkation port: a few values are missing. A simple approach is to fill them with the most common port; they could also be inferred from other features such as age.
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'  # fill missing ports with 'S', the most common one
    # data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
    # print data['Embarked']
    embarked_data = pd.get_dummies(data.Embarked)
    # print embarked_data
    # embarked_data = embarked_data.rename(columns={'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown', 'U': 'UnknownCity'})
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))  # prefix the one-hot columns, i.e. encode the port as three new 0/1 features
    data = pd.concat([data, embarked_data], axis=1)
    print(data.describe())
    data.to_csv('New_Data.csv')  # save the cleaned data

    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]  # including the three new one-hot features
    # x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    y = None
    if 'Survived' in data:
        y = data['Survived']

    x = np.array(x)
    y = np.array(y)

    # Think about it: what actually happens when the data are tiled like this?
    # (Each sample is replicated 5 times, so duplicates land in both the train and test splits below, which inflates the reported accuracy.)
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5, ))
    if is_train:
        return x, y
    return x, data['PassengerId']


def write_result(c, c_type):
    file_name = '12.Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)

    if c_type == 3:  # the XGBoost model expects a DMatrix
        x = xgb.DMatrix(x)
    y = c.predict(x)
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0

    predictions_file = open("Prediction_%d.csv" % c_type, "w")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(passenger_id, y))
    predictions_file.close()


if __name__ == "__main__":
    x, y = load_data('12.Titanic.train.csv', True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1)  # split the data
	
    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train)
    y_hat = lr.predict(x_test)
    lr_rate = show_accuracy(y_hat, y_test, 'Logistic regression ')
    # write_result(lr, 1)

    # Random forest
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    y_hat = rfc.predict(x_test)
    rfc_rate = show_accuracy(y_hat, y_test, 'Random forest ')
    # write_result(rfc, 2)

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 3, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}  # binary classification
             # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)  # 100 boosting rounds
    y_hat = bst.predict(data_test)  # predict on the test set
    
    # write_result(bst, 3)
    y_hat[y_hat > 0.5] = 1  # binary:logistic outputs probabilities in [0, 1], so threshold at 0.5
    y_hat[~(y_hat > 0.5)] = 0
    xgb_rate = show_accuracy(y_hat, y_test, 'XGBoost ')

    print('Logistic regression: %.3f%%' % lr_rate)
    print('Random forest: %.3f%%' % rfc_rate)
    print('XGBoost: %.3f%%' % xgb_rate)
