
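    上世纪八十年代Breiman等人发明分类树的算法(Breiman et al. 1984),通过反复二分数据进行分类或回归,
计算量大大降低。2001年Breiman把分类树组合成随机森林(Breiman 2001a),即在变量(列)的使用和数据(行)
的使用上进行随机化,生成很多分类树,再汇总分类树的结果。随机森林在运算量没有显著提高的前提下提高了预测精度,
对多元共线性不敏感,结果对缺失数据和非平衡的数据比较稳健,可以很好地预测多达几千个解释变量的作用
(Breiman 2001b),被誉为当前最好的算法之一(Iverson et al. 2008)。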

train_test_split是交叉验证中常用的函数,功能是从样本中随机地按比例选取train data和test data,形式为:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_data, train_target, test_size=0.4, random_state=0)
(注:自 scikit-learn 0.18 起该函数位于 sklearn.model_selection,cross_validation 模块已在 0.20 中移除。)

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.datasets import load_iris
# Minimal random-forest example on the iris data set: split, fit, predict,
# then print per-class precision / recall / F1.

# X = iris.data[:, pair]  # NOTE(review): leftover, presumably a feature-pair selection; unused here
# Load data
iris = load_iris()

# train_test_split is imported from sklearn.model_selection, matching the
# import used further down in this file.
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=33)

clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train, y_train)

predicted = clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         8
          1       0.79      1.00      0.88        11
          2       1.00      0.84      0.91        19

avg / total       0.94      0.92      0.92        38
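机器学习中学习算法的目标是优化(即最小化)损失函数(loss function)。Gradient boosting的思想是迭代生成多个(M个)弱模型,并把各个弱模型的预测结果相加。
为了防止过拟合,通常会给每个弱模型乘上比较小的缩减系数(学习率<0.1),有些GBDT的实现加入了随机抽样(subsample 0.5<=f<=0.8)提高模型的泛化能力。
以预测年龄为例,均方误差 = (每个人的年龄-预测年龄)^2 的总和 / N,或者说是每个人的预测误差平方和 除以 N,即 $\mathrm{MSE}=\frac{1}{N}\sum_{i=1}^{N}(y_i-\hat{y}_i)^2$。这很好理解,被预测出错的人数越多、错得越离谱,均方误差就越大。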


# -*- coding:utf-8 -*-

import numpy as np 

import pandas as pd 

import xgboost as xgb 

import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer,LabelEncoder
from sklearn import cross_validation, metrics 
from sklearn.model_selection import GridSearchCV,StratifiedKFold  
import matplotlib.pyplot as plt

# Seed NumPy's global random generator, then load the raw CSV files.
np.random.seed(19260817) # set a seed; let's see whether anyone on cnblogs gets the reference
pd_train = pd.read_csv('data/train.csv')
pd_test = pd.read_csv('data/test.csv')
pd_gender = pd.read_csv('data/result.csv')

# Show (rows, columns) of the train and test frames.
print(pd_train.shape, pd_test.shape)
# Number of training rows for every (Sex, Survived) combination.
sex_count = pd_train.groupby(['Sex', 'Survived'])['Survived'].count()
(891, 12) (418, 11)
Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64
# Sex: replace 'female' with 0 and 'male' with 1 in the Sex column, stored as int.
pd_train['Sex'] = pd_train['Sex'].map({'female': 0, 'male': 1}).astype(int)
# Build one indicator column per distinct Embarked value and join them onto the frame.
embark_dummies  = pd.get_dummies(pd_train['Embarked'])
pd_train = pd_train.join(embark_dummies)
# The raw Embarked column and PassengerId are removed in place.
pd_train.drop(['Embarked','PassengerId'], axis=1,inplace=True)
# Bucket Fare with fare_category().
# NOTE(review): fare_category is defined further down in this file; it must
# already be defined by the time this line runs.
pd_train['Fare_Category'] = pd_train['Fare'].map(fare_category)
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
# Convert categorical (object-dtype) columns to integer codes.
# A fresh LabelEncoder is fitted per column, so codes are only meaningful
# within their own column.
for f in pd_train.columns:
    if pd_train[f].dtype == 'object':
        label = LabelEncoder()
        label.fit(list(pd_train[f].values))
        pd_train[f] = label.transform(list(pd_train[f].values))

# Count missing values per column, most-missing first.
na_train = pd_train.isnull().sum().sort_values(ascending=False)

# Fill missing values with the column mean.
# The column labels are captured first because .values returns a bare array
# and the frame is rebuilt from that array below.
columns = pd_train.columns
train_data = pd_train.values

imput = Imputer(missing_values="NaN", strategy="mean", axis=0)

# fit_transform both learns the column means and applies them, so no
# separate fit step is needed.
train_data = imput.fit_transform(train_data)

pd_train = pd.DataFrame(train_data, index=None, columns=columns)

# Recount after imputation.
na_train = pd_train.isnull().sum().sort_values(ascending=False)

# print("After missing-value handling:")
# print(na_train)
# print(pd_train.head())

# Save the new data.
# NOTE(review): the path matches what load_data() reads; index=False is
# assumed so that no extra index column ends up among the features — confirm.
pd_train.to_csv('data/new_train.csv', index=False)


Age              177
Fare_Category      0
S                  0
Q                  0
C                  0
Cabin              0
Fare               0
Ticket             0
Parch              0
SibSp              0
Sex                0
Name               0
Pclass             0
Survived           0
dtype: int64
def fare_category(fare):
    """Map a ticket fare to an ordinal price bucket.

    Buckets (upper bounds are inclusive):
        fare <= 4   -> 0
        fare <= 10  -> 1
        fare <= 30  -> 2
        fare <= 45  -> 3
        otherwise   -> 4

    Args:
        fare: numeric fare value.

    Returns:
        int in the range 0..4.
    """
    if fare <= 4:
        return 0
    if fare <= 10:
        return 1
    if fare <= 30:
        return 2
    if fare <= 45:
        return 3
    # Everything above 45 lands in the top bucket. This is also the
    # fall-through for values where every comparison above is False.
    return 4

def load_data():
    """Load the preprocessed training set and split it for evaluation.

    Reads ``data/new_train.csv``, uses the ``Survived`` column as the target
    and every other column as a feature, then holds out 20% of the rows
    (``random_state=7`` keeps the split repeatable).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    train_data = pd.read_csv('data/new_train.csv')

    # axis is passed by keyword: the positional form drop(labels, 1) is no
    # longer accepted by recent pandas releases.
    X = train_data.drop(['Survived'], axis=1)
    y = train_data['Survived']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

    return X_train, X_test, y_train, y_test

def calc_accuracy(y_pred, y_true):
    """Score predictions against the true labels.

    Args:
        y_pred: predicted labels.
        y_true: ground-truth labels.

    Returns:
        tuple: ``(accuracy, rmse)`` where accuracy comes from
        ``metrics.accuracy_score(y_true, y_pred)`` and rmse is the square
        root of the mean squared element-wise difference ``y_pred - y_true``.
    """
    hit_rate = metrics.accuracy_score(y_true, y_pred)

    # Computed by hand; np.sqrt(metrics.mean_squared_error(...)) would be
    # the library-based alternative.
    residual = y_pred - y_true
    root_mean_square = np.sqrt(np.mean(residual ** 2))

    return hit_rate, root_mean_square

def total_survival(y_pred):
    """Count how many entries of *y_pred* are equal to 1.

    Args:
        y_pred: iterable of predicted labels.

    Returns:
        int: number of elements that compare equal to 1.
    """
    return sum(1 for label in y_pred if label == 1)

def train():
    """Run the three training routines and print one summary line per model.

    Calls ``train_logreistic()``, ``train_randomForster()`` and
    ``train_XGBoost()`` in that order; each result is unpacked into three
    values that are printed as accuracy, RMS and survivor count.
    Returns None.
    """
    logreg_acc, logreg_rms, logreg_alive = train_logreistic()
    forest_acc, forest_rms, forest_alive = train_randomForster()
    boost_acc, boost_rms, boost_alive = train_XGBoost()

    # All three models are trained before anything is printed.
    summary_rows = (
        ("LogisticRegression acc_rate:{0:.4f},RMS:{1:.4f},存活:{2}",
         (logreg_acc, logreg_rms, logreg_alive)),
        ("RandomForestClassifier acc_rate:{0:.4f},RMS:{1:.4f},存活:{2}",
         (forest_acc, forest_rms, forest_alive)),
        ("XGBClassifier acc_rate:{0:.4f},RMS:{1:.4f},存活:{2}",
         (boost_acc, boost_rms, boost_alive)),
    )
    for template, values in summary_rows:
        print(template.format(*values))

    # Disabled grouped-bar-chart sketch kept from the original. The three
    # plotting calls are truncated in this copy of the file.
    # size = 3
    # total_width, n = 0.8, 3
    # width = total_width / n
    # x = np.arange(size)
    # x = x - (total_width - width) / 2
    # a = [logreg_acc, forest_acc, boost_acc]
    # b = [logreg_rms, forest_rms, boost_rms]
    # c = [logreg_alive, forest_alive, boost_alive]
    #, a,  width=width, label='a')
    # + width, b, width=width, label='b')
    # + 2 * width, c, width=width, label='c')
    # plt.legend()





# --- Logistic regression ---
X_train, X_test, y_train, y_test = load_data()

# L2-penalised logistic regression fitted on the training split.
model = LogisticRegression(penalty='l2')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rfc_rate, rmse = calc_accuracy(y_pred, y_test)
print(rfc_rate, rmse )
total = total_survival(y_pred)
print(total )

from sklearn.metrics import classification_report
# NOTE(review): the label is misspelled ("Accruacy"); it is kept unchanged so
# the line still matches the recorded output below.
print('Accruacy of LR Classifier:',model.score(X_test,y_test))
# Use classification_report to obtain recall, precision and F1.
print(classification_report(y_test, y_pred))

0.7597765363128491 0.4901259671626783
Accruacy of LR Classifier: 0.7597765363128491
             precision    recall  f1-score   support

        0.0       0.79      0.84      0.81       110
        1.0       0.71      0.64      0.67        69

avg / total       0.76      0.76      0.76       179
# --- Random forest ---
X_train, X_test, y_train, y_test = load_data()

# 500 trees, depth capped at 6, fixed seed for repeatable results.
model = RandomForestClassifier(n_estimators=500, max_depth=6, random_state=7)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rfc_rate, rmse = calc_accuracy(y_pred, y_test)

total = total_survival(y_pred)

# Result noted by the author from an earlier run:
# RandomForestClassifier acc_rate:82.6816,RMS:0.4162,survivors:54

print(rfc_rate, rmse )

print(total )
from sklearn.metrics import classification_report
# NOTE(review): the label says "LR" (and misspells "Accuracy") although the
# model is a random forest; it is kept unchanged so the line still matches
# the recorded output below.
print('Accruacy of LR Classifier:',model.score(X_test,y_test))
# Use classification_report to obtain recall, precision and F1.
print(classification_report(y_test, y_pred))
0.7988826815642458 0.4484610556511615
Accruacy of LR Classifier: 0.7988826815642458
             precision    recall  f1-score   support

        0.0       0.79      0.92      0.85       110
        1.0       0.82      0.61      0.70        69

avg / total       0.80      0.80      0.79       179

# --- XGBoost ---

X_train, X_test, y_train, y_test = load_data()

# NOTE(review): the constructor call was cut off after `objective=...,` in
# this copy of the file; further keyword arguments may have been lost.
model = xgb.XGBClassifier(max_depth=8, learning_rate=0.06, n_estimators=100,
                          objective="binary:logistic")

# Reference signature noted by the author:
# class xgboost.DMatrix(data, label=None, missing=None, weight=None, silent=False,
#                       feature_names=None, feature_types=None, nthread=None)

# The held-out split is used as the early-stopping validation set.
# NOTE(review): the same split is scored below, so the reported accuracy is
# optimistic; a separate validation split would avoid that.
eval_data = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_data, early_stopping_rounds=30)

y_pred = model.predict(X_test)

rfc_rate, rmse = calc_accuracy(y_pred, y_test)

total = total_survival(y_pred)

print(rfc_rate, rmse )

print(total )

# Result noted by the author from an earlier run:
# XGBClassifier acc_rate:80.4469,RMS:0.4422,survivors:56

from sklearn.metrics import classification_report
# NOTE(review): the label says "LR" (and misspells "Accuracy") although the
# model is XGBoost; it is kept unchanged so the line still matches the
# recorded output below.
print('Accruacy of LR Classifier:',model.score(X_test,y_test))
# Use classification_report to obtain recall, precision and F1.
print(classification_report(y_test, y_pred))
[0] validation_0-error:0.240223
Will train until validation_0-error hasn't improved in 30 rounds.
[1] validation_0-error:0.240223
[2] validation_0-error:0.240223
[3] validation_0-error:0.240223
[4] validation_0-error:0.240223
[5] validation_0-error:0.22905
[6] validation_0-error:0.22905
[7] validation_0-error:0.22905
[8] validation_0-error:0.234637
[9] validation_0-error:0.234637
[10]    validation_0-error:0.223464
[11]    validation_0-error:0.223464
[12]    validation_0-error:0.223464
[13]    validation_0-error:0.223464
[14]    validation_0-error:0.22905
[15]    validation_0-error:0.223464
[16]    validation_0-error:0.22905
[17]    validation_0-error:0.223464
[18]    validation_0-error:0.22905
[19]    validation_0-error:0.22905
[20]    validation_0-error:0.223464
[21]    validation_0-error:0.22905
[22]    validation_0-error:0.22905
[23]    validation_0-error:0.22905
[24]    validation_0-error:0.22905
[25]    validation_0-error:0.22905
[26]    validation_0-error:0.217877
[27]    validation_0-error:0.217877
[28]    validation_0-error:0.217877
[29]    validation_0-error:0.21229
[30]    validation_0-error:0.201117
[31]    validation_0-error:0.21229
[32]    validation_0-error:0.206704
[33]    validation_0-error:0.206704
[34]    validation_0-error:0.201117
[35]    validation_0-error:0.195531
[36]    validation_0-error:0.189944
[37]    validation_0-error:0.189944
[38]    validation_0-error:0.195531
[39]    validation_0-error:0.195531
[40]    validation_0-error:0.184358
[41]    validation_0-error:0.189944
[42]    validation_0-error:0.184358
[43]    validation_0-error:0.189944
[44]    validation_0-error:0.184358
[45]    validation_0-error:0.184358
[46]    validation_0-error:0.184358
[47]    validation_0-error:0.189944
[48]    validation_0-error:0.189944
[49]    validation_0-error:0.184358
[50]    validation_0-error:0.189944
[51]    validation_0-error:0.189944
[52]    validation_0-error:0.189944
[53]    validation_0-error:0.189944
[54]    validation_0-error:0.184358
[55]    validation_0-error:0.184358
[56]    validation_0-error:0.184358
[57]    validation_0-error:0.178771
[58]    validation_0-error:0.173184
[59]    validation_0-error:0.184358
[60]    validation_0-error:0.167598
[61]    validation_0-error:0.173184
[62]    validation_0-error:0.173184
[63]    validation_0-error:0.173184
[64]    validation_0-error:0.173184
[65]    validation_0-error:0.173184
[66]    validation_0-error:0.173184
[67]    validation_0-error:0.173184
[68]    validation_0-error:0.173184
[69]    validation_0-error:0.173184
[70]    validation_0-error:0.173184
[71]    validation_0-error:0.173184
[72]    validation_0-error:0.173184
[73]    validation_0-error:0.173184
[74]    validation_0-error:0.173184
[75]    validation_0-error:0.173184
[76]    validation_0-error:0.173184
[77]    validation_0-error:0.173184
[78]    validation_0-error:0.173184
[79]    validation_0-error:0.173184
[80]    validation_0-error:0.167598
[81]    validation_0-error:0.167598
[82]    validation_0-error:0.167598
[83]    validation_0-error:0.167598
[84]    validation_0-error:0.167598
[85]    validation_0-error:0.167598
[86]    validation_0-error:0.167598
[87]    validation_0-error:0.167598
[88]    validation_0-error:0.167598
[89]    validation_0-error:0.167598
[90]    validation_0-error:0.167598
Stopping. Best iteration:
[60]    validation_0-error:0.167598

0.8324022346368715 0.40938706057120133
Accruacy of LR Classifier: 0.8324022346368715
             precision    recall  f1-score   support

        0.0       0.82      0.94      0.87       110
        1.0       0.87      0.67      0.75        69

avg / total       0.84      0.83      0.83       179

{'reg_alpha': 0.1}

C:\anaconda3\lib\site-packages\sklearn\preprocessing\ DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\anaconda3\lib\site-packages\sklearn\preprocessing\ DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:




