这里测试了使用 AdaBoost 的效果,代码如下:
#coding=utf-8
import pandas as pd
from pandas import Series,DataFrame
import random
import numpy as np
from datetime import date
import datetime as dt
from numpy import nan as NA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings("ignore")
# Load the training and test sets (the first row of each CSV is the header).
traindata = pd.read_csv("train.csv",header=0)
testdata = pd.read_csv("test.csv",header=0)
# Fill missing ages with a fixed placeholder of 30. A single .loc call writes
# into the frame itself; the chained form `df.Age[mask] = 30` may assign to a
# temporary copy and silently leave the NaNs in place.
traindata.loc[traindata.Age.isnull(), 'Age'] = 30
print(traindata.Age.describe())
testdata.loc[testdata.Age.isnull(), 'Age'] = 30
print(testdata.Age.describe())
# Add an age-group column Age1: 0 when Age < 12, 2 when Age > 50,
# and 1 (the default) for everything in between.
traindata['Age1'] = 1
testdata['Age1'] = 1
# Use .loc so the assignment targets the frame itself rather than a possibly
# temporary copy produced by chained indexing (df.Age1[mask] = ...).
traindata.loc[traindata.Age < 12, 'Age1'] = 0
testdata.loc[testdata.Age < 12, 'Age1'] = 0
traindata.loc[traindata.Age > 50, 'Age1'] = 2
testdata.loc[testdata.Age > 50, 'Age1'] = 2
# Add a "many people" flag column Parch1: 1 when Parch > 1, otherwise 0.
traindata['Parch1'] = 0
testdata['Parch1'] = 0
# Use .loc so the assignment targets the frame itself rather than a possibly
# temporary copy produced by chained indexing (df.Parch1[mask] = ...).
traindata.loc[traindata.Parch > 1, 'Parch1'] = 1
testdata.loc[testdata.Parch > 1, 'Parch1'] = 1
print(traindata.describe())
# Encode Sex as integers: 'male' -> 0, 'female' -> 1. Any other value is left
# untouched. Each replacement is a single .loc assignment so it writes to the
# frame itself instead of a possibly temporary copy (chained indexing).
traindata.loc[traindata.Sex == 'male', 'Sex'] = 0
traindata.loc[traindata.Sex == 'female', 'Sex'] = 1
print(traindata.Sex.describe())
testdata.loc[testdata.Sex == 'male', 'Sex'] = 0
testdata.loc[testdata.Sex == 'female', 'Sex'] = 1
print(testdata.Sex.describe())
# Fill missing fares in the test set with a fixed placeholder of 35.
testdata.loc[testdata.Fare.isnull(), 'Fare'] = 35
print(testdata.Fare.describe())
# Training labels and training feature matrix.
UseFlag = traindata['Survived'].values
UseFeature = traindata[['Pclass','Sex','Age','SibSp','Parch','Fare','Age1','Parch1']].values
from sklearn import preprocessing
# Fit the scaler on the training features only, then apply the same scaling to
# both sets. transform() returns the scaled array rather than modifying its
# argument, so the result must be assigned back; otherwise the scaling is
# computed and thrown away.
scaler = preprocessing.StandardScaler().fit(UseFeature)
UseFeature = scaler.transform(UseFeature)
TestFeature = testdata[['Pclass','Sex','Age','SibSp','Parch','Fare','Age1','Parch1']].values
TestFeature = scaler.transform(TestFeature)
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost classifier with up to 100 boosting rounds.
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(UseFeature,UseFlag)  # train the model
# Predict once on the test features and keep the raw predictions in their own
# column (a second identical predict call was redundant and has been dropped).
temp = clf.predict(TestFeature)
testdata['tadaboost'] = temp
# Survived is the prediction cast to int.
testdata['Survived'] = testdata['tadaboost'].astype(int)
# Keep only the columns needed for the result file and write it out.
outdata = testdata[['PassengerId','Survived']]
outdata.to_csv("test_2018_2_26_adaboost.csv",index=False,header=True)
最终结果:0.74162