import pandas as pd
# Location of the Titanic passenger table; read_csv fetches it over HTTP.
path = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"
titanic = pd.read_csv(filepath_or_buffer=path)
# Bare expression: an interactive session / notebook displays the first five rows.
titanic.head(n=5)
row.names pclass survived name age embarked home.dest room ticket boat sex 0 1 1st 1 Allen, Miss Elisabeth Walton 29.0000 Southampton St Louis, MO B-5 24160 L221 2 female 1 2 1st 0 Allison, Miss Helen Loraine 2.0000 Southampton Montreal, PQ / Chesterville, ON C26 NaN NaN female 2 3 1st 0 Allison, Mr Hudson Joshua Creighton 30.0000 Southampton Montreal, PQ / Chesterville, ON C26 NaN (135) male 3 4 1st 0 Allison, Mrs Hudson J.C. (Bessie Waldo Daniels) 25.0000 Southampton Montreal, PQ / Chesterville, ON C26 NaN NaN female 4 5 1st 1 Allison, Master Hudson Trevor 0.9167 Southampton Montreal, PQ / Chesterville, ON C22 NaN 11 male
# Feature columns used by the models below, and the label column to predict.
feature_columns = ["pclass", "age", "sex"]
x = titanic[feature_columns]
y = titanic.loc[:, "survived"]
# Bare expression: shows the first five feature rows in an interactive session.
x.head(n=5)
pclass age sex 0 1st 29.0000 female 1 1st 2.0000 female 2 1st 30.0000 male 3 1st 25.0000 female 4 1st 0.9167 male
# Bare expression: shows the first five labels in an interactive session.
y.head(n=5)
0 1
1 0
2 0
3 0
4 1
Name: survived, dtype: int64
# Impute missing ages with the column mean.
# The previous form, ``x["age"].fillna(..., inplace=True)``, operated on a
# temporary Series taken from a sliced DataFrame: pandas emits
# SettingWithCopyWarning for it, and with Copy-on-Write enabled the change
# never reaches ``x``.  Rebinding ``x`` to the frame returned by ``assign``
# makes the update explicit and keeps the column order unchanged.
x = x.assign(age=x["age"].fillna(x["age"].mean()))
D:\anaconda3\lib\site-packages\pandas\core\generic.py:6245: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._update_inplace(new_data)
# Convert the feature frame into a list of per-row {column: value} dicts,
# which is what the DictVectorizer used later in this script consumes.
x = x.to_dict(orient="records")
from sklearn.model_selection import train_test_split

# Fixed random_state so the same rows land in train/test on every run.
split = train_test_split(x, y, random_state=22)
x_train, x_test, y_train, y_test = split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Vectorize the feature dicts: the vocabulary is learned from the training
# rows only and then reused unchanged on the test rows.
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# Entropy-based decision tree, limited to depth 8.
estimator = DecisionTreeClassifier(criterion="entropy", max_depth=8)
estimator.fit(x_train, y_train)

# Evaluate on the held-out rows: raw predictions, element-wise comparison
# against the true labels, and the accuracy returned by score().
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("准确率为:\n", score)

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to
# the old one so the script keeps working on older installations.
if hasattr(transfer, "get_feature_names_out"):
    # Materialize as a plain list, which every export_graphviz version accepts.
    feature_names = list(transfer.get_feature_names_out())
else:
    feature_names = transfer.get_feature_names()

# Write the fitted tree in Graphviz DOT format for visualization.
export_graphviz(estimator, out_file="titanic_tree.dot", feature_names=feature_names)
y_predict:
[0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
直接比对真实值和预测值:
831 True
261 True
1210 True
1155 True
255 True
...
1146 True
1125 False
386 True
1025 False
337 True
Name: survived, Length: 329, dtype: bool
准确率为:
0.7811550151975684
随机森林对泰坦尼克号乘客的生存进行预测
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the search: 6 forest sizes x 5 depth limits.
param_dict = {
    "n_estimators": [120, 200, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
}

# Grid search with 3-fold cross-validation wrapped around a default random forest.
estimator = GridSearchCV(RandomForestClassifier(), param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# Evaluate on the held-out rows.
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)
score = estimator.score(x_test, y_test)

# Report test accuracy followed by the grid-search summary attributes.
report = (
    ("准确率为:\n", score),
    ("最佳参数:\n", estimator.best_params_),
    ("最佳结果:\n", estimator.best_score_),
    ("最佳估计器:\n", estimator.best_estimator_),
    ("交叉验证结果:\n", estimator.cv_results_),
)
for heading, value in report:
    print(heading, value)
y_predict:
[0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
直接比对真实值和预测值:
831 True
261 True
1210 True
1155 True
255 True
762 True
615 True
507 True
1175 True
301 True
1134 True
177 True
183 False
125 False
1093 True
1304 False
1124 True
798 False
1101 True
1239 False
1153 True
1068 False
846 True
148 True
478 True
642 True
1298 True
540 True
28 True
130 True
...
194 True
663 True
1209 True
117 False
595 False
1151 False
1143 True
1216 True
874 True
246 True
160 True
1208 True
682 True
307 True
67 True
961 True
400 True
923 False
866 True
134 True
613 True
242 True
320 False
829 True
94 True
1146 True
1125 False
386 True
1025 False
337 True
Name: survived, Length: 329, dtype: bool
准确率为:
0.7872340425531915
最佳参数:
{'max_depth': 5, 'n_estimators': 120}
最佳结果:
0.8363821138211383
最佳估计器:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=5, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=None,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
交叉验证结果:
{'mean_fit_time': array([0.11182229, 0.19149677, 0.27871044, 0.4505314 , 0.72257209,
1.21950404, 0.15458934, 0.23542873, 0.37338622, 0.55880507,
0.90250031, 1.44036126, 0.13625924, 0.24566126, 0.39018901,
0.57973933, 0.94357061, 1.46748765, 0.15806643, 0.25924444,
0.3800021 , 0.60227998, 0.98656511, 1.5208021 , 0.15277807,
0.25416827, 0.37849299, 0.61238893, 1.00995 , 1.51009766]), 'std_fit_time': array([0.00438099, 0.00391445, 0.00445387, 0.00552127, 0.0178945 ,
0.05956372, 0.00696462, 0.01180214, 0.01545986, 0.02345017,
0.01762821, 0.07661026, 0.00448709, 0.00753101, 0.01337304,
0.02401102, 0.02824846, 0.00723971, 0.00559061, 0.00539144,
0.03176938, 0.00900011, 0.0357836 , 0.02412509, 0.01049831,
0.00312499, 0.02043117, 0.03736237, 0.03896 , 0.01708367]), 'mean_score_time': array([0.01055225, 0.02124031, 0.02604191, 0.04676072, 0.06393997,
0.1221021 , 0.01392762, 0.02117666, 0.03027145, 0.04542494,
0.08080705, 0.11298935, 0.01059707, 0.02046402, 0.02975106,
0.04587412, 0.07316939, 0.14350526, 0.0142649 , 0.02011824,
0.02920715, 0.0444289 , 0.07418664, 0.11165055, 0.01248868,
0.02353628, 0.03232622, 0.04952399, 0.08569598, 0.11799375]), 'std_score_time': array([1.09863734e-03, 2.29822618e-03, 3.20843508e-03, 4.00866766e-03,
1.42997845e-03, 1.48818168e-02, 2.37098736e-03, 8.80449078e-04,
1.62827120e-03, 1.83137647e-03, 9.86835991e-03, 9.71738484e-03,
5.51943914e-04, 1.00782641e-03, 2.11610207e-03, 1.98464255e-03,
3.04582952e-03, 9.81828652e-03, 1.69302449e-03, 1.37694072e-03,
1.67724778e-03, 7.58986198e-05, 3.23449160e-03, 3.78348887e-03,
1.02684570e-03, 5.07326308e-03, 4.72586897e-03, 2.47344396e-03,
1.11438683e-02, 4.31988881e-03]), 'param_max_depth': masked_array(data=[5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 15, 15, 15, 15, 15,
15, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object), 'param_n_estimators': masked_array(data=[120, 200, 300, 500, 800, 1200, 120, 200, 300, 500, 800,
1200, 120, 200, 300, 500, 800, 1200, 120, 200, 300,
500, 800, 1200, 120, 200, 300, 500, 800, 1200],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False],
fill_value='?',
dtype=object), 'params': [{'max_depth': 5, 'n_estimators': 120}, {'max_depth': 5, 'n_estimators': 200}, {'max_depth': 5, 'n_estimators': 300}, {'max_depth': 5, 'n_estimators': 500}, {'max_depth': 5, 'n_estimators': 800}, {'max_depth': 5, 'n_estimators': 1200}, {'max_depth': 8, 'n_estimators': 120}, {'max_depth': 8, 'n_estimators': 200}, {'max_depth': 8, 'n_estimators': 300}, {'max_depth': 8, 'n_estimators': 500}, {'max_depth': 8, 'n_estimators': 800}, {'max_depth': 8, 'n_estimators': 1200}, {'max_depth': 15, 'n_estimators': 120}, {'max_depth': 15, 'n_estimators': 200}, {'max_depth': 15, 'n_estimators': 300}, {'max_depth': 15, 'n_estimators': 500}, {'max_depth': 15, 'n_estimators': 800}, {'max_depth': 15, 'n_estimators': 1200}, {'max_depth': 25, 'n_estimators': 120}, {'max_depth': 25, 'n_estimators': 200}, {'max_depth': 25, 'n_estimators': 300}, {'max_depth': 25, 'n_estimators': 500}, {'max_depth': 25, 'n_estimators': 800}, {'max_depth': 25, 'n_estimators': 1200}, {'max_depth': 30, 'n_estimators': 120}, {'max_depth': 30, 'n_estimators': 200}, {'max_depth': 30, 'n_estimators': 300}, {'max_depth': 30, 'n_estimators': 500}, {'max_depth': 30, 'n_estimators': 800}, {'max_depth': 30, 'n_estimators': 1200}], 'split0_test_score': array([0.82674772, 0.82066869, 0.82674772, 0.82674772, 0.82674772,
0.82674772, 0.80547112, 0.80851064, 0.80243161, 0.80243161,
0.80243161, 0.81155015, 0.79027356, 0.79635258, 0.79635258,
0.79331307, 0.79027356, 0.79027356, 0.79635258, 0.79027356,
0.79635258, 0.79331307, 0.79027356, 0.79331307, 0.79027356,
0.79635258, 0.79635258, 0.79331307, 0.7993921 , 0.79331307]), 'split1_test_score': array([0.85670732, 0.85670732, 0.85365854, 0.85365854, 0.85365854,
0.85365854, 0.85060976, 0.8597561 , 0.84756098, 0.85670732,
0.85670732, 0.85670732, 0.85365854, 0.85365854, 0.85365854,
0.8597561 , 0.85060976, 0.85670732, 0.85670732, 0.85365854,
0.85670732, 0.85365854, 0.85670732, 0.85060976, 0.85670732,
0.8597561 , 0.84756098, 0.85670732, 0.85060976, 0.85060976]), 'split2_test_score': array([0.82568807, 0.82262997, 0.82262997, 0.82568807, 0.82262997,
0.82262997, 0.80122324, 0.79510703, 0.80122324, 0.80428135,
0.80122324, 0.80122324, 0.80428135, 0.80122324, 0.80122324,
0.80122324, 0.80428135, 0.80428135, 0.80428135, 0.80733945,
0.80122324, 0.80122324, 0.80122324, 0.80122324, 0.79816514,
0.80428135, 0.80122324, 0.80428135, 0.80122324, 0.80122324]), 'mean_test_score': array([0.83638211, 0.83333333, 0.83434959, 0.83536585, 0.83434959,
0.83434959, 0.81910569, 0.82113821, 0.81707317, 0.82113821,
0.82012195, 0.82317073, 0.81605691, 0.81707317, 0.81707317,
0.81808943, 0.81504065, 0.81707317, 0.81910569, 0.81707317,
0.81808943, 0.81605691, 0.81605691, 0.81504065, 0.81504065,
0.82012195, 0.81504065, 0.81808943, 0.81707317, 0.81504065]), 'std_test_score': array([0.0143786 , 0.01654729, 0.01375658, 0.01294211, 0.01375658,
0.01375658, 0.02234414, 0.02784983, 0.02156378, 0.02516249,
0.02587446, 0.02408579, 0.02719639, 0.02594607, 0.02594607,
0.02963923, 0.02579309, 0.02860307, 0.02678467, 0.02679151,
0.02737927, 0.02678375, 0.02908969, 0.02535762, 0.0296384 ,
0.02821188, 0.02308115, 0.02767166, 0.02372573, 0.02535762]), 'rank_test_score': array([ 1, 6, 3, 2, 3, 3, 12, 8, 17, 8, 10, 7, 23, 17, 17, 14, 26,
17, 12, 17, 14, 23, 23, 26, 26, 10, 26, 14, 17, 26], dtype=int32), 'split0_train_score': array([0.85801527, 0.85954198, 0.85648855, 0.85801527, 0.85648855,
0.85648855, 0.87633588, 0.87633588, 0.87633588, 0.87633588,
0.87633588, 0.87480916, 0.88244275, 0.88244275, 0.88244275,
0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275,
0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275,
0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275]), 'split1_train_score': array([0.84603659, 0.84756098, 0.84756098, 0.84756098, 0.84756098,
0.84756098, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049]), 'split2_train_score': array([0.87214612, 0.87214612, 0.86757991, 0.87214612, 0.86757991,
0.86757991, 0.88736682, 0.88736682, 0.88584475, 0.88584475,
0.88584475, 0.88584475, 0.88736682, 0.88736682, 0.88736682,
0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682,
0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682,
0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682]), 'mean_train_score': array([0.85873266, 0.85974969, 0.85720981, 0.85924079, 0.85720981,
0.85720981, 0.87499439, 0.87499439, 0.87448704, 0.87448704,
0.87448704, 0.87397813, 0.87703002, 0.87703002, 0.87703002,
0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002,
0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002,
0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002]), 'std_train_score': array([0.01067124, 0.01003792, 0.00818859, 0.01007418, 0.00818859,
0.00818859, 0.01069186, 0.01069186, 0.01011317, 0.01011317,
0.01011317, 0.01004552, 0.01131658, 0.01131658, 0.01131658,
0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658,
0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658,
0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658])}