注:本案例为黑马的课堂案例,上传仅为方便查看
import pandas as pd
import numpy as np
from sklearn. model_selection import train_test_split, GridSearchCV
from sklearn. feature_extraction import DictVectorizer
from sklearn. ensemble import RandomForestClassifier
# Load the Titanic passenger table from the remote text file into a DataFrame.
data = pd.read_csv(
    "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"
)
# Notebook-style display of the loaded frame.
data
row.names pclass survived name age embarked home.dest room ticket boat sex 0 1 1st 1 Allen, Miss Elisabeth Walton 29.0000 Southampton St Louis, MO B-5 24160 L221 2 female 1 2 1st 0 Allison, Miss Helen Loraine 2.0000 Southampton Montreal, PQ / Chesterville, ON C26 NaN NaN female 2 3 1st 0 Allison, Mr Hudson Joshua Creighton 30.0000 Southampton Montreal, PQ / Chesterville, ON C26 NaN (135) male 3 4 1st 0 Allison, Mrs Hudson J.C. (Bessie Waldo Daniels) 25.0000 Southampton Montreal, PQ / Chesterville, ON C26 NaN NaN female 4 5 1st 1 Allison, Master Hudson Trevor 0.9167 Southampton Montreal, PQ / Chesterville, ON C22 NaN 11 male 5 6 1st 1 Anderson, Mr Harry 47.0000 Southampton New York, NY E-12 NaN 3 male 6 7 1st 1 Andrews, Miss Kornelia Theodosia 63.0000 Southampton Hudson, NY D-7 13502 L77 10 female 7 8 1st 0 Andrews, Mr Thomas, jr 39.0000 Southampton Belfast, NI A-36 NaN NaN male 8 9 1st 1 Appleton, Mrs Edward Dale (Charlotte Lamson) 58.0000 Southampton Bayside, Queens, NY C-101 NaN 2 female 9 10 1st 0 Artagaveytia, Mr Ramon 71.0000 Cherbourg Montevideo, Uruguay NaN NaN (22) male 10 11 1st 0 Astor, Colonel John Jacob 47.0000 Cherbourg New York, NY NaN 17754 L224 10s 6d (124) male 11 12 1st 1 Astor, Mrs John Jacob (Madeleine Talmadge Force) 19.0000 Cherbourg New York, NY NaN 17754 L224 10s 6d 4 female 12 13 1st 1 Aubert, Mrs Leontine Pauline NaN Cherbourg Paris, France B-35 17477 L69 6s 9 female 13 14 1st 1 Barkworth, Mr Algernon H. NaN Southampton Hessle, Yorks A-23 NaN B male 14 15 1st 0 Baumann, Mr John D. 
NaN Southampton New York, NY NaN NaN NaN male 15 16 1st 1 Baxter, Mrs James (Helene DeLaudeniere Chaput) 50.0000 Cherbourg Montreal, PQ B-58/60 NaN 6 female 16 17 1st 0 Baxter, Mr Quigg Edmond 24.0000 Cherbourg Montreal, PQ B-58/60 NaN NaN male 17 18 1st 0 Beattie, Mr Thomson 36.0000 Cherbourg Winnipeg, MN C-6 NaN NaN male 18 19 1st 1 Beckwith, Mr Richard Leonard 37.0000 Southampton New York, NY D-35 NaN 5 male 19 20 1st 1 Beckwith, Mrs Richard Leonard (Sallie Monypeny) 47.0000 Southampton New York, NY D-35 NaN 5 female 20 21 1st 1 Behr, Mr Karl Howell 26.0000 Cherbourg New York, NY C-148 NaN 5 male 21 22 1st 0 Birnbaum, Mr Jakob 25.0000 Cherbourg San Francisco, CA NaN NaN (148) male 22 23 1st 1 Bishop, Mr Dickinson H. 25.0000 Cherbourg Dowagiac, MI B-49 NaN 7 male 23 24 1st 1 Bishop, Mrs Dickinson H. (Helen Walton) 19.0000 Cherbourg Dowagiac, MI B-49 NaN 7 female 24 25 1st 1 Bjornstrm-Steffansson, Mr Mauritz Hakan 28.0000 Southampton Stockholm, Sweden / Washington, DC NaN D male 25 26 1st 0 Blackwell, Mr Stephen Weart 45.0000 Southampton Trenton, NJ NaN NaN (241) male 26 27 1st 1 Blank, Mr Henry 39.0000 Cherbourg Glen Ridge, NJ A-31 NaN 7 male 27 28 1st 1 Bonnell, Miss Caroline 30.0000 Southampton Youngstown, OH C-7 NaN 8 female 28 29 1st 1 Bonnell, Miss Elizabeth 58.0000 Southampton Birkdale, England Cleveland, Ohio C-103 NaN 8 female 29 30 1st 0 Borebank, Mr John James NaN Southampton London / Winnipeg, MB D-21/2 NaN NaN male ... ... ... ... ... ... ... ... ... ... ... ... 
1283 1284 3rd 0 Vestrom, Miss Hulda Amanda Adolfina NaN NaN NaN NaN NaN NaN female 1284 1285 3rd 0 Vonk, Mr Jenko NaN NaN NaN NaN NaN NaN male 1285 1286 3rd 0 Ware, Mr Frederick NaN NaN NaN NaN NaN NaN male 1286 1287 3rd 0 Warren, Mr Charles William NaN NaN NaN NaN NaN NaN male 1287 1288 3rd 0 Wazli, Mr Yousif NaN NaN NaN NaN NaN NaN male 1288 1289 3rd 0 Webber, Mr James NaN NaN NaN NaN NaN NaN male 1289 1290 3rd 1 Wennerstrom, Mr August Edvard NaN NaN NaN NaN NaN NaN male 1290 1291 3rd 0 Wenzel, Mr Linhart NaN NaN NaN NaN NaN NaN male 1291 1292 3rd 0 Widegren, Mr Charles Peter NaN NaN NaN NaN NaN NaN male 1292 1293 3rd 0 Wiklund, Mr Jacob Alfred NaN NaN NaN NaN NaN NaN male 1293 1294 3rd 1 Wilkes, Mrs Ellen NaN NaN NaN NaN NaN NaN female 1294 1295 3rd 0 Willer, Mr Aaron NaN NaN NaN NaN NaN NaN male 1295 1296 3rd 0 Willey, Mr Edward NaN NaN NaN NaN NaN NaN male 1296 1297 3rd 0 Williams, Mr Howard Hugh NaN NaN NaN NaN NaN NaN male 1297 1298 3rd 0 Williams, Mr Leslie NaN NaN NaN NaN NaN NaN male 1298 1299 3rd 0 Windelov, Mr Einar NaN NaN NaN NaN NaN NaN male 1299 1300 3rd 0 Wirz, Mr Albert NaN NaN NaN NaN NaN NaN male 1300 1301 3rd 0 Wiseman, Mr Phillippe NaN NaN NaN NaN NaN NaN male 1301 1302 3rd 0 Wittevrongel, Mr Camiel NaN NaN NaN NaN NaN NaN male 1302 1303 3rd 1 Yalsevac, Mr Ivan NaN NaN NaN NaN NaN NaN male 1303 1304 3rd 0 Yasbeck, Mr Antoni NaN NaN NaN NaN NaN NaN male 1304 1305 3rd 1 Yasbeck, Mrs Antoni NaN NaN NaN NaN NaN NaN female 1305 1306 3rd 0 Youssef, Mr Gerios NaN NaN NaN NaN NaN NaN male 1306 1307 3rd 0 Zabour, Miss Hileni NaN NaN NaN NaN NaN NaN female 1307 1308 3rd 0 Zabour, Miss Tamini NaN NaN NaN NaN NaN NaN female 1308 1309 3rd 0 Zakarian, Mr Artun NaN NaN NaN NaN NaN NaN male 1309 1310 3rd 0 Zakarian, Mr Maprieder NaN NaN NaN NaN NaN NaN male 1310 1311 3rd 0 Zenn, Mr Philip NaN NaN NaN NaN NaN NaN male 1311 1312 3rd 0 Zievens, Rene NaN NaN NaN NaN NaN NaN female 1312 1313 3rd 0 Zimmerman, Leo NaN NaN NaN NaN NaN NaN male
1313 rows × 11 columns
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()
row.names survived age count 1313.000000 1313.000000 633.000000 mean 657.000000 0.341965 31.194181 std 379.174762 0.474549 14.747525 min 1.000000 0.000000 0.166700 25% 329.000000 0.000000 21.000000 50% 657.000000 0.000000 30.000000 75% 985.000000 1.000000 41.000000 max 1313.000000 1.000000 71.000000
# Feature matrix: passenger class, age and sex.
# Take an explicit copy so that later edits to `x` act on an independent
# frame rather than on a slice of `data`; modifying such a slice is what
# makes pandas emit SettingWithCopyWarning.
x = data[["pclass", "age", "sex"]].copy()
x.head()
pclass age sex 0 1st 29.0000 female 1 1st 2.0000 female 2 1st 30.0000 male 3 1st 25.0000 female 4 1st 0.9167 male
# Target vector: the 0/1 survival label of each passenger.
y = data["survived"]
y.head()
0 1
1 0
2 0
3 0
4 1
Name: survived, dtype: int64
# Fill missing ages with the mean age computed over the whole data set.
# Build a new frame instead of calling fillna(inplace=True) on a column
# pulled out of a slice: that chained in-place update raises
# SettingWithCopyWarning and is not guaranteed to modify `x` at all.
x = x.assign(age=x["age"].fillna(data["age"].mean()))
/Users/sherwin/workspaces/ai/lib/python3.6/site-packages/pandas/core/generic.py:3660: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
# Preview the first rows of the feature frame.
x.head()
pclass age sex 0 1st 29.0000 female 1 1st 2.0000 female 2 1st 30.0000 male 3 1st 25.0000 female 4 1st 0.9167 male
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=22,
)
x.head()
pclass age sex 0 1st 29.0000 female 1 1st 2.0000 female 2 1st 30.0000 male 3 1st 25.0000 female 4 1st 0.9167 male
# Turn each split into a list of per-row dicts ({column: value}), the input
# format expected by the dictionary vectorizer used afterwards.
x_train, x_test = (
    part.to_dict(orient="records") for part in (x_train, x_test)
)
x_train
[{'pclass': '3rd', 'age': 45.0, 'sex': 'female'},
… {'pclass': '3rd', 'age': 31.19418104265403, 'sex': 'male'}, {'pclass': '1st', 'age': 36.0, 'sex': 'male'}, …]
# One-hot encode the string-valued features; numeric values pass through.
transfer = DictVectorizer()
# Learn the feature vocabulary from the training rows only.
x_train = transfer.fit_transform(x_train)
# Reuse the already-fitted vocabulary for the test rows. Re-fitting on the
# test set would rebuild the columns from test data, so their number or
# order could differ from what the model is trained on, and it would leak
# test-set information into preprocessing.
x_test = transfer.transform(x_test)
x_train
<1050x6 sparse matrix of type '<class 'numpy.float64'>'
with 3150 stored elements in Compressed Sparse Row format>
# Candidate hyper-parameters: number of trees and maximum tree depth.
param_grid = {
    "n_estimators": [120, 200, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
}
# Exhaustively search the grid with 5-fold cross-validation around a
# default random forest, then fit on the training split.
estimator = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
estimator.fit(x_train, y_train)
GridSearchCV(cv=5, error_score='raise',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False),
fit_params=None, iid=True, n_jobs=1,
param_grid={'n_estimators': [120, 200, 300, 500, 800, 1200], 'max_depth': [5, 8, 15, 25, 30]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring=None, verbose=0)
# Score the search's best model on the held-out test split.
estimator.score(x_test, y_test)
0.7908745247148289
# Show the forest configured with the best hyper-parameters found by the search.
estimator.best_estimator_
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=5, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)