泰坦尼克号预测

import pandas as pd
# Step 1: load the data.
# Read the Titanic passenger table straight from its URL into a DataFrame.
path = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"
titanic = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows.
titanic.head(n=5)
| | row.names | pclass | survived | name | age | embarked | home.dest | room | ticket | boat | sex |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1st | 1 | Allen, Miss Elisabeth Walton | 29.0000 | Southampton | St Louis, MO | B-5 | 24160 L221 | 2 | female |
| 1 | 2 | 1st | 0 | Allison, Miss Helen Loraine | 2.0000 | Southampton | Montreal, PQ / Chesterville, ON | C26 | NaN | NaN | female |
| 2 | 3 | 1st | 0 | Allison, Mr Hudson Joshua Creighton | 30.0000 | Southampton | Montreal, PQ / Chesterville, ON | C26 | NaN | (135) | male |
| 3 | 4 | 1st | 0 | Allison, Mrs Hudson J.C. (Bessie Waldo Daniels) | 25.0000 | Southampton | Montreal, PQ / Chesterville, ON | C26 | NaN | NaN | female |
| 4 | 5 | 1st | 1 | Allison, Master Hudson Trevor | 0.9167 | Southampton | Montreal, PQ / Chesterville, ON | C22 | NaN | 11 | male |
# Select the feature columns and the target column.
# Take an explicit copy so that later clean-up of `x` operates on an
# independent frame rather than on a slice of `titanic`; modifying such a
# slice is what makes pandas emit SettingWithCopyWarning.
x = titanic[["pclass", "age", "sex"]].copy()
y = titanic["survived"]
# Preview the first five feature rows.
x.head()
| | pclass | age | sex |
|---|---|---|---|
| 0 | 1st | 29.0000 | female |
| 1 | 1st | 2.0000 | female |
| 2 | 1st | 30.0000 | male |
| 3 | 1st | 25.0000 | female |
| 4 | 1st | 0.9167 | male |
# Preview the first five target labels.
y.head()
0    1
1    0
2    0
3    0
4    1
Name: survived, dtype: int64
# Step 2: data preparation.
# 1) Missing values: replace missing ages with the mean age.
# Build a new frame and rebind `x` instead of calling an in-place fillna on
# the `x["age"]` column selection. The chained in-place form triggers pandas'
# SettingWithCopyWarning and is not guaranteed to update `x` itself.
# NOTE(review): the mean is computed over all rows, before the train/test
# split, so test rows influence the imputed value.
x = x.fillna({"age": x["age"].mean()})
D:\anaconda3\lib\site-packages\pandas\core\generic.py:6245: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
# 2) Turn the feature frame into a list of dicts, one record per row.
x = x.to_dict(orient="records")
from sklearn.model_selection import train_test_split
# Step 3: split records and labels into training and test sets
# (fixed seed so the split is repeatable).
split_parts = train_test_split(x, y, random_state=22)
x_train, x_test, y_train, y_test = split_parts
# Step 4: dict feature extraction.
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
transfer = DictVectorizer()
# Learn the feature mapping from the training records only, then encode
# both the training and the test records with that same mapping.
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)
# 3) Decision-tree estimator: entropy split criterion, depth capped at 8.
estimator = DecisionTreeClassifier(max_depth=8, criterion="entropy")
estimator.fit(x_train, y_train)

# 4) Model evaluation.
# Compute the predictions and the accuracy first, then report them.
y_predict = estimator.predict(x_test)
score = estimator.score(x_test, y_test)

# Method 1: compare the predictions with the true labels element-wise.
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)

# Method 2: accuracy on the test set.
print("准确率为:\n", score)

# Visualize the fitted decision tree by exporting it in Graphviz DOT format.
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); prefer the new method and fall back to the old
# one on releases that predate it. The names are passed as a plain list.
if hasattr(transfer, "get_feature_names_out"):
    feature_names = list(transfer.get_feature_names_out())
else:
    feature_names = list(transfer.get_feature_names())
export_graphviz(estimator, out_file="titanic_tree.dot", feature_names=feature_names)

y_predict:
 [0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
直接比对真实值和预测值:
 831      True
261      True
1210     True
1155     True
255      True
        ...  
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
准确率为:
 0.7811550151975684

随机森林对泰坦尼克号乘客的生存进行预测

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Random forest tuned with grid search and cross-validation.
# Seed the forest so that repeated runs select the same hyper-parameters and
# report the same scores; 22 matches the seed used for the train/test split.
estimator = RandomForestClassifier(random_state=22)
# Candidate hyper-parameter values: every combination of these is tried.
param_dict = {"n_estimators": [120,200,300,500,800,1200], "max_depth": [5,8,15,25,30]}
# Exhaustive search over the grid with 3-fold cross-validation.
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# 5) Model evaluation.
# Predict and score on the test set once, then print everything in order.
y_predict = estimator.predict(x_test)
score = estimator.score(x_test, y_test)

# (label, value) pairs, printed in this order with the labels exactly as given.
report = [
    # Method 1: predictions and their element-wise comparison with the truth.
    ("y_predict:\n", y_predict),
    ("直接比对真实值和预测值:\n", y_test == y_predict),
    # Method 2: accuracy on the test set.
    ("准确率为:\n", score),
    # Grid-search results: best parameters, best cross-validation score,
    # the best estimator, and the full cross-validation results.
    ("最佳参数:\n", estimator.best_params_),
    ("最佳结果:\n", estimator.best_score_),
    ("最佳估计器:\n", estimator.best_estimator_),
    ("交叉验证结果:\n", estimator.cv_results_),
]
for label, value in report:
    print(label, value)
y_predict:
 [0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
直接比对真实值和预测值:
 831      True
261      True
1210     True
1155     True
255      True
762      True
615      True
507      True
1175     True
301      True
1134     True
177      True
183     False
125     False
1093     True
1304    False
1124     True
798     False
1101     True
1239    False
1153     True
1068    False
846      True
148      True
478      True
642      True
1298     True
540      True
28       True
130      True
        ...  
194      True
663      True
1209     True
117     False
595     False
1151    False
1143     True
1216     True
874      True
246      True
160      True
1208     True
682      True
307      True
67       True
961      True
400      True
923     False
866      True
134      True
613      True
242      True
320     False
829      True
94       True
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
准确率为:
 0.7872340425531915
最佳参数:
 {'max_depth': 5, 'n_estimators': 120}
最佳结果:
 0.8363821138211383
最佳估计器:
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
交叉验证结果:
 {'mean_fit_time': array([0.11182229, 0.19149677, 0.27871044, 0.4505314 , 0.72257209,
       1.21950404, 0.15458934, 0.23542873, 0.37338622, 0.55880507,
       0.90250031, 1.44036126, 0.13625924, 0.24566126, 0.39018901,
       0.57973933, 0.94357061, 1.46748765, 0.15806643, 0.25924444,
       0.3800021 , 0.60227998, 0.98656511, 1.5208021 , 0.15277807,
       0.25416827, 0.37849299, 0.61238893, 1.00995   , 1.51009766]), 'std_fit_time': array([0.00438099, 0.00391445, 0.00445387, 0.00552127, 0.0178945 ,
       0.05956372, 0.00696462, 0.01180214, 0.01545986, 0.02345017,
       0.01762821, 0.07661026, 0.00448709, 0.00753101, 0.01337304,
       0.02401102, 0.02824846, 0.00723971, 0.00559061, 0.00539144,
       0.03176938, 0.00900011, 0.0357836 , 0.02412509, 0.01049831,
       0.00312499, 0.02043117, 0.03736237, 0.03896   , 0.01708367]), 'mean_score_time': array([0.01055225, 0.02124031, 0.02604191, 0.04676072, 0.06393997,
       0.1221021 , 0.01392762, 0.02117666, 0.03027145, 0.04542494,
       0.08080705, 0.11298935, 0.01059707, 0.02046402, 0.02975106,
       0.04587412, 0.07316939, 0.14350526, 0.0142649 , 0.02011824,
       0.02920715, 0.0444289 , 0.07418664, 0.11165055, 0.01248868,
       0.02353628, 0.03232622, 0.04952399, 0.08569598, 0.11799375]), 'std_score_time': array([1.09863734e-03, 2.29822618e-03, 3.20843508e-03, 4.00866766e-03,
       1.42997845e-03, 1.48818168e-02, 2.37098736e-03, 8.80449078e-04,
       1.62827120e-03, 1.83137647e-03, 9.86835991e-03, 9.71738484e-03,
       5.51943914e-04, 1.00782641e-03, 2.11610207e-03, 1.98464255e-03,
       3.04582952e-03, 9.81828652e-03, 1.69302449e-03, 1.37694072e-03,
       1.67724778e-03, 7.58986198e-05, 3.23449160e-03, 3.78348887e-03,
       1.02684570e-03, 5.07326308e-03, 4.72586897e-03, 2.47344396e-03,
       1.11438683e-02, 4.31988881e-03]), 'param_max_depth': masked_array(data=[5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 15, 15, 15, 15, 15,
                   15, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[120, 200, 300, 500, 800, 1200, 120, 200, 300, 500, 800,
                   1200, 120, 200, 300, 500, 800, 1200, 120, 200, 300,
                   500, 800, 1200, 120, 200, 300, 500, 800, 1200],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 5, 'n_estimators': 120}, {'max_depth': 5, 'n_estimators': 200}, {'max_depth': 5, 'n_estimators': 300}, {'max_depth': 5, 'n_estimators': 500}, {'max_depth': 5, 'n_estimators': 800}, {'max_depth': 5, 'n_estimators': 1200}, {'max_depth': 8, 'n_estimators': 120}, {'max_depth': 8, 'n_estimators': 200}, {'max_depth': 8, 'n_estimators': 300}, {'max_depth': 8, 'n_estimators': 500}, {'max_depth': 8, 'n_estimators': 800}, {'max_depth': 8, 'n_estimators': 1200}, {'max_depth': 15, 'n_estimators': 120}, {'max_depth': 15, 'n_estimators': 200}, {'max_depth': 15, 'n_estimators': 300}, {'max_depth': 15, 'n_estimators': 500}, {'max_depth': 15, 'n_estimators': 800}, {'max_depth': 15, 'n_estimators': 1200}, {'max_depth': 25, 'n_estimators': 120}, {'max_depth': 25, 'n_estimators': 200}, {'max_depth': 25, 'n_estimators': 300}, {'max_depth': 25, 'n_estimators': 500}, {'max_depth': 25, 'n_estimators': 800}, {'max_depth': 25, 'n_estimators': 1200}, {'max_depth': 30, 'n_estimators': 120}, {'max_depth': 30, 'n_estimators': 200}, {'max_depth': 30, 'n_estimators': 300}, {'max_depth': 30, 'n_estimators': 500}, {'max_depth': 30, 'n_estimators': 800}, {'max_depth': 30, 'n_estimators': 1200}], 'split0_test_score': array([0.82674772, 0.82066869, 0.82674772, 0.82674772, 0.82674772,
       0.82674772, 0.80547112, 0.80851064, 0.80243161, 0.80243161,
       0.80243161, 0.81155015, 0.79027356, 0.79635258, 0.79635258,
       0.79331307, 0.79027356, 0.79027356, 0.79635258, 0.79027356,
       0.79635258, 0.79331307, 0.79027356, 0.79331307, 0.79027356,
       0.79635258, 0.79635258, 0.79331307, 0.7993921 , 0.79331307]), 'split1_test_score': array([0.85670732, 0.85670732, 0.85365854, 0.85365854, 0.85365854,
       0.85365854, 0.85060976, 0.8597561 , 0.84756098, 0.85670732,
       0.85670732, 0.85670732, 0.85365854, 0.85365854, 0.85365854,
       0.8597561 , 0.85060976, 0.85670732, 0.85670732, 0.85365854,
       0.85670732, 0.85365854, 0.85670732, 0.85060976, 0.85670732,
       0.8597561 , 0.84756098, 0.85670732, 0.85060976, 0.85060976]), 'split2_test_score': array([0.82568807, 0.82262997, 0.82262997, 0.82568807, 0.82262997,
       0.82262997, 0.80122324, 0.79510703, 0.80122324, 0.80428135,
       0.80122324, 0.80122324, 0.80428135, 0.80122324, 0.80122324,
       0.80122324, 0.80428135, 0.80428135, 0.80428135, 0.80733945,
       0.80122324, 0.80122324, 0.80122324, 0.80122324, 0.79816514,
       0.80428135, 0.80122324, 0.80428135, 0.80122324, 0.80122324]), 'mean_test_score': array([0.83638211, 0.83333333, 0.83434959, 0.83536585, 0.83434959,
       0.83434959, 0.81910569, 0.82113821, 0.81707317, 0.82113821,
       0.82012195, 0.82317073, 0.81605691, 0.81707317, 0.81707317,
       0.81808943, 0.81504065, 0.81707317, 0.81910569, 0.81707317,
       0.81808943, 0.81605691, 0.81605691, 0.81504065, 0.81504065,
       0.82012195, 0.81504065, 0.81808943, 0.81707317, 0.81504065]), 'std_test_score': array([0.0143786 , 0.01654729, 0.01375658, 0.01294211, 0.01375658,
       0.01375658, 0.02234414, 0.02784983, 0.02156378, 0.02516249,
       0.02587446, 0.02408579, 0.02719639, 0.02594607, 0.02594607,
       0.02963923, 0.02579309, 0.02860307, 0.02678467, 0.02679151,
       0.02737927, 0.02678375, 0.02908969, 0.02535762, 0.0296384 ,
       0.02821188, 0.02308115, 0.02767166, 0.02372573, 0.02535762]), 'rank_test_score': array([ 1,  6,  3,  2,  3,  3, 12,  8, 17,  8, 10,  7, 23, 17, 17, 14, 26,
       17, 12, 17, 14, 23, 23, 26, 26, 10, 26, 14, 17, 26], dtype=int32), 'split0_train_score': array([0.85801527, 0.85954198, 0.85648855, 0.85801527, 0.85648855,
       0.85648855, 0.87633588, 0.87633588, 0.87633588, 0.87633588,
       0.87633588, 0.87480916, 0.88244275, 0.88244275, 0.88244275,
       0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275,
       0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275,
       0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275]), 'split1_train_score': array([0.84603659, 0.84756098, 0.84756098, 0.84756098, 0.84756098,
       0.84756098, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049]), 'split2_train_score': array([0.87214612, 0.87214612, 0.86757991, 0.87214612, 0.86757991,
       0.86757991, 0.88736682, 0.88736682, 0.88584475, 0.88584475,
       0.88584475, 0.88584475, 0.88736682, 0.88736682, 0.88736682,
       0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682,
       0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682,
       0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682]), 'mean_train_score': array([0.85873266, 0.85974969, 0.85720981, 0.85924079, 0.85720981,
       0.85720981, 0.87499439, 0.87499439, 0.87448704, 0.87448704,
       0.87448704, 0.87397813, 0.87703002, 0.87703002, 0.87703002,
       0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002,
       0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002,
       0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002]), 'std_train_score': array([0.01067124, 0.01003792, 0.00818859, 0.01007418, 0.00818859,
       0.00818859, 0.01069186, 0.01069186, 0.01011317, 0.01011317,
       0.01011317, 0.01004552, 0.01131658, 0.01131658, 0.01131658,
       0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658,
       0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658,
       0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658])}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值