
import pandas as pd
# Step 1: load the Titanic passenger table from the remote CSV file.
path = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"
titanic = pd.read_csv(filepath_or_buffer=path)
   row.names  pclass  survived                                             name      age     embarked                        home.dest  room      ticket   boat     sex
0          1     1st         1                     Allen, Miss Elisabeth Walton  29.0000  Southampton                     St Louis, MO   B-5  24160 L221      2  female
1          2     1st         0                      Allison, Miss Helen Loraine   2.0000  Southampton  Montreal, PQ / Chesterville, ON   C26         NaN    NaN  female
2          3     1st         0              Allison, Mr Hudson Joshua Creighton  30.0000  Southampton  Montreal, PQ / Chesterville, ON   C26         NaN  (135)    male
3          4     1st         0  Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton  Montreal, PQ / Chesterville, ON   C26         NaN    NaN  female
4          5     1st         1                    Allison, Master Hudson Trevor   0.9167  Southampton  Montreal, PQ / Chesterville, ON   C22         NaN     11    male
# Select the feature columns and the target column.
# .copy() gives x its own data instead of a slice tied to `titanic`, so later
# in-place edits to x are not flagged as writes to a slice of another frame.
x = titanic[["pclass", "age", "sex"]].copy()
y = titanic["survived"]
0    1
1    0
2    0
3    0
4    1
Name: survived, dtype: int64
# Step 2: data preparation
# 2.1) Fill missing ages with the mean age.
# Calling fillna(inplace=True) on x["age"] is a chained in-place write: pandas
# flags it with SettingWithCopyWarning and, with copy-on-write enabled, the
# change never reaches x. Building a new frame with assign() always updates x.
x = x.assign(age=x["age"].fillna(x["age"].mean()))
D:\anaconda3\lib\site-packages\pandas\core\generic.py:6245: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
from sklearn.model_selection import train_test_split

# 2.2) Turn every row into a {column: value} dict (one dict per passenger).
x = x.to_dict("records")

# Step 3: split into training and test sets; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Step 4: dictionary feature extraction. The vectorizer is fitted on the
# training records only and then applied, unchanged, to the test records.
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# Decision-tree estimator: entropy split criterion, depth capped at 8.
estimator = DecisionTreeClassifier(max_depth=8, criterion="entropy").fit(x_train, y_train)

# Model evaluation
# Approach 1: compare the predictions with the true labels element by element.
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
matches = y_test == y_predict
print("直接比对真实值和预测值:\n", matches)

# Approach 2: accuracy on the held-out test set.
score = estimator.score(x_test, y_test)
print("准确率为:\n", score)

# Visualise the fitted decision tree by writing it out as a Graphviz .dot file.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); use the new accessor
# when it exists and fall back to the old one on older installations.
if hasattr(transfer, "get_feature_names_out"):
    # Convert the returned array to a plain list of column labels.
    feature_names = list(transfer.get_feature_names_out())
else:
    feature_names = transfer.get_feature_names()
export_graphviz(estimator, out_file="titanic_tree.dot", feature_names=feature_names)

 [0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
 831      True
261      True
1210     True
1155     True
255      True
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the search: forest size x maximum tree depth.
param_dict = {
    "n_estimators": [120, 200, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
}

# Random forest wrapped in a grid search with 3-fold cross-validation.
estimator = GridSearchCV(RandomForestClassifier(), param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

# Step 5: model evaluation
# Approach 1: compare the predictions with the true labels element by element.
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
matches = y_test == y_predict
print("直接比对真实值和预测值:\n", matches)

# Approach 2: accuracy on the held-out test set.
score = estimator.score(x_test, y_test)
print("准确率为:\n", score)

# Report what the grid search found, in this order: best parameters
# (best_params_), best cross-validated score (best_score_), the best
# estimator (best_estimator_) and the full cross-validation table (cv_results_).
for _label, _attr in (
    ("最佳参数:\n", "best_params_"),
    ("最佳结果:\n", "best_score_"),
    ("最佳估计器:\n", "best_estimator_"),
    ("交叉验证结果:\n", "cv_results_"),
):
    print(_label, getattr(estimator, _attr))
 [0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0
 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1]
 831      True
261      True
1210     True
1155     True
255      True
762      True
615      True
507      True
1175     True
301      True
1134     True
177      True
183     False
125     False
1093     True
1304    False
1124     True
798     False
1101     True
1239    False
1153     True
1068    False
846      True
148      True
478      True
642      True
1298     True
540      True
28       True
130      True
194      True
663      True
1209     True
117     False
595     False
1151    False
1143     True
1216     True
874      True
246      True
160      True
1208     True
682      True
307      True
67       True
961      True
400      True
923     False
866      True
134      True
613      True
242      True
320     False
829      True
94       True
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
 {'max_depth': 5, 'n_estimators': 120}
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
 {'mean_fit_time': array([0.11182229, 0.19149677, 0.27871044, 0.4505314 , 0.72257209,
       1.21950404, 0.15458934, 0.23542873, 0.37338622, 0.55880507,
       0.90250031, 1.44036126, 0.13625924, 0.24566126, 0.39018901,
       0.57973933, 0.94357061, 1.46748765, 0.15806643, 0.25924444,
       0.3800021 , 0.60227998, 0.98656511, 1.5208021 , 0.15277807,
       0.25416827, 0.37849299, 0.61238893, 1.00995   , 1.51009766]), 'std_fit_time': array([0.00438099, 0.00391445, 0.00445387, 0.00552127, 0.0178945 ,
       0.05956372, 0.00696462, 0.01180214, 0.01545986, 0.02345017,
       0.01762821, 0.07661026, 0.00448709, 0.00753101, 0.01337304,
       0.02401102, 0.02824846, 0.00723971, 0.00559061, 0.00539144,
       0.03176938, 0.00900011, 0.0357836 , 0.02412509, 0.01049831,
       0.00312499, 0.02043117, 0.03736237, 0.03896   , 0.01708367]), 'mean_score_time': array([0.01055225, 0.02124031, 0.02604191, 0.04676072, 0.06393997,
       0.1221021 , 0.01392762, 0.02117666, 0.03027145, 0.04542494,
       0.08080705, 0.11298935, 0.01059707, 0.02046402, 0.02975106,
       0.04587412, 0.07316939, 0.14350526, 0.0142649 , 0.02011824,
       0.02920715, 0.0444289 , 0.07418664, 0.11165055, 0.01248868,
       0.02353628, 0.03232622, 0.04952399, 0.08569598, 0.11799375]), 'std_score_time': array([1.09863734e-03, 2.29822618e-03, 3.20843508e-03, 4.00866766e-03,
       1.42997845e-03, 1.48818168e-02, 2.37098736e-03, 8.80449078e-04,
       1.62827120e-03, 1.83137647e-03, 9.86835991e-03, 9.71738484e-03,
       5.51943914e-04, 1.00782641e-03, 2.11610207e-03, 1.98464255e-03,
       3.04582952e-03, 9.81828652e-03, 1.69302449e-03, 1.37694072e-03,
       1.67724778e-03, 7.58986198e-05, 3.23449160e-03, 3.78348887e-03,
       1.02684570e-03, 5.07326308e-03, 4.72586897e-03, 2.47344396e-03,
       1.11438683e-02, 4.31988881e-03]), 'param_max_depth': masked_array(data=[5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 15, 15, 15, 15, 15,
                   15, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False],
            dtype=object), 'param_n_estimators': masked_array(data=[120, 200, 300, 500, 800, 1200, 120, 200, 300, 500, 800,
                   1200, 120, 200, 300, 500, 800, 1200, 120, 200, 300,
                   500, 800, 1200, 120, 200, 300, 500, 800, 1200],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False],
            dtype=object), 'params': [{'max_depth': 5, 'n_estimators': 120}, {'max_depth': 5, 'n_estimators': 200}, {'max_depth': 5, 'n_estimators': 300}, {'max_depth': 5, 'n_estimators': 500}, {'max_depth': 5, 'n_estimators': 800}, {'max_depth': 5, 'n_estimators': 1200}, {'max_depth': 8, 'n_estimators': 120}, {'max_depth': 8, 'n_estimators': 200}, {'max_depth': 8, 'n_estimators': 300}, {'max_depth': 8, 'n_estimators': 500}, {'max_depth': 8, 'n_estimators': 800}, {'max_depth': 8, 'n_estimators': 1200}, {'max_depth': 15, 'n_estimators': 120}, {'max_depth': 15, 'n_estimators': 200}, {'max_depth': 15, 'n_estimators': 300}, {'max_depth': 15, 'n_estimators': 500}, {'max_depth': 15, 'n_estimators': 800}, {'max_depth': 15, 'n_estimators': 1200}, {'max_depth': 25, 'n_estimators': 120}, {'max_depth': 25, 'n_estimators': 200}, {'max_depth': 25, 'n_estimators': 300}, {'max_depth': 25, 'n_estimators': 500}, {'max_depth': 25, 'n_estimators': 800}, {'max_depth': 25, 'n_estimators': 1200}, {'max_depth': 30, 'n_estimators': 120}, {'max_depth': 30, 'n_estimators': 200}, {'max_depth': 30, 'n_estimators': 300}, {'max_depth': 30, 'n_estimators': 500}, {'max_depth': 30, 'n_estimators': 800}, {'max_depth': 30, 'n_estimators': 1200}], 'split0_test_score': array([0.82674772, 0.82066869, 0.82674772, 0.82674772, 0.82674772,
       0.82674772, 0.80547112, 0.80851064, 0.80243161, 0.80243161,
       0.80243161, 0.81155015, 0.79027356, 0.79635258, 0.79635258,
       0.79331307, 0.79027356, 0.79027356, 0.79635258, 0.79027356,
       0.79635258, 0.79331307, 0.79027356, 0.79331307, 0.79027356,
       0.79635258, 0.79635258, 0.79331307, 0.7993921 , 0.79331307]), 'split1_test_score': array([0.85670732, 0.85670732, 0.85365854, 0.85365854, 0.85365854,
       0.85365854, 0.85060976, 0.8597561 , 0.84756098, 0.85670732,
       0.85670732, 0.85670732, 0.85365854, 0.85365854, 0.85365854,
       0.8597561 , 0.85060976, 0.85670732, 0.85670732, 0.85365854,
       0.85670732, 0.85365854, 0.85670732, 0.85060976, 0.85670732,
       0.8597561 , 0.84756098, 0.85670732, 0.85060976, 0.85060976]), 'split2_test_score': array([0.82568807, 0.82262997, 0.82262997, 0.82568807, 0.82262997,
       0.82262997, 0.80122324, 0.79510703, 0.80122324, 0.80428135,
       0.80122324, 0.80122324, 0.80428135, 0.80122324, 0.80122324,
       0.80122324, 0.80428135, 0.80428135, 0.80428135, 0.80733945,
       0.80122324, 0.80122324, 0.80122324, 0.80122324, 0.79816514,
       0.80428135, 0.80122324, 0.80428135, 0.80122324, 0.80122324]), 'mean_test_score': array([0.83638211, 0.83333333, 0.83434959, 0.83536585, 0.83434959,
       0.83434959, 0.81910569, 0.82113821, 0.81707317, 0.82113821,
       0.82012195, 0.82317073, 0.81605691, 0.81707317, 0.81707317,
       0.81808943, 0.81504065, 0.81707317, 0.81910569, 0.81707317,
       0.81808943, 0.81605691, 0.81605691, 0.81504065, 0.81504065,
       0.82012195, 0.81504065, 0.81808943, 0.81707317, 0.81504065]), 'std_test_score': array([0.0143786 , 0.01654729, 0.01375658, 0.01294211, 0.01375658,
       0.01375658, 0.02234414, 0.02784983, 0.02156378, 0.02516249,
       0.02587446, 0.02408579, 0.02719639, 0.02594607, 0.02594607,
       0.02963923, 0.02579309, 0.02860307, 0.02678467, 0.02679151,
       0.02737927, 0.02678375, 0.02908969, 0.02535762, 0.0296384 ,
       0.02821188, 0.02308115, 0.02767166, 0.02372573, 0.02535762]), 'rank_test_score': array([ 1,  6,  3,  2,  3,  3, 12,  8, 17,  8, 10,  7, 23, 17, 17, 14, 26,
       17, 12, 17, 14, 23, 23, 26, 26, 10, 26, 14, 17, 26], dtype=int32), 'split0_train_score': array([0.85801527, 0.85954198, 0.85648855, 0.85801527, 0.85648855,
       0.85648855, 0.87633588, 0.87633588, 0.87633588, 0.87633588,
       0.87633588, 0.87480916, 0.88244275, 0.88244275, 0.88244275,
       0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275,
       0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275,
       0.88244275, 0.88244275, 0.88244275, 0.88244275, 0.88244275]), 'split1_train_score': array([0.84603659, 0.84756098, 0.84756098, 0.84756098, 0.84756098,
       0.84756098, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049,
       0.86128049, 0.86128049, 0.86128049, 0.86128049, 0.86128049]), 'split2_train_score': array([0.87214612, 0.87214612, 0.86757991, 0.87214612, 0.86757991,
       0.86757991, 0.88736682, 0.88736682, 0.88584475, 0.88584475,
       0.88584475, 0.88584475, 0.88736682, 0.88736682, 0.88736682,
       0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682,
       0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682,
       0.88736682, 0.88736682, 0.88736682, 0.88736682, 0.88736682]), 'mean_train_score': array([0.85873266, 0.85974969, 0.85720981, 0.85924079, 0.85720981,
       0.85720981, 0.87499439, 0.87499439, 0.87448704, 0.87448704,
       0.87448704, 0.87397813, 0.87703002, 0.87703002, 0.87703002,
       0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002,
       0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002,
       0.87703002, 0.87703002, 0.87703002, 0.87703002, 0.87703002]), 'std_train_score': array([0.01067124, 0.01003792, 0.00818859, 0.01007418, 0.00818859,
       0.00818859, 0.01069186, 0.01069186, 0.01011317, 0.01011317,
       0.01011317, 0.01004552, 0.01131658, 0.01131658, 0.01131658,
       0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658,
       0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658,
       0.01131658, 0.01131658, 0.01131658, 0.01131658, 0.01131658])}





当前余额3.43前往充值 >
领取后你会自动成为博主和红包主的粉丝 规则
钱包余额 0


