包含全部示例的代码仓库见GitHub
1 导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
2 数据准备
# Load the training CSV into a DataFrame.
data = pd.read_csv('./dataset/tt/train.csv')
# Show the column labels to see which fields are available.
data.columns
# output
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
# Keep the label column plus the features used for modelling. The explicit
# .copy() makes `data` an independent frame, so assigning new columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
             'Embarked']].copy()
# Preview the first rows of the reduced frame.
data.head()
# output
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 22.0 1 0 7.2500 S
1 1 1 female 38.0 1 0 71.2833 C
2 1 3 female 26.0 0 0 7.9250 S
3 1 1 female 35.0 1 0 53.1000 S
4 0 3 male 35.0 0 0 8.0500 S
# Missing ages are replaced by the mean of the known ages.
mean_age = data['Age'].mean()
data['Age'] = data['Age'].fillna(mean_age)
# Every remaining NaN becomes 0; this also writes 0 into missing Embarked cells.
data.fillna(0, inplace=True)
# Binary-encode sex: 1 for 'male', 0 for anything else.
data['Sex'] = [int(sex == 'male') for sex in data['Sex']]
# One-hot encode Pclass into p1/p2/p3, then drop the source column.
for column, pclass in (('p1', 1), ('p2', 2), ('p3', 3)):
    data[column] = (data['Pclass'] == pclass).to_numpy().astype(np.int32)
data.drop(columns=['Pclass'], inplace=True)
# List the distinct Embarked values that remain after filling.
data['Embarked'].unique()
# output
array(['S', 'C', 'Q', 0], dtype=object)
# One-hot encode Embarked into e1/e2/e3; a row whose value is none of
# 'S', 'C', 'Q' gets 0 in all three columns.
for column, code in (('e1', 'S'), ('e2', 'C'), ('e3', 'Q')):
    data[column] = (data['Embarked'] == code).to_numpy().astype(np.int32)
data.drop(columns=['Embarked'], inplace=True)
# Check the common dtype of the resulting value matrix.
data.values.dtype
# output
dtype('float64')
# Preview the first rows after encoding.
data.head()
# output
Survived Sex Age SibSp Parch Fare p1 p2 p3 e1 e2 e3
0 0 1 22.0 1 0 7.2500 0 0 1 1 0 0
1 1 0 38.0 1 0 71.2833 1 0 0 0 1 0
2 1 0 26.0 0 0 7.9250 0 0 1 1 0 0
3 1 0 35.0 1 0 53.1000 1 0 0 1 0 0
4 0 1 35.0 0 0 8.0500 0 0 1 1 0 0
X与Y划分
# Feature matrix: every column except the label, in the frame's column order.
data_train = data.drop(columns='Survived').to_numpy()
# Label values reshaped into a single column, one row per sample.
data_target = data['Survived'].to_numpy().reshape(-1, 1)
data_train.shape, data_target.shape
# output
((891, 11), (891, 1))
训练集与测试集划分
# Hold out 20% of the samples for testing. The fixed random_state makes the
# split repeatable between runs, so scores can be compared across reruns.
x_train, x_test, y_train, y_test = train_test_split(
    data_train, data_target, test_size=0.2, random_state=42)
# Confirm the sizes of the two partitions.
x_train.shape, x_test.shape
# output
((712, 11), (179, 11))
3 决策树模型
如果特征为连续值,可根据需求对数据进行离散化处理(sklearn 的决策树可以直接处理连续特征,下面的示例未做离散化)
# Fit a decision tree with default settings and score it on the held-out set.
model = DecisionTreeClassifier().fit(x_train, y_train)
model.score(x_test, y_test)
# output
0.8044692737430168
# Score on the training data itself, for comparison with the test score.
model.score(x_train, y_train)
# output
0.9859550561797753
3.1 控制树的深度处理过拟合
def m_score(depth):
    """Fit a depth-limited decision tree and return its scores.

    Args:
        depth: Value passed to ``DecisionTreeClassifier`` as ``max_depth``.

    Returns:
        Tuple ``(train_score, test_score)`` obtained from ``score`` on the
        module-level ``x_train``/``y_train`` and ``x_test``/``y_test``.
    """
    tree = DecisionTreeClassifier(max_depth=depth).fit(x_train, y_train)
    return tree.score(x_train, y_train), tree.score(x_test, y_test)
# Evaluate every max_depth from 2 to 14 and display the (train, test) pairs.
depths = range(2, 15)
scores = list(map(m_score, depths))
scores
# output
[(0.7907303370786517, 0.770949720670391),
(0.8188202247191011, 0.8156424581005587),
(0.8412921348314607, 0.7988826815642458),
(0.8539325842696629, 0.8100558659217877),
(0.8609550561797753, 0.7877094972067039),
(0.8848314606741573, 0.8044692737430168),
(0.9030898876404494, 0.7653631284916201),
(0.9171348314606742, 0.7932960893854749),
(0.9325842696629213, 0.7877094972067039),
(0.9480337078651685, 0.776536312849162),
(0.9662921348314607, 0.8100558659217877),
(0.9719101123595506, 0.8100558659217877),
(0.976123595505618, 0.8044692737430168)]
# Split the (train, test) pairs into one list per curve.
train_s = [pair[0] for pair in scores]
test_s = [pair[1] for pair in scores]
# Plot against the actual max_depth values (not the list index) and label the
# curves, so the x axis reads as tree depth and the two lines can be told apart.
plt.plot(list(depths), train_s, label='train')
plt.plot(list(depths), test_s, label='test')
plt.xlabel('max_depth')
plt.ylabel('score')
plt.legend()
3.2 设置信息增益阈值处理过拟合
def m_score(depth):
    """Fit a tree with a minimum impurity-decrease threshold and score it.

    ``min_impurity_split`` was deprecated in scikit-learn 0.19 and removed in
    1.0, so the threshold is applied through ``min_impurity_decrease``: a node
    is split only when the split reduces the weighted impurity by at least
    this amount.

    Args:
        depth: Threshold value. The parameter name is kept so existing calls
            keep working; it is not a tree depth.

    Returns:
        Tuple ``(train_score, test_score)`` obtained from ``score`` on the
        module-level ``x_train``/``y_train`` and ``x_test``/``y_test``.
    """
    model = DecisionTreeClassifier(min_impurity_decrease=depth)
    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return train_score, test_score
# Sweep 50 evenly spaced thresholds between 0 and 0.5.
values = np.linspace(0, 0.5, 50)
scores = list(map(m_score, values))
train_s = [train for train, _ in scores]
test_s = [test for _, test in scores]
# Report the best test score and the threshold that produced it
# (np.argmax returns the first index when several scores tie).
best_index = np.argmax(test_s)
best_score, best_value = test_s[best_index], values[best_index]
best_score, best_value
# output
(0.8156424581005587, 0.01020408163265306)
# Plot against the actual threshold values (not the list index) and label the
# curves, so the x axis is meaningful and the two lines can be told apart.
plt.plot(values, train_s, label='train')
plt.plot(values, test_s, label='test')
plt.xlabel('impurity threshold')
plt.ylabel('score')
plt.legend()
4 交叉验证_决策树
为什么需要交叉验证——解决随机划分的差异和参数选择的问题,让所有的数据都参加到训练和评价当中
# Search max_depth and min_impurity_decrease jointly on the full data set.
depths = range(2, 15)
values = np.linspace(0, 0.5, 50)
param_grid = dict(max_depth=depths, min_impurity_decrease=values)
model = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
)
model.fit(data_train, data_target)
# output
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
param_grid={'max_depth': range(2, 15),
'min_impurity_decrease': array([0. , 0.01020408, 0.02040816, 0.03061224, 0.04081633,
0.05102041, 0.06122449, 0.07142857, 0.08163265, 0.09183673,
0.10204082, 0.1122449 , 0.12244898, 0.13265306, 0.14285714,
0.15306122, 0.16326531, 0.17346939, 0.18367347, 0.19387755,
0.20408163, 0.21428571, 0.2244898 , 0.23469388, 0.24489796,
0.25510204, 0.26530612, 0.2755102 , 0.28571429, 0.29591837,
0.30612245, 0.31632653, 0.32653061, 0.33673469, 0.34693878,
0.35714286, 0.36734694, 0.37755102, 0.3877551 , 0.39795918,
0.40816327, 0.41836735, 0.42857143, 0.43877551, 0.44897959,
0.45918367, 0.46938776, 0.47959184, 0.48979592, 0.5 ])})
# Parameter combination that achieved the best cross-validated score.
model.best_params_
# output
{'max_depth': 6, 'min_impurity_decrease': 0.0}
# Mean cross-validated score of that best parameter combination.
model.best_score_
# output
0.8170736300295023
5 随机森林模型
# Re-check the split sizes before training the forest.
x_train.shape, x_test.shape
# output
((712, 11), (179, 11))
model = RandomForestClassifier(n_estimators=100, n_jobs=4)
# n_estimators=100 builds 100 decision trees; n_jobs=4 runs 4 jobs in parallel
# (chosen here for a 4-core processor).
# Other tunable parameters: max_features (number of features considered when
# looking for a split) and min_samples_leaf (minimum number of samples
# required at a leaf node).
模型要求y为一维数组(形状为 (n_samples,)),因此用y_train.ravel()把 (n, 1) 的列向量展平
# The forest expects a 1-D label array, so the (n, 1) column is flattened first.
flat_labels = np.ravel(y_train)
model.fit(x_train, flat_labels)
model.score(x_test, y_test)
# output
0.8268156424581006
model.feature_importances_ # weights: the importance of each feature
# output
array([0.2605452 , 0.26374442, 0.05267658, 0.0382214 , 0.24781589,
0.02800777, 0.01477589, 0.05457004, 0.01605285, 0.01461969,
0.00897026])
# Pair each feature name with its importance; the names are listed in the same
# order as the columns of the feature matrix.
feature_names = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                 'p1', 'p2', 'p3', 'e1', 'e2', 'e3']
for name, importance in zip(feature_names, model.feature_importances_):
    print(name, ': ', importance)
# output
Sex : 0.2605451984578339
Age : 0.26374442405468573
SibSp : 0.05267657561222891
Parch : 0.038221403793938465
Fare : 0.24781589170572155
p1 : 0.0280077665796943
p2 : 0.014775893932383533
p3 : 0.054570038964384275
e1 : 0.016052852543197944
e2 : 0.014619692568660756
e3 : 0.008970261787270525
模型预测
# Predict the class of the first test sample; reshape(1, -1) turns the single
# row into the 2-D (1, n_features) array that predict() expects.
model.predict(x_test[0].reshape(1,-1))
# output
array([0], dtype=int64)
6 交叉验证_随机森林
# Cross-validate forests with 80 to 129 trees on the full data set and show
# the tree count with the best score.
n_estimators = range(80, 130)
param_grid = dict(n_estimators=n_estimators)
model = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
)
model.fit(data_train, np.ravel(data_target))
model.best_params_
# output
{'n_estimators': 94}
# Mean cross-validated score of the best n_estimators setting.
model.best_score_
# output
0.8193333751804657