合辑传送门 -->> 数据分析-合辑
泰坦尼克号船员获救(随机森林)
数据集 titanic_train.csv 中包含891行12列数据
问题:泰坦尼克号的船员获救与其特征的关系问题,根据已有的11个特征及survived进行分类,判断是否获救
数据资源: https://pan.baidu.com/s/1fzqeieHOrBmV5TJ1RfhBbw 提取码: buiu
如果随机森林的过程不清楚,可参见数据分析理论【7】之 集成算法
首先读取 titanic_train.csv 文件并对相关特征进行预处理
def Read_csv(filename):
    """Read the Titanic CSV and preprocess its features.

    Steps: fill missing Age with the median, drop non-predictive columns,
    reduce Ticket to its trailing number, encode Sex/Embarked as integers,
    and fill missing Embarked with the mode.

    Returns:
        (features, labels): features is the preprocessed DataFrame without
        'Survived'; labels is a one-column DataFrame holding 'Survived'.
    """
    print('正在读取 ', filename)
    data = pd.read_csv(filename)
    # Fill missing ages with the median age
    data['Age'] = data['Age'].fillna(data['Age'].median())
    # Drop features with no predictive value
    data.drop(['PassengerId', 'Name', 'Cabin'], axis=1, inplace=True)
    # Keep only the numeric tail of the ticket string; 'LINE' tickets carry
    # no number, so map them to 0. Scalar .at access is correct here.
    for i in range(data.shape[0]):
        ticket = data['Ticket'][i]
        data.at[i, 'Ticket'] = 0 if ticket == 'LINE' else ticket.split(' ')[-1]
    # BUG FIX: DataFrame.at only accepts a single (row, column) label pair;
    # boolean-mask assignment must go through .loc, otherwise pandas raises.
    data.loc[data['Sex'] == 'male', 'Sex'] = 1
    data.loc[data['Sex'] == 'female', 'Sex'] = 0
    data.loc[data['Embarked'] == 'S', 'Embarked'] = 1
    data.loc[data['Embarked'] == 'C', 'Embarked'] = 2
    data.loc[data['Embarked'] == 'Q', 'Embarked'] = 3
    # Fill missing embarkation port with the mode ('S' -> 1 is most common:
    # 644 vs 168 for 'C' and 77 for 'Q' in the training data)
    data['Embarked'] = data['Embarked'].fillna(1)
    # Split into feature matrix and label column
    return data.drop(['Survived'], axis=1), pd.DataFrame(data['Survived'])
利用决策树交叉验证查看结果
def klg(data, label):
    """5-fold cross-validation with a decision tree; print per-fold and mean accuracy.

    Args:
        data: feature DataFrame.
        label: one-column DataFrame holding the target ('Survived').
    """
    from sklearn.model_selection import KFold
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(random_state=42)
    # BUG FIX: passing random_state without shuffle=True raises ValueError in
    # modern scikit-learn (random_state has no effect when shuffle is False).
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    print('#' * 30)
    print()
    for fold, (train, test) in enumerate(kf.split(data), start=1):
        # .iloc on the whole frame replaces the redundant
        # data[data.columns.values].iloc[...] double indexing;
        # ravel the label to a 1-D array to avoid DataConversionWarning.
        tree.fit(data.iloc[train], label.iloc[train].values.ravel())
        # Compute the fold score once instead of twice
        score = tree.score(data.iloc[test], label.iloc[test])
        print('第%d轮 交叉验证得分: ' % fold, score)
        scores.append(float(score))
    print('最终 交叉验证得分: ', sum(scores) / len(scores))
    print()
    print('#' * 30)
尝试使用随机森林交叉验证进行分析结果
def RanForest(data, label):
    """5-fold cross-validation with a random forest; print per-fold and mean accuracy.

    Args:
        data: feature DataFrame.
        label: one-column DataFrame holding the target ('Survived').
    """
    from sklearn.model_selection import KFold
    from sklearn.ensemble import RandomForestClassifier
    alg = RandomForestClassifier(random_state=42, n_estimators=10,
                                 min_samples_split=2, min_samples_leaf=1)
    # BUG FIX: passing random_state without shuffle=True raises ValueError in
    # modern scikit-learn (random_state has no effect when shuffle is False).
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    print('#' * 30)
    print()
    for fold, (train, test) in enumerate(kf.split(data), start=1):
        # Flatten the label column to 1-D, as the estimator expects
        alg.fit(data.iloc[train], np.asarray(label.iloc[train]).ravel())
        # Compute the fold score once instead of twice
        score = alg.score(data.iloc[test], label.iloc[test])
        print('第%d轮 交叉验证得分: ' % fold, score)
        scores.append(float(score))
    print('最终 交叉验证得分: ', sum(scores) / len(scores))
    print()
    print('#' * 30)
似乎效果也一般,也不过增加了2%的得分
我们看一下sklearn中 RandomForestClassifier 中的一些配置参数(sklearn文档)
挑几个相对比较常用的
①n_estimators:随机森林中树的个数
②min_samples_split:对节点进行拆分时所需要的最小样本数
③min_samples_leaf:叶节点所需的最小样本数
④max_depth:树的最大深度
修改RandomForestClassifier 的一些属性
# Tuned configuration: more trees (50) plus constrained leaf size and tree
# depth to reduce overfitting versus the defaults used above.
alg = RandomForestClassifier(random_state=42,
n_estimators=50,
min_samples_split=2,
min_samples_leaf=2,
max_depth=10)
能得到
似乎已经到了一个瓶颈,我们来分析一下特征重要性
def FeatureTest(data, label):
    """Plot per-feature significance (-log10 of the ANOVA F-test p-value).

    Higher bars mean the feature is more strongly associated with the label.

    Args:
        data: feature DataFrame.
        label: one-column DataFrame holding the target ('Survived').
    """
    from sklearn.feature_selection import SelectKBest, f_classif
    import matplotlib.pyplot as plt
    features = list(data.keys())
    print(features)
    # k only limits transform(); pvalues_ are computed for every feature
    selector = SelectKBest(f_classif, k=5)
    # Ravel the column-vector label to avoid sklearn's DataConversionWarning
    selector.fit(data, np.asarray(label).ravel())
    score = -np.log10(selector.pvalues_)
    plt.bar(range(len(features)), score)
    plt.xticks(range(len(features)), features, rotation='vertical')
    plt.show()