# 解析数据 (parse the data) — `//` is not a Python comment marker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Titanic training set.  Raw string: "D:\pythona\..." contains
# backslash sequences that are invalid string escapes (DeprecationWarning
# today, SyntaxError in future Python versions).
train = pd.read_csv(r"D:\pythona\Titanic.csv")
train.head()  # notebook-style peek at the first rows
# 运行结果: (output shown when run)
# 查看所有数据 (inspect all the data) — `//` is not a Python comment marker
# Column dtypes and non-null counts — shows which features have missing values.
train.info()
# 运行结果: (output shown when run)
# Fill the two missing Embarked values with the most frequent port.
# The original chained-indexing assignment (train.Embarked[mask] = ...)
# raises SettingWithCopyWarning and silently fails to modify `train`
# under pandas copy-on-write; fillna assigns reliably.  Series.mode()
# already ignores NaN, so no dropna() is needed.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])
# Missing Cabin values get the "U0" placeholder (later used as a
# "no cabin recorded" flag).
train["Cabin"] = train["Cabin"].fillna("U0")
from sklearn.ensemble import RandomForestRegressor as RFR
# Impute missing ages with a random forest trained on the numeric features.
age_features = train[["Age", "Survived", "Fare", "Parch", "SibSp", "Pclass"]]
known_age = age_features.loc[train["Age"].notnull()]
unknown_age = age_features.loc[train["Age"].isnull()]
# Column 0 is Age (target); the remaining columns are the predictors.
rf = RFR(n_estimators=1000, n_jobs=-1)
rf.fit(known_age.values[:, 1:], known_age.values[:, 0])
train.loc[train["Age"].isnull(), ["Age"]] = rf.predict(unknown_age.values[:, 1:])
train.info()  # confirm Age now has no missing values
# 运行结果: (output shown when run)
# Feature analysis.  Start with the overall survival rate.
survival_counts = train["Survived"].value_counts()
survival_counts.plot.pie(autopct="%1.2f%%")
plt.show()
# Roughly 38.38% of passengers survived.

# 1. Sex: women survived at a much higher rate than men.
sns.countplot(x="Sex", hue="Survived", data=train)

# 2. Pclass: cabin class proxies social status — higher class,
# better odds of rescue.
sns.countplot(x="Pclass", hue="Survived", data=train)
# 4. Age.  Violin plot: survival looks lowest around ages ~16-30.
sns.violinplot(x="Survived", y="Age", data=train, split=True)
# Same view broken down by sex — makes the pattern above clearer.
sns.violinplot(x="Sex", y="Age", hue="Survived", data=train, split=True)
# And broken down by cabin class.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=train, split=True)
# Overall age distribution: most passengers are 20-40, few over 60.
train.boxplot(column="Age")
# Mean survival rate for each integer age.
plt.figure(figsize=(20, 5))
train["Age_int"] = train["Age"].astype(int)
age_survival = train[["Age_int", "Survived"]].groupby(["Age_int"], as_index=False).mean()
sns.barplot(x="Age_int", y="Survived", data=age_survival)
# Based on that plot, bin the ages into decades.
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
train["Age-Cut"] = pd.cut(train["Age"], age_bins)
# 5. Title extracted from Name.
# Raw string: without the r"" prefix, "\." is an invalid escape sequence
# (DeprecationWarning today, SyntaxError in future Python versions).
train["Title"] = train["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Cross-tab of title vs sex — titles encode both sex and social standing.
pd.crosstab(train["Title"], train["Sex"])
plt.figure(figsize=(20, 5))
sns.countplot(x="Title", hue="Survived", data=train)
# Name length vs survival: longer names appear to correlate with survival,
# plausibly because higher-status passengers carried longer names.
plt.figure(figsize=(20, 5))
train["Name-Len"] = train["Name"].apply(len)
sns.countplot(x="Name-Len", hue="Survived", data=train)
# 6. Siblings/spouses aboard (SibSp).
sns.countplot(x="SibSp", hue="Survived", data=train)
# Compare "has siblings/spouse aboard" vs "travelling without".
# Configure matplotlib so Chinese labels and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
with_sibsp = train[train["SibSp"] != 0]
without_sibsp = train[train["SibSp"] == 0]
# NOTE(review): labels=["N","Y"] assumes value_counts() yields 0 then 1,
# but value_counts orders by frequency — confirm the labels line up.
with_sibsp["Survived"].value_counts().plot.pie(labels=["N", "Y"], autopct="%1.2f%%")
plt.xlabel("有兄弟姐妹")
without_sibsp["Survived"].value_counts().plot.pie(labels=["N", "Y"], autopct="%1.2f%%")
plt.xlabel("没有兄弟姐妹")
# Passengers with siblings/spouses aboard survived at a somewhat higher rate.

# 7. Parents/children aboard (Parch): having family aboard also helps.
sns.countplot(x="Parch", hue="Survived", data=train)
# 8. Fare.  Fares per cabin class — Pclass 1 is, as expected, the priciest.
train.boxplot(column="Fare", by="Pclass")
# Bin fares and compare survival per bin: higher fare, higher survival rate.
fare_bins = [0, 50, 100, 200, 300, 500]
train["Fare-Cut"] = pd.cut(train["Fare"], fare_bins)
sns.countplot(x="Fare-Cut", hue="Survived", data=train)

# 9. Cabin.  Too many values are missing, so reduce the column to a
# has-cabin flag: "U0" was the fill placeholder for missing cabins earlier,
# so it maps to 0, any real cabin code maps to 1.
train["Cabin"] = train["Cabin"].apply(lambda cabin: 0 if cabin == "U0" else 1)
sns.countplot(x="Cabin", hue="Survived", data=train)
# Passengers with a recorded cabin survived more often — perhaps crews only
# logged cabins for the well-to-do.
# 10. Port of embarkation.
# seaborn renamed factorplot -> catplot and the `size` kwarg -> `height`;
# the old call only warned on seaborn < 0.9 and fails on newer releases.
# kind="point" reproduces factorplot's default point plot.
sns.catplot(x='Embarked', y='Survived', data=train, height=3, aspect=2, kind='point')
# Boarding at port C shows a higher survival rate.
# Feature analysis done — next: drop, extract, and construct features.
# Split target from features; drop identifiers and raw columns superseded
# by engineered ones.  (The original drop list contained "Age" twice; the
# duplicate is removed — pandas drops the label once either way.)
labels = train["Survived"]
features = train.drop(["Survived", "PassengerId", "Name", "Age", "Ticket"], axis=1)
# One-hot encode the non-numeric features.
features = pd.get_dummies(features)
encoded = list(features.columns)
#这是一个算法库
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score,roc_auc_score
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
# from xgboost.sklearn import XGBClassifier
#这里使用的网格搜索自动调参
#先编写一个函数
def fit_model(alg, parameters):
    """Grid-search `parameters` for estimator `alg` (5-fold CV, ROC-AUC scoring).

    Prints the best model's score on the full training set, the search
    time in seconds, and the best parameter combination, then returns the
    fitted GridSearchCV object.  Reads the module-level `features`/`labels`.

    NOTE: the function body had lost its indentation in the original
    listing and would not parse; it is restored here.  The dead
    commented-out fit/score lines were removed.
    """
    X = np.array(features)
    y = np.array(labels)
    scorer = make_scorer(roc_auc_score)  # scoring criterion
    grid = GridSearchCV(alg, parameters, scoring=scorer, cv=5)
    start = time()  # time the search
    grid = grid.fit(X, y)
    # Training-set score of the best estimator — an optimistic estimate.
    a = grid.score(X, y)
    end = time()
    t = round(end - start, 3)
    print(a)
    print("搜索时间:", t)
    print(grid.best_params_)  # best hyper-parameter combination
    return grid
# Build the four candidate models and their hyper-parameter grids, then run
# the grid search on each.  SVC needs probability=True because
# roc_auc_score is used as the scoring metric.
alg1 = DecisionTreeClassifier(random_state=15)
alg2 = SVC(probability=True, random_state=15)
alg3 = RandomForestClassifier(random_state=15)
alg4 = KNeighborsClassifier(n_jobs=-1)

parameters1 = {"max_depth": range(1, 10), "min_samples_split": range(2, 10)}
parameters2 = {"C": range(1, 20), "gamma": [0.05, 0.1, 0.15, 0.2, 0.25]}
parameters3 = {"n_estimators": range(10, 200, 10)}
parameters4 = {"n_neighbors": range(2, 10), "leaf_size": range(10, 80, 20)}

clf1 = fit_model(alg1, parameters1)
clf2 = fit_model(alg2, parameters2)
clf3_m1 = fit_model(alg3, parameters3)
clf5 = fit_model(alg4, parameters4)
# 以上用决策树、SVM、随机森林、KNN四种方法创建模型,准确率最高的是随机森林,准确率最低的是KNN.