Titanic survival prediction with linear regression, logistic regression, and random forest

import pandas as pd
from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.model_selection import KFold  # cross-validation splitter
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier  # random forest; usually worth trying first before falling back to regression
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest,f_classif


tatannike=pd.read_csv("taitannike.csv")
#print(tatannike.describe())  # describe() reports per-column statistics such as count, mean, and std
tatannike["Age"]=tatannike["Age"].fillna(tatannike["Age"].median())  # fillna() fills missing values, here with the column median
#print(tatannike['Sex'].unique())  # check which categories the Sex column contains
tatannike.loc[tatannike['Sex'] == 'male', 'Sex'] = 0  # df.loc[row indexer, column indexer]
tatannike.loc[tatannike['Sex'] == 'female', 'Sex'] = 1
tatannike['Embarked']=tatannike['Embarked'].fillna('S')
tatannike.loc[tatannike['Embarked']=='S','Embarked']=0
tatannike.loc[tatannike['Embarked']=='C','Embarked']=1
tatannike.loc[tatannike['Embarked']=='Q','Embarked']=2
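# A more idiomatic alternative to the .loc assignments above is Series.map,
# which applies the whole category-to-integer mapping in one call (a sketch,
# shown commented out so it does not re-encode the already converted columns):
# tatannike['Sex'] = tatannike['Sex'].map({'male': 0, 'female': 1})
# tatannike['Embarked'] = tatannike['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})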

# Linear regression
predictors=['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
alg=LinearRegression()  # ordinary least squares regressor
kf = KFold(n_splits=3)  # instantiate cross-validation; random_state only applies when shuffle=True

predictions=[]
for train, test in kf.split(tatannike['Survived']):
    train_predictors=tatannike[predictors].iloc[train,:]  # features for this fold's training rows
    train_target=tatannike['Survived'].iloc[train]
    alg.fit(train_predictors,train_target)
    test_predictions=alg.predict(tatannike[predictors].iloc[test,:])  # predict on the held-out rows
    predictions.append(test_predictions)

predictions=np.concatenate(predictions,axis=0)
predictions[predictions>0.5]=1  # map the regression output to class labels with a 0.5 threshold
predictions[predictions<=0.5]=0
accuracy=sum(predictions==tatannike['Survived'])/len(predictions)  # fraction of out-of-fold predictions that match the labels
print(accuracy)
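# The manual fold loop above can be condensed with cross_val_predict, which
# returns one out-of-fold prediction per row; a sketch of an equivalent
# computation, not part of the original script:
from sklearn.model_selection import cross_val_predict
oof=cross_val_predict(LinearRegression(),tatannike[predictors],tatannike['Survived'],cv=kf)
print(((oof>0.5).astype(int)==tatannike['Survived']).mean())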

# Logistic regression
alg2=LogisticRegression(random_state=1)  # logistic regression classifier
scores=model_selection.cross_val_score(alg2,tatannike[predictors],tatannike['Survived'],cv=3)
print(scores.mean())
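# On unscaled features the default lbfgs solver can hit its iteration limit
# before converging; one common remedy is to standardize inside a Pipeline.
# A sketch under that assumption:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
alg2_scaled=make_pipeline(StandardScaler(),LogisticRegression(random_state=1))
print(model_selection.cross_val_score(alg2_scaled,tatannike[predictors],tatannike['Survived'],cv=3).mean())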

# Random forest
# instantiate the random forest classifier
alg3=RandomForestClassifier(random_state=1,n_estimators=100,min_samples_split=8,min_samples_leaf=4)
scores2=model_selection.cross_val_score(alg3,tatannike[predictors],tatannike['Survived'],cv=kf)
print(scores2.mean())
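# A fitted random forest also exposes feature_importances_; a quick sketch to
# inspect which columns the model leans on:
alg3.fit(tatannike[predictors],tatannike['Survived'])
for name,importance in zip(predictors,alg3.feature_importances_):
    print(name,round(importance,3))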

tatannike['familysize']=tatannike['SibSp']+tatannike['Parch']
tatannike['namelength']=tatannike['Name'].apply(lambda x:len(x))  # lambda defines a simple anonymous function

def get_title(name):  # extract the title token from a passenger name
    title_search=re.search(r' ([A-Za-z]+)\.',name)  # raw string avoids the invalid escape sequence warning on \.
    if title_search:
        return title_search.group(1)
    return " "
title=tatannike['Name'].apply(get_title)
print(title.value_counts())  # Series.value_counts(); the top-level pd.value_counts is deprecated

title_mapping={'Mr':1,'Miss':2,'Mrs':3,'Master':4,'Dr':5,'Rev':6,'Mlle':7,'Col':8,'Major':9,'Capt':10,'Don':11,'Ms':12,'Jonkheer':13,'Lady':14,'Sir':15,'Mme':16,'Countess':17}
for k,v in title_mapping.items():
    title[title==k]=v
print(title.value_counts())
tatannike['title']=title
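# The replacement loop above is equivalent to a single Series.map call, with
# the caveat that map turns any title missing from the dict into NaN instead
# of leaving it as a string (a sketch, shown commented out):
# tatannike['title']=tatannike['Name'].apply(get_title).map(title_mapping)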

predictors=['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','familysize','namelength','title']
selector=SelectKBest(f_classif,k=5)  # SelectKBest keeps the k highest-scoring features
selector.fit(tatannike[predictors],tatannike['Survived'])
scores3=-np.log10(selector.pvalues_)  # pvalues_ holds each feature's p-value; -log10 turns small p-values into large scores
print(scores3)

plt.bar(range(len(predictors)),scores3)
plt.xticks(range(len(predictors)),predictors,rotation='vertical')
plt.show()
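# SelectKBest can also report which k features survived; get_support() returns
# a boolean mask aligned with the input columns (a short sketch):
selected=[p for p,keep in zip(predictors,selector.get_support()) if keep]
print(selected)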

predictors=['Pclass','Sex','Fare','title']  # retrain with only the strongest features from the plot
alg4=RandomForestClassifier(random_state=1,n_estimators=50,min_samples_leaf=4,min_samples_split=8)
scores4=model_selection.cross_val_score(alg4,tatannike[predictors],tatannike['Survived'],cv=kf)
print(scores4.mean())
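# To use the final model for inference, refit it on all rows and call predict;
# a usage sketch (predicting back on the training frame only for illustration):
alg4.fit(tatannike[predictors],tatannike['Survived'])
print(alg4.predict(tatannike[predictors].head()))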
