import pandas as pd
from sklearn.linear_model import LinearRegression #导入线性回归包
from sklearn.linear_model import LogisticRegression #导入逻辑回归
from sklearn.model_selection import KFold #导入交叉验证包
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier #导入随即森林模块 一般先用这个试试在考虑回归
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest,f_classif
# Load the Titanic dataset and encode the non-numeric columns so the
# sklearn models below can consume them.
tatannike = pd.read_csv("taitannike.csv")
# print(tatannike.describe())  # describe(): per-column stats (count, mean, std, ...)

# Fill missing ages with the median age.
tatannike["Age"] = tatannike["Age"].fillna(tatannike["Age"].median())
# print(tatannike['Sex'].unique())  # inspect the distinct Sex values

# Encode Sex as 0 (male) / 1 (female).
for sex_label, sex_code in (('male', 0), ('female', 1)):
    tatannike.loc[tatannike['Sex'] == sex_label, 'Sex'] = sex_code

# Fill missing embarkation ports with the most common one ('S'),
# then encode S/C/Q as 0/1/2.
tatannike['Embarked'] = tatannike['Embarked'].fillna('S')
for port_code, port in enumerate(('S', 'C', 'Q')):
    tatannike.loc[tatannike['Embarked'] == port, 'Embarked'] = port_code
# --- Linear regression baseline, evaluated with manual 3-fold CV ---
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
# FIX: passing random_state together with shuffle=False raises a ValueError
# in modern scikit-learn (random_state is meaningless without shuffling),
# so the seed is dropped; fold composition is unchanged.
kf = KFold(n_splits=3)
predictions = []
for train_idx, test_idx in kf.split(tatannike[predictors]):
    # Fit on the training fold ...
    train_predictors = tatannike[predictors].iloc[train_idx, :]
    train_target = tatannike['Survived'].iloc[train_idx]
    alg.fit(train_predictors, train_target)
    # ... and predict the held-out fold.
    fold_predictions = alg.predict(tatannike[predictors].iloc[test_idx, :])
    predictions.append(fold_predictions)
predictions = np.concatenate(predictions, axis=0)
# Threshold the continuous regression output at 0.5 to get 0/1 class labels.
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
# FIX: the original summed only the values of the correctly-predicted rows,
# which counts correct 1-predictions but not correct 0-predictions.
# Accuracy is the fraction of ALL predictions that match the true labels.
accuracy = np.mean(predictions == tatannike['Survived'].values)
print(accuracy)
# --- Logistic regression, scored with 3-fold cross-validation ---
alg2 = LogisticRegression(random_state=1)
scores = model_selection.cross_val_score(
    alg2, tatannike[predictors], tatannike['Survived'], cv=3
)
print(scores.mean())
# --- Random forest classifier, scored with the same KFold splitter ---
alg3 = RandomForestClassifier(
    random_state=1,
    n_estimators=100,     # number of trees in the forest
    min_samples_split=8,  # min samples required to split an internal node
    min_samples_leaf=4,   # min samples required at a leaf node
)
scores2 = model_selection.cross_val_score(
    alg3, tatannike[predictors], tatannike['Survived'], cv=kf
)
print(scores2.mean())
# Derived features: family size aboard, and the length of the full name.
tatannike['familysize'] = tatannike['SibSp'] + tatannike['Parch']
tatannike['namelength'] = tatannike['Name'].apply(len)
def get_title(name):
    """Extract the honorific title (e.g. 'Mr', 'Miss') from a passenger name.

    The title is the alphabetic word immediately followed by a period,
    such as 'Mr' in 'Braund, Mr. Owen Harris'.  Returns a single space
    when no title is found.
    """
    # FIX: use a raw string for the regex — '\.' in a plain string is an
    # invalid escape sequence (SyntaxWarning on Python 3.12+).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return " "
# Map each name to its title, then replace the textual titles with numeric
# codes so the title can be used as a model feature.
title = tatannike['Name'].apply(get_title)
# FIX: pd.value_counts(obj) is deprecated (removed in pandas 3.0);
# call value_counts() on the Series instead.
print(title.value_counts())
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6,
                 'Mlle': 7, 'Col': 8, 'Major': 9, 'Capt': 10, 'Don': 11,
                 'Ms': 12, 'Jonkheer': 13, 'Lady': 14, 'Sir': 15, 'Mme': 16,
                 'Countess': 17}
for k, v in title_mapping.items():
    title[title == k] = v
print(title.value_counts())
tatannike['title'] = title
# --- Univariate feature selection: score every candidate feature ---
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'familysize', 'namelength', 'title']
# SelectKBest keeps the k features with the highest ANOVA F-scores.
selector = SelectKBest(f_classif, k=5)
selector.fit(tatannike[predictors], tatannike['Survived'])
# Convert p-values to -log10 so taller bars mean more informative features.
scores3 = -np.log10(selector.pvalues_)
print(scores3)
positions = range(len(predictors))
plt.bar(positions, scores3)
plt.xticks(positions, predictors, rotation='vertical')
plt.show()
# --- Final random forest restricted to the four strongest features ---
predictors = ['Pclass', 'Sex', 'Fare', 'title']
alg4 = RandomForestClassifier(
    random_state=1, n_estimators=50, min_samples_leaf=4, min_samples_split=8
)
scores4 = model_selection.cross_val_score(
    alg4, tatannike[predictors], tatannike['Survived'], cv=kf
)
print(scores4.mean())
# Titanic survival prediction with linear regression, logistic regression,
# and random forest.
# (Blog footer residue: "latest recommended article published 2024-04-29 14:11:23")