1.Python libraries
NumPy: Python scientific computing library; Pandas: Python data analysis and processing library; Scikit-learn: Python machine learning library.
2.Titanic dataset overview
Passenger ID (PassengerId), survived or not (Survived), ticket class (Pclass), name (Name), sex (Sex), age (Age), number of siblings/spouses aboard (SibSp), number of parents/children aboard (Parch), ticket number (Ticket), fare (Fare), port of embarkation (Embarked).
3.Data preprocessing
- import pandas #IPython/Jupyter notebook
- titanic = pandas.read_csv("titanic_train.csv")
- #titanic.head(3)  #print the first 3 rows
- print(titanic.describe())  #summary statistics: count, mean, std, min, 25%, 50%, 75%, max
- titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())  #fill missing values in the Age column with the Age median
- print(titanic.describe())
- print(titanic["Sex"].unique())  #encode male as 0, female as 1
- #Replace all the occurrences of male with the number 0.
- titanic.loc[titanic["Sex"] == "male","Sex"] = 0
- titanic.loc[titanic["Sex"] == "female","Sex"] = 1
- print(titanic["Embarked"].unique())
- titanic["Embarked"] = titanic["Embarked"].fillna("S")  #fill missing values with "S", the most frequent port
- titanic.loc[titanic["Embarked"] == "S","Embarked"] = 0  #encode the ports as 0, 1, 2
- titanic.loc[titanic["Embarked"] == "C","Embarked"] = 1
- titanic.loc[titanic["Embarked"] == "Q","Embarked"] = 2
4.Regression models
- #Import the linear regression class
- from sklearn.linear_model import LinearRegression  #linear regression
- #Sklearn also has a helper that makes it easy to do cross validation
- from sklearn.cross_validation import KFold  #cross-validate on the training set and average the fold scores
- #The columns we'll use to predict the target
- predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]  #the input features
- #Initialize our algorithm class
- alg = LinearRegression()
- #Generate cross validation folds for the titanic dataset. It returns the row indices corresponding to train and test
- kf = KFold(titanic.shape[0],n_folds=3,random_state=1)  #split the samples into 3 folds for cross validation
- predictions = []
- for train,test in kf:
- #The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
- train_predictors = (titanic[predictors].iloc[train,:])
- #The target we're using to train the algorithm.
- train_target = titanic["Survived"].iloc[train]
- #Training the algorithm using the predictors and target.
- alg.fit(train_predictors,train_target)
- #We can now make predictions on the test fold
- test_predictions = alg.predict(titanic[predictors].iloc[test,:])
- predictions.append(test_predictions)
- import numpy as np
- #The predictions are in three separate numpy arrays. Concatenate them into one.
- #We concatenate them on axis 0,as they only have one axis.
- predictions = np.concatenate(predictions,axis=0)
- #Map predictions to outcomes(only possible outcomes are 1 and 0)
- predictions[predictions>.5] = 1
- predictions[predictions<=.5] = 0
- accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
- print(accuracy)
- from sklearn import cross_validation
- from sklearn.linear_model import LogisticRegression  #logistic regression
- #Initialize our algorithm
- alg=LogisticRegression(random_state=1)
- #Compute the accuracy score for all the cross validation folds.(much simpler than what we did before!)
- scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=3)
- #Take the mean of the scores (because we have one for each fold)
- print(scores.mean())
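Note: the sklearn.cross_validation module used in these notes was deprecated in scikit-learn 0.18 and removed in 0.20; on current versions the same helpers live in sklearn.model_selection. The logistic regression evaluation above, rewritten for the newer API:
- from sklearn.model_selection import cross_val_score
- from sklearn.linear_model import LogisticRegression
- alg = LogisticRegression(random_state=1)
- scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
- print(scores.mean())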
5.Random forest model
(1) Randomly sample the training rows (sampling with replacement)
(2) Randomly select a subset of features at each split
(3) Build many decision trees and combine them by majority vote (a minimal sketch of these three ideas follows)
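The sketch below only illustrates those three ideas, not sklearn's internals; it assumes X and y are NumPy arrays with 0/1 labels, and the helper name toy_forest_predict is ours:
- import numpy as np
- from sklearn.tree import DecisionTreeClassifier
- def toy_forest_predict(X, y, X_new, n_trees=10, seed=1):
-     rng = np.random.RandomState(seed)
-     votes = []
-     for _ in range(n_trees):
-         idx = rng.randint(0, len(X), len(X))  #(1) bootstrap-sample the rows, with replacement
-         tree = DecisionTreeClassifier(max_features="sqrt", random_state=seed)  #(2) random feature subset per split
-         tree.fit(X[idx], y[idx])
-         votes.append(tree.predict(X_new))
-     return (np.mean(votes, axis=0) > 0.5).astype(int)  #(3) majority vote across the trees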
- from sklearn import cross_validation
- from sklearn.ensemble import RandomForestClassifier
- predictors=["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
- #Initialize our algorithm with the default parameters
- #n_estimators is the number of trees we want to make
- #min_samples_split is the minimum number of rows we need to make a split
- #min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
- alg=RandomForestClassifier(random_state=1,n_estimators=10,min_samples_split=2,min_samples_leaf=1)  #10 decision trees; stopping conditions: at least 2 samples to split a node, at least 1 sample per leaf
- #Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!)
- kf=cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
- scores=cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
- #Take the mean of the scores (because we have one for each fold)
- print(scores.mean())
- #Generating a familysize column
- titanic["FamilySize"]=titanic["SibSp"]+titanic["Parch"]
- #The .apply method generates a new series
- titanic["NameLength"]=titanic["Name"].apply(lambda x:len(x))
- import re
- #A function to get the title from a name
- def get_title(name):
- #Use a regular expression to search for a title. Titles always consist of capital and lowercase letters
- title_search = re.search(r'([A-Za-z]+)\.',name)
- #If the title exists,extract and return it.
- if title_search:
- return title_search.group(1)
- return ""
- #Get all the titles and print how often each one occurs.
- titles=titanic["Name"].apply(get_title)
- print(pandas.value_counts(titles))
- #Map each title to an integer. Some titles are very rare and are compressed into the same codes as other titles.
- title_mapping = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Dr":5,"Rev":6,"Major":7,"Col":7,"Mlle":8,"Mme":8,"Don":9,"Lady":10,"Countess":10,"Jonkheer":10,"Sir":9,"Capt":7}
- for k,v in title_mapping.items():
- titles[titles==k]=v
- #Verify that we converted everything.
- print(pandas.value_counts(titles))
- #Add in the title column
- titanic["Title"]=titles
6.Feature selection
Judge how important a feature is by the change in error rate after replacing that feature's values with noise (a sketch of this idea follows).
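This noise-and-compare idea is essentially permutation importance; a minimal sketch, reusing the random forest alg and predictors from section 5 (the helper name permutation_importances is ours; it scores on the training data for brevity, where a held-out fold would be more rigorous):
- import numpy as np
- def permutation_importances(alg, X, y, predictors, seed=1):
-     rng = np.random.RandomState(seed)
-     alg.fit(X, y)
-     baseline = (alg.predict(X) == y).mean()  #accuracy before adding noise
-     drops = {}
-     for col in predictors:
-         X_noisy = X.copy()
-         X_noisy[col] = rng.permutation(X_noisy[col].values)  #replace the column with shuffled "noise"
-         drops[col] = baseline - (alg.predict(X_noisy) == y).mean()  #bigger drop => more important feature
-     return drops
- print(permutation_importances(alg, titanic[predictors], titanic["Survived"], predictors))
The code that follows instead uses univariate selection (SelectKBest with f_classif) to rank features.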
- import numpy as np
- from sklearn.feature_selection import SelectKBest,f_classif
- import matplotlib.pyplot as plt
- predictors = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","FamilySize","Title","NameLength"]
- #Perform feature selection
- selector=SelectKBest(f_classif,k=5)
- selector.fit(titanic[predictors],titanic["Survived"])
- #Plot the raw p-values for each feature,and transform from p-values into scores
- scores=-np.log10(selector.pvalues_)
- #Plot the scores. See how "Pclass","Sex","Title",and "Fare" are the best?
- plt.bar(range(len(predictors)),scores)
- plt.xticks(range(len(predictors)),predictors,rotation='vertical')
- plt.show()
- #Pick only the four best features.
- predictors=["Pclass","Sex","Fare","Title"]
- alg=RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=8,min_samples_leaf=4)
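The notes define this tuned forest but never score it; a quick check mirroring the earlier cross-validation calls (same old-style API as the rest of the notes):
- kf = cross_validation.KFold(titanic.shape[0],n_folds=3,random_state=1)
- scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic["Survived"],cv=kf)
- print(scores.mean())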
Ensembling: combine several different algorithms and average their predictions to get the final answer.
- from sklearn.ensemble import GradientBoostingClassifier
- import numpy as np
- #The algorithms we want to ensemble.
- #We're using the more linear predictors for the logistic regression,and everything with the gradient boosting classifier
- algorithms=[
-     [GradientBoostingClassifier(random_state=1,n_estimators=25,max_depth=3),["Pclass","Sex","Age","Fare","FamilySize","Title","Embarked"]],
-     [LogisticRegression(random_state=1),["Pclass","Sex","Fare","FamilySize","Title","Age","Embarked"]]
- ]
- #Initialize the cross validation folds
- kf=KFold(titanic.shape[0],n_folds=3,random_state=1)
- predictions=[]
- for train,test in kf:
- train_target=titanic["Survived"].iloc[train]
- full_test_predictions=[]
- #Make predictions for each algorithm on each fold
- for alg,predictors in algorithms:
- #Fit the algorithm on the training data
- alg.fit(titanic[predictors].iloc[train,:],train_target)
- #Select and predict on the test fold
- #The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
- test_predictions=alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
- full_test_predictions.append(test_predictions)
- #Use a simple ensembling scheme -- just average the predictions to get the final classification.
- test_predictions=(full_test_predictions[0]+full_test_predictions[1])/2
- #Any value over .5 is assumed to be a 1 prediction,and below .5 is a 0 prediction.
- test_predictions[test_predictions<=0.5]=0
- test_predictions[test_predictions>0.5]=1
- predictions.append(test_predictions)
- #Put all the predictions together into one array.
- predictions=np.concatenate(predictions,axis=0)
- #Compute accuracy by comparing to the training data
- accuracy=sum(predictions==titanic["Survived"])/len(predictions)
- print(accuracy)
- #The gradient boosting classifier generates better predictions,so we weight it higher.
- #full_predictions comes from the test-set step sketched below: each algorithm is fit on the full training set and predicts on the Kaggle test file.
- predictions=(full_predictions[0]*3+full_predictions[1]*1)/4
- predictions
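A hedged reconstruction of that final test-set step, assuming a titanic_test DataFrame (read from Kaggle's test file) that has been run through the same preprocessing and feature engineering as the training set; the loop shape follows the cross-validation snippet above:
- full_predictions=[]
- for alg,predictors in algorithms:
-     #Fit the algorithm using the full training data
-     alg.fit(titanic[predictors],titanic["Survived"])
-     #Predict survival probabilities on the test set
-     full_predictions.append(alg.predict_proba(titanic_test[predictors].astype(float))[:,1])
- #Weight the gradient boosting classifier 3:1 over logistic regression
- predictions=(full_predictions[0]*3+full_predictions[1])/4
- predictions[predictions<=.5]=0
- predictions[predictions>.5]=1
- predictions=predictions.astype(int)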