# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross-validation
from sklearn.model_selection import KFold
# The columns we'll use to predict the target (the classifier's features)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm class
alg = LinearRegression()
# Generate cross-validation folds for the titanic dataset. KFold returns the
# row indices corresponding to the train and test sets.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(n_splits=3, random_state=1)
print(kf)
predictions = []
# Collect the out-of-fold predictions from each cross-validation split
for train, test in kf.split(titanic):
    # The predictors we're using to train the algorithm. Note how we only take
    # the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm.
    train_target = titanic["Survived"].iloc[train]
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)
KFold(n_splits=3, random_state=1, shuffle=False)
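# As an aside (a sketch, not part of the original run): the manual loop above
# can be collapsed into a single call with cross_val_predict, which also
# returns the out-of-fold predictions. The variable name oof_predictions is
# illustrative.
from sklearn.model_selection import cross_val_predict
oof_predictions = cross_val_predict(alg, titanic[predictors], titanic["Survived"], cv=kf)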
import numpy as np

# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)
# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Accuracy is the fraction of predictions that match the known labels
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
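# As a cross-check (a sketch, not in the original notebook): sklearn's
# accuracy_score computes the same fraction of matching labels and guards
# against hand-rolled indexing mistakes.
from sklearn.metrics import accuracy_score
print(accuracy_score(titanic["Survived"], predictions))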
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross-validation folds
# (much simpler than what we did before!)
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
0.7901234567901234
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
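# The FutureWarning above comes from LogisticRegression's default solver.
# A minimal way to silence it (a sketch; 'liblinear' was the pre-0.22 default,
# so the scores should be unchanged):
alg = LogisticRegression(random_state=1, solver='liblinear')
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())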
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm with the default parameters:
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place
# where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Compute the accuracy score for all the cross-validation folds
kf = model_selection.KFold(n_splits=3, random_state=1)
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
# More trees plus stricter split/leaf minimums to reduce overfitting
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score for all the cross-validation folds
kf = model_selection.KFold(n_splits=3, random_state=1)
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
0.8305274971941637
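# The two parameter settings above were picked by hand. A small grid search is
# the systematic version (a sketch, not from the original notebook; the grid
# values are illustrative):
from sklearn.model_selection import GridSearchCV
param_grid = {
    "n_estimators": [10, 50, 100],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
}
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
grid.fit(titanic[predictors], titanic["Survived"])
print(grid.best_params_, grid.best_score_)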
# Generating a family size column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# The .apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))
import re

# A function to get the title from a name.
def get_title(name):
    # Use a regular expression to search for a title. Titles always consist of
    # capital and lowercase letters, and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(pd.value_counts(titles))

# Map each title to an integer. Some titles are very rare, and are compressed
# into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
                 "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    titles[titles == k] = v
# Verify that we converted everything.
print(pd.value_counts(titles))
# Add in the title column.
titanic["Title"] = titles
Mr 517
Miss 182
Mrs 125
Master 40
Dr 7
Rev 6
Col 2
Major 2
Mlle 2
Ms 1
Sir 1
Don 1
Capt 1
Countess 1
Mme 1
Lady 1
Jonkheer 1
Name: Name, dtype: int64
1 517
2 183
3 125
4 40
5 7
6 6
7 5
10 3
8 3
9 2
Name: Name, dtype: int64
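# A more idiomatic alternative to the assignment loop above (a sketch reusing
# the same get_title function and title_mapping dict): Series.map applies the
# dict in one vectorized pass and yields the same codes.
titanic["Title"] = titanic["Name"].apply(get_title).map(title_mapping)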
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
              "FamilySize", "Title", "NameLength"]
# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Get the raw p-values for each feature, and transform the p-values into scores
scores = -np.log10(selector.pvalues_)
# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
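# To see what the pruned feature set buys, the reduced model can be scored the
# same way as the earlier cells (a sketch reusing the cv=3 convention):
scores = model_selection.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())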
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression, and
# everything with the gradient boosting classifier.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]],
    [LogisticRegression(random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
]

# Initialize the cross-validation folds
kf = KFold(n_splits=3, random_state=1)
predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The .astype(float) is necessary to convert the dataframe to all floats
        # and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the
    # final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 is mapped to a 1 prediction, and .5 or below to a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)
# Compute accuracy by comparing to the training data.
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
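# For reference (a sketch, not from the original notebook): when both models
# share a single feature set, sklearn's VotingClassifier implements the same
# probability-averaging idea ('soft' voting) without the manual loop. It cannot
# reproduce the per-model feature lists used above, so this is only an analogue.
from sklearn.ensemble import VotingClassifier
voter = VotingClassifier(estimators=[
    ("gbc", GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3)),
    ("lr", LogisticRegression(random_state=1)),
], voting='soft')
shared = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]
print(model_selection.cross_val_score(voter, titanic[shared].astype(float), titanic["Survived"], cv=3).mean())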
titles = titanic_test["Name"].apply(get_title)
# We're adding the Dona title to the mapping, because it's in the test set,
# but not the training set
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
                 "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2,
                 "Dona": 10}
for k, v in title_mapping.items():
    titles[titles == k] = v
titanic_test["Title"] = titles
# Check the counts of each unique title.
print(pd.value_counts(titanic_test["Title"]))
# Now, we add the family size column.
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]
predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     predictors],
    [LogisticRegression(random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
]

full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(titanic[predictors], titanic["Survived"])
    # Predict using the test dataset. We have to convert all the columns to
    # floats to avoid an error.
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:, 1]
    full_predictions.append(predictions)
# The gradient boosting classifier generates better predictions, so we weight it higher.
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
predictions
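# To turn the averaged probabilities into a Kaggle submission (a sketch,
# assuming titanic_test carries the usual PassengerId column; the filename is
# illustrative):
predictions[predictions <= .5] = 0
predictions[predictions > .5] = 1
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions.astype(int),
})
submission.to_csv("kaggle.csv", index=False)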
# Ensemble / random forest implementation: Titanic survival prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

data = pd.read_csv('data/data2138/train.csv')
data.head()
PassengerId  Survived  Pclass  Name  Sex