# The model outputs fractional probabilities, but we need 0 or 1, so convert them.
import numpy as np

predictions = np.concatenate(predictions, axis=0)
predictions[predictions > .5] = 1
predictions[predictions < .5] = 0
# A prediction is correct when it matches the "Survived" label; count the matches.
accuracy = len(predictions[predictions == titanic["Survived"]]) / len(predictions)
print(accuracy)
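# Equivalent shorthand (not in the original): the mean of a boolean array is the
# fraction of True values, i.e. the accuracy.
accuracy = (predictions == titanic["Survived"].values).mean()
print(accuracy)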
# Train a logistic regression model.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm.
alg = LogisticRegression(random_state=1)
# Compute the accuracy score across all the cross-validation folds
# (much simpler than what we did before!).
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one score per fold).
print(scores.mean())
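# For reference (not in the original): cross_val_score is shorthand for a manual
# loop like the sketch below. Note that for classifiers an integer cv uses
# stratified folds, so this plain KFold loop is only an approximate equivalent.
from sklearn.model_selection import KFold

fold_scores = []
for train, test in KFold(n_splits=3).split(titanic):
    alg.fit(titanic[predictors].iloc[train], titanic["Survived"].iloc[train])
    fold_scores.append(alg.score(titanic[predictors].iloc[test],
                                 titanic["Survived"].iloc[test]))
print(sum(fold_scores) / len(fold_scores))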
# Two more candidate features: family size and name length.
# Generating a familysize column.
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# The .apply method generates a new series.
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))
# The names also carry a feature -- the title -- so extract it.
import re

# A function to get the title from a name.
def get_title(name):
    # Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
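# Quick check on one name from the dataset (expected output: "Mr"):
print(get_title("Braund, Mr. Owen Harris"))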
import pandas as pd

# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(pd.value_counts(titles))

# Map each title to an integer. Some titles are very rare, and are compressed
# into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
                 "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    titles[titles == k] = v

# Verify that we converted everything.
print(pd.value_counts(titles))
# Add in the title column.
titanic["Title"] = titles
# Measure how important each feature is. The original note describes the
# shuffling idea (perturb one variable and see how much the result degrades --
# a large drop means an important feature), i.e. permutation importance; the
# code below instead scores each feature with a univariate F-test (SelectKBest).
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare",
              "Embarked", "FamilySize", "Title", "NameLength"]

# Perform feature selection.
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])

# Get the raw p-values for each feature, and transform the p-values into scores.
scores = -np.log10(selector.pvalues_)

# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
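# A minimal sketch of the shuffling idea itself (not in the original), using
# sklearn.inspection.permutation_importance (available since sklearn 0.22):
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

forest = RandomForestClassifier(random_state=1, n_estimators=50)
forest.fit(titanic[predictors], titanic["Survived"])
perm = permutation_importance(forest, titanic[predictors], titanic["Survived"],
                              n_repeats=10, random_state=1)
# A larger mean importance means shuffling that column hurt accuracy more.
for name, importance in sorted(zip(predictors, perm.importances_mean),
                               key=lambda pair: -pair[1]):
    print(name, round(importance, 4))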
predictors =["Pclass","Sex","Fare","Title"]
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
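# A quick sanity check of the trimmed feature set (not in the original), using
# the same 3-fold cross-validation as before:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())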
# Ensemble several different models and vote to improve accuracy; this helps
# most when the models differ from one another.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
import numpy as np

# The algorithms we want to ensemble. We're using the more linear predictors
# for the logistic regression, and everything with the gradient boosting classifier.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]],
    [LogisticRegression(random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
]

# Initialize the cross-validation folds.
kf = KFold(n_splits=3)

predictions = []
for train, test in kf.split(titanic):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold.
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold. The .astype(float) is necessary to
        # convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(
            titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the
    # final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 is assumed to be a 1 prediction, and below .5 a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)
# Compute accuracy by comparing to the training data.
accuracy = len(predictions[predictions == titanic["Survived"]]) / len(predictions)
print(accuracy)
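# Aside (not in the original): when both models share a single feature set,
# sklearn's VotingClassifier with voting="soft" does this probability averaging
# for you.
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

shared = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]
voter = VotingClassifier(
    estimators=[
        ("gbc", GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3)),
        ("lr", LogisticRegression(random_state=1)),
    ],
    voting="soft",  # average the predicted probabilities, as the loop above does
)
print(cross_val_score(voter, titanic[shared].astype(float),
                      titanic["Survived"], cv=3).mean())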
# Weight the models differently when combining them.
predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     predictors],
    [LogisticRegression(random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
]

full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(titanic[predictors], titanic["Survived"])
    # Predict using the test dataset. We have to convert all the columns to
    # floats to avoid an error.
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:, 1]
    full_predictions.append(predictions)

# The gradient boosting classifier generates better predictions, so we weight it higher.
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
predictions
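# To turn these probabilities into a Kaggle submission (not in the original;
# this assumes titanic_test still carries the standard PassengerId column):
predictions[predictions <= .5] = 0
predictions[predictions > .5] = 1
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions.astype(int),
})
submission.to_csv("kaggle_submission.csv", index=False)  # any filename works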