# 4. Linear regression with manual 3-fold cross-validation.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Simple, readily available numeric features used as model inputs.
predictors = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

# Initialize the linear-regression estimator.
alg = LinearRegression()

# Split the samples into 3 equal folds for cross-validation.
# NOTE: random_state is omitted — it has no effect when shuffle=False and
# passing it raises an error in sklearn >= 0.24 (it only caused the
# FutureWarning seen in the original run).
kf = KFold(n_splits=3, shuffle=False)

predictions = []
for train, test in kf.split(data_train):
    # Predictor rows restricted to this training fold.
    train_predictors = data_train[predictors].iloc[train, :]
    # Target values for the same training fold.
    train_target = data_train["Survived"].iloc[train]
    # Fit on the training fold, then predict the held-out fold.
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(data_train[predictors].iloc[test, :])
    predictions.append(test_predictions)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
import numpy as np

# The fold predictions are three separate numpy arrays; concatenate them
# into a single array along axis 0 (each array has only one axis).
predictions = np.concatenate(predictions, axis=0)

# Map the continuous regression outputs to class labels
# (the only possible outcomes are 1 and 0).
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0

# Fraction of predictions matching the true labels.
accuracy = sum(predictions == data_train["Survived"]) / len(predictions)
print("准确率为: ", accuracy)
# 5. Add two more features (Sex and Embarked) and predict with logistic regression.
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# max_iter raised from the default (100): the lbfgs solver did not converge
# on this data at the default, producing the ConvergenceWarnings seen in the
# original run.
LogRegAlg = LogisticRegression(random_state=1, max_iter=1000)
re = LogRegAlg.fit(data_train[predictors], data_train["Survived"])

# Cross-validated accuracy over 3 folds — much simpler than the manual loop.
scores = model_selection.cross_val_score(
    LogRegAlg, data_train[predictors], data_train["Survived"], cv=3)
# Mean of the per-fold scores (one score per fold).
print("准确率为: ", scores.mean())
准确率为: 0.7957351290684623
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
# 6. Random forest classifier.
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# 10 decision trees; stopping conditions: a node needs at least 2 samples
# to split, and a leaf may hold as few as 1 sample.
alg = RandomForestClassifier(random_state=1, n_estimators=10,
                             min_samples_split=2, min_samples_leaf=1)

# NOTE: random_state is omitted — it is invalid with shuffle=False in
# sklearn >= 0.24 (it only caused the FutureWarning in the original run).
kf = model_selection.KFold(n_splits=3, shuffle=False)

scores = model_selection.cross_val_score(
    alg, data_train[predictors], data_train["Survived"], cv=kf)
print(scores)
# Mean of the per-fold scores (one score per fold).
print(scores.mean())
[0.75420875 0.8013468 0.8013468 ]
0.7856341189674523
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
# Grow the forest to 30 decision trees and switch to 10-fold cross-validation.
# Stopping conditions unchanged: min 2 samples to split, min 1 sample per leaf.
alg = RandomForestClassifier(random_state=1, n_estimators=30,
                             min_samples_split=2, min_samples_leaf=1)

# random_state omitted: invalid with shuffle=False in sklearn >= 0.24
# (it only caused the FutureWarning in the original run).
kf = model_selection.KFold(n_splits=10, shuffle=False)

scores = model_selection.cross_val_score(
    alg, data_train[predictors], data_train["Survived"], cv=kf)
print(scores)
# Mean of the per-fold scores (one score per fold).
print(scores.mean())
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/sklearn/model_selection/_split.py:296: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
FutureWarning
[0.74444444 0.80898876 0.78651685 0.82022472 0.85393258 0.85393258
0.7752809 0.7752809 0.84269663 0.85393258]
0.8115230961298376
代码中数据集:https://github.com/jsusu/Titanic_passenger-survival-prediction/tree/master/titanic_data# Titanic乘客生存预测1#数据分析库import pandas as pd#科学计算库import numpy as np from pandas import Series,DataFra...