处理数据:
import pandas as pd
# Load the Kaggle Titanic training set.
titanic = pd.read_csv("C:/Users/15520/Desktop/AI/kaggle/titanic/train.csv")
# NOTE: titanic.describe() only summarises numeric columns, so text columns
# such as "Sex" do not show up in its output.

# "Age" has missing values; fill them with the column median.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Encode "Sex" as integers. Mapping the whole column at once (rather than
# overwriting individual string cells with ints through boolean masks)
# yields a genuinely numeric column and does not rely on the text column
# accepting non-string values. astype(int) fails loudly if a label was
# left unmapped instead of silently carrying NaN into the model.
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1}).astype(int)

# Encode "Embarked" as integers; missing ports are filled with "S" first.
titanic["Embarked"] = (
    titanic["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2}).astype(int)
)
print(titanic["Embarked"].unique())
通过sklearn进行线性回归计算:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Feature columns used for prediction.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
alg = LinearRegression()
# Split the training set into three folds for cross-validation.
# random_state is deliberately not passed: it has no effect unless
# shuffle=True, and newer scikit-learn releases reject that combination
# with a ValueError. Shuffling is NOT enabled because the fold predictions
# are concatenated below and must stay in the original row order to be
# comparable with titanic["Survived"].
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic):
    # Features of the training rows of this fold.
    train_predictors = titanic[predictors].iloc[train, :]
    # Target ("Survived") of the same training rows.
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    # Predict a continuous score for every held-out row of this fold.
    test_prediction = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_prediction)

# Join the per-fold arrays into one array covering every row, in order.
predictions = np.concatenate(predictions, axis=0)
# Threshold the regression output into class labels: 1 above 0.5, else 0.
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
# Two ways of computing the final accuracy; both rely on the same idea:
# count the rows where the prediction equals the true label and divide by
# the number of rows.
#
# NOTE: a formula often seen online,
#     sum(predictions[predictions == titanic["Survived"]])
# sums the *values* of the matching predictions, so it only counts correct
# predictions of 1 and ignores correct predictions of 0. The original
# author reports that it yields only about 26% and considers it wrong.

# Compare positionally so the result does not depend on the frame's index.
actual = titanic["Survived"].to_numpy()

# Solution 1: vectorised count of matches.
accuracy = (predictions == actual).sum() / len(predictions)
print(accuracy)

# Solution 2: explicit element-by-element count.
num = sum(1 for predicted, truth in zip(predictions, actual) if predicted == truth)
print(num / len(predictions))
结果:
两种解法得到的最终准确率都为0.7833894500561167

被折叠的 条评论
为什么被折叠?



