泰坦尼克号生存预测,kaggle得分0.77,排名4000多,实在没有办法提高排名了。
逻辑回归和逻辑回归训练集效果有差距,但是kaggle得分没什么区别。未做特征工程时候KNN算法0.73。
改天试试神经网络,估计效果也不怎么样。
刚学习机器学习不久,只是为了自己看。
# -*- coding: utf-8 -*-
# @Time : 2020/11/2 22:17
# @Author : spore
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Widen pandas console output so PyCharm prints DataFrames without truncation.
for _opt, _val in (('display.max_columns', 1000),
                   ('display.width', 1000),
                   ('display.max_colwidth', 1000)):
    pd.set_option(_opt, _val)
# ---- Load data and fill missing values ----
# Read the train and test sets, then stack them into one frame so the
# feature engineering below is applied to both consistently.
df = pd.read_csv(r"D:\test\train.csv")
pr = pd.read_csv(r"D:\test\test.csv")
# DataFrame.append() was deprecated and removed in pandas 2.0 -- use
# pd.concat, which is the documented replacement and behaves the same here.
df = pd.concat([df, pr], ignore_index=True, sort=True)
# (Exploratory prints -- head/shape/info/isnull/describe -- omitted; they were
# one-off inspection code.)
# Age / Fare: fill missing values with the column mean.
# NOTE(review): df's mean is computed over train+test combined, which is mild
# test-set leakage -- confirm this is acceptable for the competition.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Fare'] = df["Fare"].fillna(df["Fare"].mean())
pr['Age'] = pr['Age'].fillna(pr['Age'].mean())
pr['Fare'] = pr["Fare"].fillna(pr["Fare"].mean())
# Embarked: only two values missing; fill with the mode "S"
# (determined earlier via df["Embarked"].value_counts()).
df['Embarked'] = df["Embarked"].fillna("S")
pr['Embarked'] = pr["Embarked"].fillna("S")
# Cabin: mostly missing -- mark unknown cabins with "U" (Unknown).
df['Cabin'] = df["Cabin"].fillna("U")
pr['Cabin'] = pr["Cabin"].fillna("U")
# Survived is the label (NaN on the appended test rows) and stays untouched.
# ---- Encode string features as integers ----
# Defect fixed: the original fitted a *separate* LabelEncoder on pr, so the
# same Ticket/Sex value could map to different integer codes in df and pr.
# df already contains every row (train + test were concatenated above), so
# fit once on df and reuse the same fitted encoder to transform pr.
for name in ['Ticket', "Sex"]:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    pr[name] = encoder.transform(pr[name])
# ---- Feature engineering ----
# One-hot encode the port of embarkation, then drop the source column.
df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
df = df.drop('Embarked', axis=1)
# One-hot encode the passenger class, then drop the source column.
df = pd.concat([df, pd.get_dummies(df['Pclass'], prefix='Pclass')], axis=1)
df = df.drop('Pclass', axis=1)
# Reduce each cabin number to its leading deck letter, then one-hot encode.
df['Cabin'] = df['Cabin'].str[0]
df = pd.concat([df, pd.get_dummies(df['Cabin'], prefix='Cabin')], axis=1)
df = df.drop('Cabin', axis=1)
# Family-size features: passenger + parents/children + siblings/spouses.
familyDF = pd.DataFrame()
familyDF['FamilySize'] = df['Parch'] + df['SibSp'] + 1
# Bucket the total into single / small (2-4) / large (5+) indicator columns.
familyDF['Family_Single'] = familyDF['FamilySize'].map(lambda size: int(size == 1))
familyDF['Family_Small'] = familyDF['FamilySize'].map(lambda size: int(2 <= size <= 4))
familyDF['Family_Large'] = familyDF['FamilySize'].map(lambda size: int(size >= 5))
df = pd.concat([df, familyDF], axis=1)
#提取头衔
#定义函数:从姓名中提取头衔
def gettitle(name):
    """Return the title from a 'Surname, Title. Given names' string, e.g. 'Mr'."""
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()
# ---- Title feature: extract, group, and one-hot encode ----
titleDF = pd.DataFrame()
titleDF['Title'] = df['Name'].map(gettitle)
# Group the raw titles into broader categories; the flat title->group dict is
# derived from this table (same mapping as before, just built from groups).
_TITLE_GROUPS = {
    'Officer': ('Capt', 'Col', 'Major', 'Dr', 'Rev'),
    'Royalty': ('Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'),
    'Mrs': ('Mme', 'Ms', 'Mrs'),
    'Miss': ('Mlle', 'Miss'),
    'Mr': ('Mr',),
    'Master': ('Master',),
}
title_mapDict = {title: group
                 for group, titles in _TITLE_GROUPS.items()
                 for title in titles}
# Map raw titles to their category and one-hot encode the result.
titleDF['Title'] = titleDF['Title'].map(title_mapDict)
titleDF = pd.get_dummies(titleDF['Title'], prefix='Title')
# Attach the dummies and drop the raw Name column.
df = pd.concat([df, titleDF], axis=1)
df = df.drop('Name', axis=1)
#查看结果
# print(df.head(5))
# ---- Split the combined frame back into train/test parts ----
# Rows 0..890 are the original training set, rows 891..1308 the Kaggle test set.
features = df.drop("Survived", axis=1)
# NOTE(review): PassengerId is still present among the features here --
# confirm that feeding it to the model is intentional.
train_X = features.loc[0:890, :]
train_y = df.loc[0:890, "Survived"]
pred_X = features.loc[891:1308, :]
print(pred_X.head(3))
# Hold out 10% of the training rows for validation.
x_train, x_test, y_train, y_test = train_test_split(
    train_X, train_y, test_size=0.1, random_state=666)
x_train, x_test = np.array(x_train), np.array(x_test)
y_train, y_test = np.array(y_train), np.array(y_test)
# ##寻找最优K值
# train1=[]
# k1=[]
# test1=[]
# #循环不同K值对模型的影响
# for k in range(1,30):
# k1.append(k)
# # #KNN算法
# #初始化KNN
# model = KNeighborsClassifier(n_neighbors= k)
# #训练模型
# model.fit(x_train, y_train)
# #训练得分
# train=model.score(x_train, y_train)
# train1.append(train)
# #测试得分
# test=model.score(x_test, y_test)
# test1.append(test)
# #测试得分
# print('test={};k={}'.format(test,k))
# #绘制K的取值,测试得分
# plt.plot( k1, test1)
# plt.show()
#
#逻辑回归算法
# model = LogisticRegression ()
# model.fit(x_test,y_test)
# print(model.score(x_test, y_test))
# fscore=model.score(x_test, y_test)
# print('\n模型正确率为:',fscore)
# pred_y=model.predict(pred_X)
#xgboot算法
# params ={'learning_rate': 0.01,
# 'max_depth': 5,
# 'num_boost_round':20,
# 'objective': 'multi:softmax',
# 'random_state': 27,
# 'silent':0,
# 'num_class':32,
# }
# model = xgb.train(params,xgb.DMatrix(x_train, y_train))#,num_boost_round=20)
# pred_y=model.predict(pred_X)
# ---- Random forest: train, validate, predict, write submission ----
# random_state pins the forest's bootstrap sampling so the submission file is
# reproducible run-to-run (the original left it unset and was nondeterministic).
model = RandomForestClassifier(n_estimators=100, random_state=666)
model.fit(x_train, y_train)
# Accuracy on the 10% hold-out split.
fscore = model.score(x_test, y_test)
print('\n模型正确率为:', fscore)
pred_y = model.predict(pred_X)
# Survived was read back as float (the column held NaN on test rows);
# Kaggle expects integer 0/1, so cast before writing.
pred_y = pred_y.astype(int)
passenger_id = pr.loc[0:417, 'PassengerId']
predDF = pd.DataFrame({'PassengerId': passenger_id, 'Survived': pred_y})
# Write the submission CSV (no index column).
predDF.to_csv(r'D:\test\Titanic_fucklog.csv', index=False)