泰坦尼克号获救问题
1 逻辑回归分析
1 代码
logistic regression.py
import pandas as pd

# Load the Titanic passenger table; the CSV is expected in the working directory.
data = pd.read_csv("Titanic.csv")
# Handy inspection calls while exploring the data:
#   data.info(), data.describe(), data.head(), data.isnull().any()
# Original author's observation: Age, Cabin and Embarked contain missing values.

# Fill missing ages with the column mean. The filled column is assigned back
# instead of calling fillna(inplace=True) on the `data['Age']` selection:
# the in-place form acts on an intermediate object and does not reliably
# update `data` (it is deprecated and is a no-op under pandas Copy-on-Write).
data['Age'] = data['Age'].fillna(data['Age'].mean())

# Fill missing embarkation ports with 'S'.
# NOTE(review): the value_counts() check that used to sit here suggests 'S'
# was picked as the most frequent port -- confirm against the data.
data['Embarked'] = data['Embarked'].fillna('S')

# Drop Cabin together with PassengerId, Name and Ticket; the remaining
# columns form the modelling table.
std_data = data.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1)
# Sanity check: std_data.isnull().any() should now report no missing values.
# Sex and Embarked hold string labels; convert them to numeric codes.
# Series.map builds a fresh numeric column in one pass, instead of writing
# integers cell-by-cell into a string-typed column (which leaves the column
# non-numeric and can be rejected by string dtypes).
# Any label missing from a mapping becomes NaN rather than staying a string.
std_data['Sex'] = std_data['Sex'].map({"male": 0, "female": 1})
std_data['Embarked'] = std_data['Embarked'].map({"S": 0, "C": 1, "Q": 2})
# Inspect with std_data.head() to verify the encoded columns.
# Split the table into the feature matrix x and the label frame y.
# `.ix` was removed from pandas (1.0); `.loc` accepts the same boolean
# column masks. y deliberately stays a one-column DataFrame, as before.
x = std_data.loc[:, std_data.columns != 'Survived']
y = std_data.loc[:, std_data.columns == 'Survived']
# Inspect with x.head() / y.head().
# Hold out 30% of the rows as a test set; random_state is fixed so the
# split is reproducible between runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)
# Inspect with x_train.shape, x_test.shape, y_train.shape, y_test.shape.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
import numpy as np
# Candidate parameter values to try, one per power of ten from 0.01 to 100.
# NOTE(review): presumably LogisticRegression's `C`; the code that consumes
# this list is outside the visible part of the file -- confirm.
c_param_range = [
    0.01,
    0.1,
    1,
    10,
    100,
]
fold = K