泰坦尼克号

这篇博客介绍了如何使用Python的pandas库对泰坦尼克号乘客数据进行预处理,包括删除姓名列、用均值填充缺失值、对性别和登船港口进行特征编码,以及使用MinMaxScaler将特征归一化到[0, 1]区间。然后,利用TensorFlow(Keras)构建了一个神经网络模型,并在训练过程中划分出验证集来观察训练效果。最后,在测试集上评估模型性能并进行预测,同时展示了如何添加新的乘客数据进行生存概率预测。
摘要由CSDN通过智能技术生成

import numpy as np
import pandas as pd
from sklearn import preprocessing

# 1. Feature selection
# 1.1 Load the data and take a first look
# NOTE: bare expressions such as `all_df[:2]` only display a value when run
# as a notebook cell; executed as a plain script they have no visible effect.

all_df = pd.read_excel('titanic3.xls')
all_df[:2]

# Keep the label column ('survived', listed first) plus the columns this
# walkthrough works with.
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]
all_df[:2]
# Work on a copy without 'name'; all_df itself keeps the column.
df = all_df.drop(['name'], axis=1)
# Per-column count of missing values.
all_df.isnull().sum()
# Fill missing ages and fares with the respective column mean.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
# Encode sex numerically: female -> 0, male -> 1.
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
# One-hot encode the 'embarked' column.
x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])
x_OneHot_df[:2]
# Convert to a plain array: column 0 is the label ('survived'), the
# remaining columns are the features.
ndarray = x_OneHot_df.values
ndarray.shape
ndarray[:2]
Label = ndarray[:, 0]
Features = ndarray[:, 1:]
from sklearn import preprocessing

# Rescale every feature column to the [0, 1] range.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaledFeatures = minmax_scale.fit_transform(Features)
scaledFeatures[:2]
import numpy

# Random boolean mask with one entry per row; an entry is True when its
# uniform random draw is below 0.8, so roughly 80% of rows go to training.
msk = numpy.random.rand(len(all_df)) < 0.8
msk
train_df = all_df[msk]
# The complement of the mask forms the test set.
test_df = all_df[~msk]
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))
from sklearn import preprocessing


def PreprocessData(raw_df):
    """Convert a raw passenger DataFrame into scaled features and labels.

    Args:
        raw_df: DataFrame containing at least the columns 'name', 'age',
            'fare', 'sex' and 'embarked'. After 'name' is dropped, the
            first remaining column is taken as the label. The input frame
            is not modified.

    Returns:
        A ``(scaledFeatures, Label)`` tuple: a float64 2-D array with every
        feature column rescaled to [0, 1], and a float64 1-D label array.

    Note:
        The fill-in means and the min-max scaler are computed from the rows
        passed in, so each call scales its data independently of other calls.
    """
    df = raw_df.drop('name', axis=1)
    # Fill missing ages and fares with the mean of the rows passed in.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    # Encode sex numerically: female -> 0, male -> 1.
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])

    # Recent pandas versions emit boolean dummy columns; mixed with numeric
    # columns, `.values` then yields an object-dtype array, which leaves the
    # label vector in a dtype Keras cannot convert to a tensor. Casting to
    # float64 gives a uniform numeric matrix regardless of pandas version.
    ndarray = x_OneHot_df.values.astype('float64')
    Label = ndarray[:, 0]
    Features = ndarray[:, 1:]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)

    return scaledFeatures, Label


# Preprocess the training and test splits with separate PreprocessData calls.
train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)
train_Features[:2]
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected binary classifier: 9 inputs -> 40 (relu) -> 30 (relu)
# -> 1 (sigmoid).
# input_dim=9 has to match the number of feature columns produced by
# PreprocessData — presumably 6 numeric columns plus 3 'embarked' dummies;
# confirm against the data.
model = Sequential()
model.add(Dense(units=40, input_dim=9, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=30, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split=0.1 asks Keras to hold out 10% of the training data for
# validation; verbose=2 prints one line per epoch.
train_history = model.fit(x=train_Features,
                          y=train_Label,
                          validation_split=0.1,
                          epochs=30,
                          batch_size=30,
                          verbose=2)
import matplotlib.pyplot as plt


def show_train_history(train_history, train, validation):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        train_history: Object exposing a ``history`` mapping from metric
            name to a sequence of per-epoch values.
        train: Key of the training curve; also used as the y-axis label.
        validation: Key of the validation curve.
    """
    curves = train_history.history
    # Draw the training curve first, then the validation curve, so the
    # legend entries below line up with the plotted lines.
    for metric_name in (train, validation):
        plt.plot(curves[metric_name])
    plt.title('Train history')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


# tf.keras 2.x records the metric under 'accuracy' / 'val_accuracy', while
# older releases used 'acc' / 'val_acc'. Use whichever key this run produced
# instead of hard-coding one and risking a KeyError.
acc_key = 'accuracy' if 'accuracy' in train_history.history else 'acc'
show_train_history(train_history, acc_key, 'val_' + acc_key)
# evaluate() returns [loss, accuracy] for the metrics passed to compile().
scores = model.evaluate(x=test_Features, y=test_Label)
scores[1]
# Two hand-made passengers, in the same column order as all_df.
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.00, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.00, 'S'])
JR_df = pd.DataFrame([list(Jack), list(Rose)],
                     columns=['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'])
# Append them to the full data set so they are preprocessed together with it.
all_df = pd.concat([all_df, JR_df])
all_df[-3:]
all_Features, Label = PreprocessData(all_df)
all_probability = model.predict(all_Features)
all_probability[:10]
# Attach the predictions as a new last column of all_df. The frame is used
# directly rather than through an alias named `pd`, which would rebind the
# pandas module alias and break any later pandas call.
all_df.insert(len(all_df.columns), 'probability', all_probability)
all_df[-2:]

import numpy as np
import pandas as pd
from sklearn import preprocessing

# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

all_df = pd.read_excel('titanic3.xls')

# 1.2 Feature selection
# Keep the label column ('survived', listed first) plus the columns used below.
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]

# 2. Data preprocessing
# 2.1 Split the data into training and test sets
# Draw one uniform random number per row and turn it into a boolean: True
# when the draw is below 0.8, False otherwise. Overall roughly 80% of the
# entries are True and 20% are False.
msk = np.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
# The test set must be the complement of the mask; selecting with `msk`
# again would make it identical to the training set and the evaluation
# would be measured on data the model was trained on.
test_df = all_df[~msk]

# 2.2 数据预处理
def PreprocessData(raw_df):
    """Convert a raw passenger DataFrame into scaled features and labels.

    Args:
        raw_df: DataFrame containing at least the columns 'name', 'age',
            'fare', 'sex' and 'embarked'. After 'name' is dropped, the
            first remaining column is taken as the label. The input frame
            is not modified.

    Returns:
        A ``(scaledFeatures, Label)`` tuple: a float64 2-D array with every
        feature column rescaled to [0, 1], and a float64 1-D label array.

    Note:
        The fill-in means and the min-max scaler are computed from the rows
        passed in, so each call scales its data independently of other calls.
    """
    df = raw_df.drop('name', axis=1)
    # Fill missing ages and fares with the mean of the rows passed in.
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    # Encode sex numerically: female -> 0, male -> 1.
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])
    # Recent pandas versions emit boolean dummy columns; mixed with numeric
    # columns, `.values` then yields an object-dtype array, which leaves the
    # label vector in a dtype Keras cannot convert to a tensor. Casting to
    # float64 gives a uniform numeric matrix regardless of pandas version.
    ndarray = x_OneHot_df.values.astype('float64')
    Label = ndarray[:, 0]
    Features = ndarray[:, 1:]
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    return scaledFeatures, Label

# Preprocess the training and test splits with separate PreprocessData calls.
train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)

# 3. Build the model
# NOTE: Dropout is imported but not used in the code shown here.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected binary classifier: 9 inputs -> 40 (relu) -> 30 (relu)
# -> 1 (sigmoid).
# input_dim=9 has to match the number of feature columns produced by
# PreprocessData — presumably 6 numeric columns plus 3 'embarked' dummies;
# confirm against the data.
model = Sequential()
model.add(Dense(units=40, input_dim=9, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=30, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# 4. Train the model
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
# validation_split=0.1 asks Keras to hold out 10% of the training data for
# validation; verbose=2 prints one line per epoch.
train_history = model.fit(x=train_Features,
                         y=train_Label,
                         validation_split=0.1,
                         epochs=30,
                         batch_size=30,
                         verbose=2)

import matplotlib.pyplot as plt
def show_train_history(train_history, train, validation):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        train_history: Object exposing a ``history`` mapping from metric
            name to a sequence of per-epoch values.
        train: Key of the training curve; also used as the y-axis label.
        validation: Key of the validation curve.
    """
    curves = train_history.history
    # Draw the training curve first, then the validation curve, so the
    # legend entries below line up with the plotted lines.
    for metric_name in (train, validation):
        plt.plot(curves[metric_name])
    plt.title('Train history')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# tf.keras 2.x records the metric under 'accuracy' / 'val_accuracy', while
# older releases used 'acc' / 'val_acc'. Use whichever key this run produced
# instead of hard-coding one and risking a KeyError.
acc_key = 'accuracy' if 'accuracy' in train_history.history else 'acc'
show_train_history(train_history, acc_key, 'val_' + acc_key)

# 5. Evaluation and prediction
# 5.1 Evaluation
# evaluate() returns [loss, accuracy] for the metrics passed to compile().
scores = model.evaluate(x=test_Features, y=test_Label)

# 5.2 Construct new passenger records and predict
# Two hand-made passengers, in the same column order as all_df.
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.00, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.00, 'S'])

JR_df = pd.DataFrame([list(Jack), list(Rose)],
                     columns=['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'])
# Append them to the full data set so they are preprocessed together with it.
all_df = pd.concat([all_df, JR_df])
print(all_df[-3:])

all_Features, Label = PreprocessData(all_df)
all_probability = model.predict(all_Features)
print(all_probability[:10])

# Attach the predictions as a new last column of all_df. The frame is used
# directly rather than through an alias named `pd`, which would rebind the
# pandas module alias and break any later pandas call.
all_df.insert(len(all_df.columns), 'probability', all_probability)
print(all_df[-2:])

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值