import numpy as np
import pandas as pd
from sklearn import preprocessing
# 1. Feature selection
# 1.1 Load and inspect the data
# Load the Titanic spreadsheet. The bare expressions below are notebook-style
# previews; their values are discarded when this runs as a plain script.
all_df = pd.read_excel('titanic3.xls')
all_df.head(2)

# Keep only the label column and the columns used as model inputs.
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]
all_df.head(2)

# Work on a copy without the 'name' column.
df = all_df.drop(columns=['name'])
all_df.isnull().sum()

# Fill missing 'age' and 'fare' values with the respective column mean.
age_mean = df['age'].mean()
fare_mean = df['fare'].mean()
df = df.fillna({'age': age_mean, 'fare': fare_mean})

# Encode 'sex' as an integer and one-hot encode 'embarked'.
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])
x_OneHot_df.head(2)

# Column 0 ('survived') is the label; the remaining columns are the features.
ndarray = x_OneHot_df.values
ndarray.shape
ndarray[:2]
Label, Features = ndarray[:, 0], ndarray[:, 1:]

# Rescale every feature column into the [0, 1] range.
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaledFeatures = minmax_scale.fit_transform(Features)
scaledFeatures[:2]

# Random row mask: rows whose uniform draw is below 0.8 go to the training
# set, the rest to the test set.
import numpy
msk = numpy.random.rand(all_df.shape[0]) < 0.8
msk
train_df, test_df = all_df[msk], all_df[~msk]
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))
from sklearn import preprocessing
def PreprocessData(raw_df):
    """Convert a raw passenger DataFrame into scaled features and labels.

    Drops the 'name' column, fills missing 'age' and 'fare' values with the
    column mean, encodes 'sex' as 0 (female) / 1 (male), one-hot encodes
    'embarked', and min-max scales the feature columns into [0, 1].

    Args:
        raw_df: DataFrame containing 'name', 'age', 'fare', 'sex' and
            'embarked' columns. After 'name' is dropped, the first remaining
            column is taken as the label.

    Returns:
        A ``(scaledFeatures, Label)`` tuple of float64 NumPy arrays: the
        scaled feature matrix and the 1-D label vector.
    """
    df = raw_df.drop('name', axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])
    # Force a numeric array: when get_dummies yields bool indicator columns,
    # mixing them with int/float columns makes `.values` an object array, and
    # the label slice taken from it cannot be fed to Keras.
    ndarray = x_OneHot_df.values.astype('float64')
    Label = ndarray[:, 0]
    Features = ndarray[:, 1:]
    # NOTE: the scaler is re-fit on every call, so each DataFrame passed in is
    # scaled against its own min/max.
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    return scaledFeatures, Label
# Preprocess the two splits independently.
train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)
train_Features[:2]  # notebook-style preview; discarded when run as a script

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected network: 9 inputs -> 40 -> 30 -> 1 sigmoid output.
model = Sequential([
    Dense(units=40, input_dim=9, kernel_initializer='uniform', activation='relu'),
    Dense(units=30, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 30 epochs, holding out 10% of the training data for validation.
train_history = model.fit(
    train_Features,
    train_Label,
    batch_size=30,
    epochs=30,
    verbose=2,
    validation_split=0.1,
)
import matplotlib.pyplot as plt
def show_train_history(train_history, train, validation):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        train_history: Object with a ``history`` mapping from metric name to
            a per-epoch sequence of values (as returned by ``model.fit``).
        train: Key of the training metric in ``train_history.history``.
        validation: Key of the validation metric in ``train_history.history``.
    """
    curves = train_history.history
    # Draw the training curve first so the legend order matches.
    for metric_name in (train, validation):
        plt.plot(curves[metric_name])
    plt.title('Train history')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# The accuracy metric is recorded under 'accuracy' by current tf.keras and
# under 'acc' by older releases; use whichever key this run produced.
acc_key = 'accuracy' if 'accuracy' in train_history.history else 'acc'
show_train_history(train_history, acc_key, 'val_' + acc_key)

# Evaluate on the held-out split; scores[1] is the accuracy metric.
scores = model.evaluate(x=test_Features, y=test_Label)
scores[1]  # notebook-style preview; discarded when run as a script

# Build two synthetic passengers and append them to the full data set.
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.00, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.00, 'S'])
JR_df = pd.DataFrame(
    [list(Jack), list(Rose)],
    columns=['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'])
all_df = pd.concat([all_df, JR_df])
all_df[-3:]

# Predict the survival probability for every row, including the two new ones.
all_Features, Label = PreprocessData(all_df)
all_probability = model.predict(all_Features)
all_probability[:10]

# Insert into all_df directly. The original bound the DataFrame to the name
# `pd`, which shadowed the pandas module for any code that followed.
all_df.insert(len(all_df.columns), 'probability', all_probability)
all_df[-2:]
import numpy as np
import pandas as pd
from sklearn import preprocessing
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
all_df = pd.read_excel('titanic3.xls')
# 1.2 Feature selection
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]
# 2. Data preprocessing
# 2.1 Split the data into training and test sets
# Draw one uniform random number per row and turn it into a boolean: True when
# the draw is below 0.8, False otherwise. Roughly 80% of the rows are True
# (training set) and 20% are False (test set).
msk = np.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
# The test set is the complement of the mask; selecting with `msk` again
# would make it identical to the training set.
test_df = all_df[~msk]
# 2.2 Data preprocessing
def PreprocessData(raw_df):
    """Convert a raw passenger DataFrame into scaled features and labels.

    Drops the 'name' column, fills missing 'age' and 'fare' values with the
    column mean, encodes 'sex' as 0 (female) / 1 (male), one-hot encodes
    'embarked', and min-max scales the feature columns into [0, 1].

    Args:
        raw_df: DataFrame containing 'name', 'age', 'fare', 'sex' and
            'embarked' columns. After 'name' is dropped, the first remaining
            column is taken as the label.

    Returns:
        A ``(scaledFeatures, Label)`` tuple of float64 NumPy arrays: the
        scaled feature matrix and the 1-D label vector.
    """
    df = raw_df.drop('name', axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df, columns=['embarked'])
    # Force a numeric array: when get_dummies yields bool indicator columns,
    # mixing them with int/float columns makes `.values` an object array, and
    # the label slice taken from it cannot be fed to Keras.
    ndarray = x_OneHot_df.values.astype('float64')
    Label = ndarray[:, 0]
    Features = ndarray[:, 1:]
    # NOTE: the scaler is re-fit on every call, so each DataFrame passed in is
    # scaled against its own min/max.
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    return scaledFeatures, Label
# Preprocess the two splits independently.
train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)

# 3. Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected network: 9 inputs -> 40 -> 30 -> 1 sigmoid output.
model = Sequential([
    Dense(units=40, input_dim=9, kernel_initializer='uniform', activation='relu'),
    Dense(units=30, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])

# 4. Train the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# 30 epochs, with 10% of the training data held out for validation.
train_history = model.fit(
    train_Features,
    train_Label,
    batch_size=30,
    epochs=30,
    verbose=2,
    validation_split=0.1,
)
import matplotlib.pyplot as plt
def show_train_history(train_history, train, validation):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        train_history: Object with a ``history`` mapping from metric name to
            a per-epoch sequence of values (as returned by ``model.fit``).
        train: Key of the training metric in ``train_history.history``.
        validation: Key of the validation metric in ``train_history.history``.
    """
    curves = train_history.history
    # Draw the training curve first so the legend order matches.
    for metric_name in (train, validation):
        plt.plot(curves[metric_name])
    plt.title('Train history')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# The accuracy metric is recorded under 'accuracy' by current tf.keras and
# under 'acc' by older releases; use whichever key this run produced.
acc_key = 'accuracy' if 'accuracy' in train_history.history else 'acc'
show_train_history(train_history, acc_key, 'val_' + acc_key)
# 5. Evaluation and prediction
# 5.1 Evaluate on the test set
scores = model.evaluate(x=test_Features, y=test_Label)
# 5.2 Build synthetic passengers and predict their survival probability
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.00, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.00, 'S'])
JR_df = pd.DataFrame(
    [list(Jack), list(Rose)],
    columns=['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'])
all_df = pd.concat([all_df, JR_df])
print(all_df[-3:])
all_Features, Label = PreprocessData(all_df)
all_probability = model.predict(all_Features)
print(all_probability[:10])
# Insert into all_df directly. The original bound the DataFrame to the name
# `pd`, which shadowed the pandas module for any code that followed.
all_df.insert(len(all_df.columns), 'probability', all_probability)
print(all_df[-2:])