上一篇:Tensorflow入门八-生成式对抗网络GAN(百度云源码数据资源)https://blog.csdn.net/qq_36187544/article/details/89919656
Keras是TensorFlow的高阶API模块,较新版本的TensorFlow已直接集成了Keras(即tf.keras),不用额外安装keras包。使用Keras需要写的代码更少,开发更快,但相应地,修改底层实现会更困难。
目录
资源说明
链接:https://pan.baidu.com/s/1s8lks4ZOXShDA1aeSsjRig
提取码:e44s
data/titanic3.xls是数据资源,也可以通过运行downloadData.py获取(若数据已存在,再次执行只会输出数据已存在)
main.py是运行文件
源代码
利用pandas做数据处理,利用sklearn模块做标准化处理,用keras构建模型,并对模型进行评估、保存,可视化损失函数和准确率,最后对Jack和Rose的生存率做预测(被注释的print都可以取消注释,看一下效果):
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf
import matplotlib.pyplot as plt
#数据处理
#读取数据文件
from sympy import preorder_traversal
# Location of the Titanic passenger spreadsheet read by this script.
data_file_path = "data/titanic3.xls"
df_data = pd.read_excel(data_file_path)
# Uncomment to print summary statistics of the raw data:
# print(df_data.describe())

# Columns kept for modelling. 'survived' is the label; 'name' is carried
# along here and dropped later inside prepare_data.
selected_cols = [
    'survived',
    'name',
    'pclass',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
    'embarked',
]
selected_df_data = df_data[selected_cols]
# Uncomment to list the rows that contain missing values:
# print(selected_df_data[selected_df_data.isnull().values==True])
# Uncomment to show the first three selected rows:
# print(selected_df_data[:3])
def prepare_data(df_data):
    """Turn a passenger DataFrame into scaled features and a label vector.

    The 'name' column is dropped, missing 'age' and 'fare' values are
    replaced by the column mean, missing 'embarked' values by "S", and the
    'sex' and 'embarked' strings are mapped to integer codes. The first
    remaining column is returned as the label; all following columns are
    min-max scaled to the range [0, 1] and returned as the features.

    Args:
        df_data: DataFrame holding at least the columns 'name', 'age',
            'fare', 'embarked' and 'sex', with the label as its first
            column once 'name' is removed.

    Returns:
        A tuple ``(norm_features, label)`` of NumPy arrays.
    """
    # drop() returns a new frame, so the caller's DataFrame is not modified.
    frame = df_data.drop(columns=['name'])

    # Impute numeric gaps with the column mean.
    for numeric_col in ('age', 'fare'):
        column_mean = frame[numeric_col].mean()
        frame[numeric_col] = frame[numeric_col].fillna(column_mean)
    # Missing ports of embarkation default to "S".
    frame['embarked'] = frame['embarked'].fillna("S")

    # Encode the categorical strings as integers.
    sex_codes = {'female': 0, 'male': 1}
    port_codes = {'C': 0, 'Q': 1, 'S': 2}
    frame['sex'] = frame['sex'].map(sex_codes).astype(int)
    frame['embarked'] = frame['embarked'].map(port_codes).astype(int)

    # Column 0 is the label; every later column is a feature.
    matrix = frame.values
    label = matrix[:, 0]
    features = matrix[:, 1:]

    # The scaler is fitted on exactly the rows passed in.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    norm_features = scaler.fit_transform(features)
    return norm_features, label
# Data preparation: shuffle the rows, then convert them to scaled features
# and labels.
shuffled_df_data = selected_df_data.sample(frac=1)
x_data, y_data = prepare_data(shuffled_df_data)
# Uncomment to show the first three rows of the scaled features:
# print(x_data[:3])

# The first 80% of the shuffled rows are used for training, the rest for
# testing.
train_size = int(len(x_data) * 0.8)
x_train, x_test = x_data[:train_size], x_data[train_size:]
y_train, y_test = y_data[:train_size], y_data[train_size:]
# Build the model: three fully connected layers ending in a single sigmoid
# unit. The first layer takes 7 input features (input_dim=7 is equivalent to
# input_shape=(7,)).
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(
        units=64,
        input_dim=7,
        use_bias=True,
        kernel_initializer='uniform',
        bias_initializer='zeros',
        activation='relu',
    ),
    tf.keras.layers.Dense(units=32, activation='sigmoid'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
# Uncomment to print the layer summary:
# print(model.summary())

# `metrics` lists the values reported during training and evaluation.
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.003),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Callbacks: TensorBoard logging plus periodic weight checkpoints. The
# checkpoint file name embeds the epoch number and the validation loss.
checkpoint_path = './checkpoint/Titanic.{epoch:02d}-{val_loss:.2f}.ckpt'
tensorboard_callback = tf.keras.callbacks.TensorBoard(histogram_freq=2)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
    period=5,
)
callbacks = [tensorboard_callback, checkpoint_callback]

# Train the model; 20% of the training rows are held out for validation.
# Variant without callbacks:
# train_history = model.fit(x=x_train,y=y_train,validation_split=0.2,epochs=20,batch_size=40,verbose=2)
train_history = model.fit(
    x=x_train,
    y=y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=40,
    callbacks=callbacks,
    verbose=1,
)
# train_history.history stores the metrics recorded during training:
# print(train_history.history)
# Visualise the training process.
def visu_train_history(train_hostory, train_metric, validation_metric):
    """Plot a training metric and its validation counterpart over epochs.

    Args:
        train_hostory: Object returned by ``model.fit``; its ``history``
            mapping is read. The misspelled parameter name is kept so the
            function's signature stays unchanged for existing callers.
        train_metric: Key in ``history`` for the training curve; also used
            as the y-axis label.
        validation_metric: Key in ``history`` for the validation curve.

    Raises:
        KeyError: If either metric key is absent from ``history``.
    """
    # Read from the argument, not the module-level `train_history`, so the
    # function plots whatever history object the caller passes in.
    history = train_hostory.history
    plt.plot(history[train_metric])
    plt.plot(history[validation_metric])
    plt.title("Train history")
    plt.xlabel('epoch')
    plt.ylabel(train_metric)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# The 'accuracy' metric is recorded under the key 'acc' by some Keras
# releases and under 'accuracy' by others, so use whichever key this run's
# history actually contains instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in train_history.history else 'accuracy'
visu_train_history(train_history, acc_key, 'val_' + acc_key)
visu_train_history(train_history, 'loss', 'val_loss')

# Evaluate the model on the held-out test rows.
print("==================")
print('evaluate:')
evaluate_result = model.evaluate(x=x_test, y=y_test)
# Pair every reported value with its metric name.
for metric_name, metric_value in zip(model.metrics_names, evaluate_result):
    print(metric_name, 'is', metric_value)
print('==================')
print("prediction:")
# Two hypothetical passengers, with values laid out in `selected_cols` order.
# The leading 'survived' value only fills the label column; the prediction
# below uses the features alone (y_label is not used).
Jack_info = [0, 'Jack', 3, 'male', 23, 1, 0, 5.000, 'S']
Rose_info = [1, 'Rose', 1, 'female', 20, 1, 0, 100.000, 'S']
new_passenger_pd = pd.DataFrame([Jack_info, Rose_info], columns=selected_cols)

# Add the new passengers after the existing rows. pd.concat is used because
# DataFrame.append was deprecated and later removed from pandas; like
# append's default, concat keeps the original row labels.
all_passage_pd = pd.concat([selected_df_data, new_passenger_pd])

# prepare_data re-fits its scaler on the combined frame, so the two new rows
# are scaled together with the rest of the data set.
x_features, y_label = prepare_data(all_passage_pd)
surv_probability = model.predict(x_features)

# Attach the predicted survival probability as the last column.
all_passage_pd.insert(len(all_passage_pd.columns), 'surv_probability', surv_probability)
# The two appended passengers are the last two rows.
print(all_passage_pd[-2:])
运行结果(Rose生存率0.974297,Jack生存率 0.127492):