import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Predict Titanic survival
titanic = pd.read_csv('./titanic.csv')
# Select the feature columns and the target column
x = titanic[["pclass","age","sex"]]
y = titanic["survived"]
# 2. Data preprocessing
# 1) Fill missing values
x["age"] = x["age"].fillna(x["age"].mean())  # fill missing ages with the column mean (avoids pandas' chained-assignment warning)
# 2) Convert the DataFrame to a list of dicts (one per row)
x = x.to_dict(orient="records")  # orient="records" is the usual format for DictVectorizer
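# Each record is now a plain dict along the lines of {"pclass": ..., "age": ..., "sex": ...}
# (one dict per passenger; exact values depend on the CSV), which is the input DictVectorizer expects.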
# 3. One-hot encode the label and split the dataset
y = pd.get_dummies(y)               # survived -> two one-hot columns
y = y.values.astype(np.float32)
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=22)
# Vectorize the dict records: categorical fields (pclass, sex) become one-hot columns
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
print(type(x_test),x_test.shape)
# print(type(y_test), y_test.shape)
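# Optional sanity check: DictVectorizer keeps the generated column names in its
# feature_names_ attribute; with age numeric and pclass/sex one-hot encoded there
# should be 6 of them (the exact names depend on the CSV's category values).
print(transfer.feature_names_)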
tfx = tf.placeholder(tf.float32, [None, 6], 'x')  # 6 features after one-hot encoding: age + 3 pclass columns + 2 sex columns
tfy = tf.placeholder(tf.float32, [None, 2], 'y')
# Three fully connected hidden layers of 128 ReLU units, then a 2-unit logit layer
l1 = tf.layers.dense(tfx, 128, tf.nn.relu, name="l1")
l2 = tf.layers.dense(l1, 128, tf.nn.relu, name="l2")
l3 = tf.layers.dense(l2, 128, tf.nn.relu, name="l3")
out = tf.layers.dense(l3, 2, name="l4")
prediction = tf.nn.softmax(out, name="pred")
loss = tf.losses.softmax_cross_entropy(onehot_labels=tfy, logits=out)  # softmax cross-entropy matches the one-hot labels and the softmax output above
# tf.metrics.accuracy returns (acc, update_op) and creates 2 local variables;
# it is a streaming metric, i.e. a running average over every batch it has seen.
accuracy = tf.metrics.accuracy(
    labels=tf.argmax(tfy, axis=1), predictions=tf.argmax(out, axis=1))[1]
opt = tf.train.AdamOptimizer(learning_rate=0.00001)
train_op = opt.minimize(loss)
with tf.Session() as sess:
    sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
    accuracies, steps = [], []
    for t in range(400000):
        # training: sample a mini-batch of 16 random training rows
        batch_index = np.random.randint(len(x_train), size=16)
        sess.run(train_op, {tfx: x_train[batch_index], tfy: y_train[batch_index]})  # the feed_dict keyword can be omitted
        if t % 50 == 0:
            # testing: evaluate on a random batch of 32 test rows
            batch_index = np.random.randint(len(x_test), size=32)
            acc_, pred_, loss_ = sess.run([accuracy, prediction, loss],
                                          {tfx: x_test[batch_index], tfy: y_test[batch_index]})
            accuracies.append(acc_)
            steps.append(t)
            print("Step: %i" % t, "| Accuracy: %.2f" % acc_, "| Loss: %.2f" % loss_)
The accuracy tops out at about 78%, roughly on par with (slightly above) a decision tree and about the same as a random forest.
The likely reason is that the dataset is too small for the network to do any better.
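For comparison, here is a minimal decision-tree baseline on the same vectorized split (it reuses the x_train/x_test/y_train/y_test arrays produced above; max_depth=5 is just an illustrative choice, not a tuned value):

# Decision tree baseline on the identical train/test split
dt = DecisionTreeClassifier(max_depth=5, random_state=22)
dt.fit(x_train, np.argmax(y_train, axis=1))        # convert one-hot labels back to class indices
dt_acc = dt.score(x_test, np.argmax(y_test, axis=1))
print("Decision tree test accuracy: %.2f" % dt_acc)

Running both on the same split makes the comparison with the 78% figure above direct, rather than relying on numbers from a separate experiment.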