预测泰坦尼克号生存问题

import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz


# Predict Titanic passenger survival.

titanic = pd.read_csv('./titanic.csv')

# 1. Select the feature columns and the target column.
# .copy() gives an independent frame, so the fill below modifies `x` itself
# instead of a temporary view/copy of `titanic` (chained assignment).
x = titanic[["pclass", "age", "sex"]].copy()
y = titanic["survived"]

# 2. Data preparation.
# 1) Missing values: replace missing ages with the mean age.
# Assigning the result back (rather than calling fillna(inplace=True) on the
# selected column) is reliable across pandas versions, including
# copy-on-write mode where the in-place form silently leaves NaNs behind.
x["age"] = x["age"].fillna(x["age"].mean())

# 2) Convert each row to a dict so DictVectorizer can one-hot encode the
# string-valued columns.
x = x.to_dict(orient="records")

# 3. Split the dataset.
# The target is one-hot encoded (one column per distinct `survived` value)
# and cast to float32 to match the label placeholder fed during training.
y = pd.get_dummies(y)
y = y.values.astype(np.float32)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# Fit the vectoriser on the training rows only, then reuse the same mapping
# for the test rows so both matrices share one column layout.
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
print(type(x_test), x_test.shape)

# Input placeholders. The feature width is read from the vectorised training
# matrix instead of being hard-coded, so the graph keeps working if the
# one-hot encoding yields a different number of columns.
tfx = tf.placeholder(tf.float32, [None, x_train.shape[1]], 'x')
tfy = tf.placeholder(tf.float32, [None, 2], 'y')


# Three fully connected ReLU layers of 128 units, then a 2-unit logits layer.
l1 = tf.layers.dense(tfx, 128, tf.nn.relu, name="l1")
l2 = tf.layers.dense(l1, 128, tf.nn.relu, name="l2")
l3 = tf.layers.dense(l2, 128, tf.nn.relu, name="l3")
out = tf.layers.dense(l3, 2, name="l4")
prediction = tf.nn.softmax(out, name="pred")

# The two classes are mutually exclusive one-hot labels and `prediction` is a
# softmax over the logits, so the loss is softmax cross-entropy. The previous
# sigmoid cross-entropy treated each output as an independent binary problem,
# which did not match the softmax used at inference time.
loss = tf.losses.softmax_cross_entropy(onehot_labels=tfy, logits=out)

# tf.metrics.accuracy returns (acc, update_op) and creates 2 local variables.
# Index [1] keeps update_op, so every run both updates those counters and
# returns the accuracy accumulated over all runs so far (a streaming value,
# not the accuracy of the single batch being fed).
accuracy = tf.metrics.accuracy(
    labels=tf.argmax(tfy, axis=1), predictions=tf.argmax(out, axis=1))[1]
opt = tf.train.AdamOptimizer(learning_rate=0.00001)
train_op = opt.minimize(loss)


with tf.Session() as sess:
    # Initialise global and local variables together in a single run.
    init_op = tf.group(
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
    )
    sess.run(init_op)

    accuracies, steps = [], []
    for t in range(400000):
        # Training step: 16 row indices drawn at random (np.random.randint
        # returns an index array; indices may repeat within a batch).
        batch_index = np.random.randint(len(x_train), size=16)
        train_feed = {tfx: x_train[batch_index], tfy: y_train[batch_index]}
        sess.run(train_op, train_feed)

        # Only evaluate and report on every 50th step.
        if t % 50 != 0:
            continue

        # Evaluation: 32 randomly drawn test rows.
        batch_index = np.random.randint(len(x_test), size=32)
        eval_feed = {tfx: x_test[batch_index], tfy: y_test[batch_index]}
        acc_, pred_, loss_ = sess.run([accuracy, prediction, loss], eval_feed)

        # Keep the history so accuracy can be inspected against step count.
        accuracies.append(acc_)
        steps.append(t)
        print("Step: %i" % t, "| Accurate: %.2f" % acc_, "| Loss: %.2f" % loss_, )


实验发现,准确率最高约为 78%,略高于决策树,与随机森林基本持平。
推测原因可能是样本数量不足。

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值