import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Predict Titanic survival
titanic = pd.read_csv('./titanic.csv')
# Select the feature columns and the target column
x = titanic[["pclass","age","sex"]]
y = titanic["survived"]
# 2. Data preprocessing
# 1) Fill missing values
x["age"] = x["age"].fillna(x["age"].mean())  # fill missing ages with the column mean (avoids pandas' chained-assignment warning)
# 2) Convert the DataFrame to a list of dicts (one per row)
x = x.to_dict(orient="records")  # orient="records" is the usual format for DictVectorizer
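# Each record is now a plain dict along the lines of {"pclass": ..., "age": ..., "sex": ...}
# (one dict per passenger; exact values depend on the CSV), which is the input DictVectorizer expects.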
# 3. One-hot encode the label and split the dataset
y = pd.get_dummies(y)               # survived -> two one-hot columns
y = y.values.astype(np.float32)
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=22)
# Vectorize the dict records: categorical fields (pclass, sex) become one-hot columns
transfer = DictVectorizer(sparse=False)
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
print(type(x_test),x_test.shape)
# print(type(y_test), y_test.shape)
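# Optional sanity check: DictVectorizer keeps the generated column names in its
# feature_names_ attribute; with age numeric and pclass/sex one-hot encoded there
# should be 6 of them (the exact names depend on the CSV's category values).
print(transfer.feature_names_)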
tfx = tf.placeholder(tf.float32, [None, 6], 'x')  # 6 features after one-hot encoding: age + 3 pclass columns + 2 sex columns
tfy = tf.placeholder(tf.float32, [None, 2], 'y')
# Three fully connected hidden layers of 128 ReLU units, then a 2-unit logit layer
l1 = tf.layers.dense(tfx, 128, tf.nn.relu, name="l1")
l2 = tf.layers.dense(l1, 128, tf.nn.relu, name="l2")
l3 = tf.layers.dense(l2, 128, tf.nn.relu, name="l3")
out = tf.layers.dense(l3, 2, name="l4")
prediction = tf.nn.softmax(out, name="pred")
loss = tf.losses.softmax_cross_entropy(onehot_labels=tfy, logits=out)  # softmax cross-entropy matches the one-hot labels and the softmax output above
# tf.metrics.accuracy returns (acc, update_op) and creates 2 local variables;
# it is a streaming metric, i.e. a running average over every batch it has seen.
accuracy = tf.metrics.accuracy(
    labels=tf.argmax(tfy, axis=1), predictions=tf.argmax(out, axis=1))[1]
opt = tf.train.AdamOptimizer(learning_rate=0.00001)
train_op = opt.minimize(loss)
with tf.Session() as sess:
    sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))
    accuracies, steps = [], []
    for t in range(400000):
        # training: sample a mini-batch of 16 random training rows
        batch_index = np.random.randint(len(x_train), size=16)
        sess.run(train_op, {tfx: x_train[batch_index], tfy: y_train[batch_index]})  # the feed_dict keyword can be omitted
        if t % 50 == 0:
            # testing: evaluate on a random batch of 32 test rows
            batch_index = np.random.randint(len(x_test), size=32)
            acc_, pred_, loss_ = sess.run([accuracy, prediction, loss],
                                          {tfx: x_test[batch_index], tfy: y_test[batch_index]})
            accuracies.append(acc_)
            steps.append(t)
            print("Step: %i" % t, "| Accuracy: %.2f" % acc_, "| Loss: %.2f" % loss_)
The accuracy tops out at about 78%, roughly on par with (slightly above) a decision tree and about the same as a random forest.
The likely reason is that the dataset is too small for the network to do any better.
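For comparison, here is a minimal decision-tree baseline on the same vectorized split (it reuses the x_train/x_test/y_train/y_test arrays produced above; max_depth=5 is just an illustrative choice, not a tuned value):

# Decision tree baseline on the identical train/test split
dt = DecisionTreeClassifier(max_depth=5, random_state=22)
dt.fit(x_train, np.argmax(y_train, axis=1))        # convert one-hot labels back to class indices
dt_acc = dt.score(x_test, np.argmax(y_test, axis=1))
print("Decision tree test accuracy: %.2f" % dt_acc)

Running both on the same split makes the comparison with the 78% figure above direct, rather than relying on numbers from a separate experiment.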