学习笔记:“Tensorflow基础泰坦尼克获救预测”课程的代码及注释

在网易云课堂学习了日月光华老师的课程Tensorflow基础泰坦尼克获救预测,照着老师的演示把代码码了一遍,并根据自己的理解加上了注释。虽然正确率只有76%,但在kaggle上成功提交了自己的第一份代码还是令我挺开心的。

课程地址:https://study.163.com/course/courseMain.htm?courseId=1004937015

Kaggle题目地址:https://www.kaggle.com/c/titanic (提交页面:https://www.kaggle.com/c/titanic/submit)

#
# Code follows the online course "Tensorflow基础泰坦尼克获救预测"
# (https://study.163.com/course/courseMain.htm?courseId=1004937015)
# Thanks to the instructor, 日月光华, for the course.
#

import os
# Must be set before TensorFlow is imported further down in this script;
# level '2' hides TensorFlow's native INFO and WARNING log lines.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.getcwd()                                                                                          # returns the current working directory; the value is discarded, so this only shows output in an interactive session
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# ---- Training-set preprocessing ----
data = pd.read_csv(r"E:/Computer/Python/SRTP/KaggleLearn/Titanic/all/train.csv")  # load the training CSV

# Keep the label plus the columns used as features. The explicit .copy()
# makes `data` an independent frame, so the column assignments below write
# to it directly instead of to a slice of the raw frame (which is what
# triggers pandas' SettingWithCopyWarning).
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
             'Fare', 'Cabin', 'Embarked']].copy()

# Impute missing ages with the mean of the known ages.
data['Age'] = data['Age'].fillna(data['Age'].mean())
# factorize replaces every distinct cabin string with an integer code
# (numbered in order of first appearance); missing cabins get -1.
data['Cabin'] = pd.factorize(data['Cabin'])[0]
# Any value that is still missing, in any column, becomes 0.
data = data.fillna(0)
# Encode sex as 1 for 'male' and 0 for everything else.
data['Sex'] = (data['Sex'] == 'male').astype(np.int32)

# One-hot encode Pclass: the class number is a category, not a quantity,
# so each class gets its own 0/1 indicator column (p1, p2, p3).
for level in (1, 2, 3):
    data['p%d' % level] = (data['Pclass'] == level).astype(np.int32)
del data['Pclass']

# Lists the distinct embarkation values; the result is discarded, so this
# only shows output in an interactive session.
data.Embarked.unique()

# One-hot encode Embarked the same way (e1 = 'S', e2 = 'C', e3 = 'Q').
for column, port in (('e1', 'S'), ('e2', 'C'), ('e3', 'Q')):
    data[column] = (data['Embarked'] == port).astype(np.int32)
del data['Embarked']

# Feature matrix (12 columns, in the order the model is fed) and the
# label as a single-column array.
data_train = data[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
                   'p1', 'p2', 'p3', 'e1', 'e2', 'e3']]
data_target = data['Survived'].values.reshape(-1, 1)

# Shapes of features and labels; only displayed in an interactive session.
np.shape(data_train), np.shape(data_target)

# ---- Model: logistic regression over the 12 features ----
# NOTE: this uses the TensorFlow 1.x graph API (tf.placeholder, tf.Session,
# tf.train.*), so it needs a TensorFlow build that still exposes those names.

import tensorflow as tf

# Inputs supplied at run time: a batch of 12-feature rows and the matching
# single-column 0/1 labels. The batch dimension is left open (None).
x = tf.placeholder(tf.float32, shape=[None, 12])
y = tf.placeholder(tf.float32, shape=[None, 1])

# Trainable parameters, both initialised from a normal distribution.
weight = tf.Variable(tf.random_normal([12, 1]))    # one weight per feature column
bias = tf.Variable(tf.random_normal([1]))          # single offset added to every row

# Raw score (logit) per row: x (batch x 12) times weight (12 x 1), plus bias.
# The matrix product requires x on the left.
output = tf.add(tf.matmul(x, weight), bias)

# Hard prediction: squash the logit with a sigmoid and threshold at 0.5,
# then cast the boolean to 0.0 / 1.0.
pred = tf.cast(tf.greater(tf.sigmoid(output), 0.5), tf.float32)

# Sigmoid cross-entropy computed from the raw logits (the op applies the
# sigmoid itself), averaged over the batch to give a scalar loss.
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=output)
loss = tf.reduce_mean(per_example_loss)

# Plain gradient descent that minimises `loss`. 0.0003 is the learning rate
# (step size) and can be tuned: too large a step can make training unstable,
# too small a step makes convergence slow.
train_step = tf.train.GradientDescentOptimizer(0.0003).minimize(loss)

# Fraction of rows whose thresholded prediction equals the label.
is_correct = tf.equal(pred, y)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# ---- Test-set preprocessing (same steps as applied to the training set) ----
data_test = pd.read_csv(r'E:/Computer/Python/SRTP/KaggleLearn/Titanic/all/test.csv')

# .copy() gives an independent frame so the assignments below do not write
# to a slice of the raw frame (avoids pandas' SettingWithCopyWarning).
data_test = data_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
                       'Fare', 'Cabin', 'Embarked']].copy()

# Impute missing ages with the mean age of this (test) file.
data_test['Age'] = data_test['Age'].fillna(data_test['Age'].mean())
# NOTE(review): factorize numbers cabins by order of first appearance in
# *this* file, independently of the training file, so a given integer code
# does not refer to the same cabin in both sets. Only "missing -> -1" is
# consistent. Consider a shared encoding (or a has-cabin flag) if this
# feature is meant to carry information.
data_test['Cabin'] = pd.factorize(data_test['Cabin'])[0]
# Any value that is still missing, in any column, becomes 0.
data_test = data_test.fillna(0)
# Encode sex as 1 for 'male' and 0 for everything else.
data_test['Sex'] = (data_test['Sex'] == 'male').astype(np.int32)

# One-hot encode Pclass into p1/p2/p3.
for level in (1, 2, 3):
    data_test['p%d' % level] = (data_test['Pclass'] == level).astype(np.int32)
del data_test['Pclass']

# One-hot encode Embarked into e1 ('S'), e2 ('C'), e3 ('Q').
for column, port in (('e1', 'S'), ('e2', 'C'), ('e3', 'Q')):
    data_test[column] = (data_test['Embarked'] == port).astype(np.int32)
del data_test['Embarked']
# Resulting column order: Sex, Age, SibSp, Parch, Fare, Cabin, p1, p2, p3,
# e1, e2, e3 -- the model is fed these 12 columns positionally.

# Reference labels used for the "test accuracy" curve, as one float column.
# reshape(-1, 1) adapts to however many rows the file has instead of
# assuming exactly 418.
# NOTE(review): gender_submission.csv appears to be Kaggle's sample
# submission rather than ground truth; if so, accuracy against it measures
# agreement with that baseline, not real test accuracy -- confirm.
test_label = pd.read_csv(r'E:/Computer/Python/SRTP/KaggleLearn/Titanic/all/gender_submission.csv')
test_label = test_label['Survived'].values.astype(np.float32).reshape(-1, 1)

# ---- Training ----
sess = tf.Session()                               # session that executes the graph
sess.run(tf.global_variables_initializer())       # give every tf.Variable its initial value
loss_train = []                                   # loss recorded every 1000 epochs
train_acc = []                                    # training accuracy at the same points
test_acc = []                                     # accuracy against test_label at the same points

batch_size = 100

# Convert the inputs to float32 arrays once, instead of letting every
# sess.run call convert a DataFrame slice again.
train_features = np.asarray(data_train, dtype=np.float32)
train_labels = np.asarray(data_target, dtype=np.float32).reshape(-1, 1)
test_features = np.asarray(data_test, dtype=np.float32)

for i in range(25000):                            # 25000 passes over the training set
    # Visit the rows in a fresh random order on every pass so the
    # mini-batches differ from epoch to epoch.
    index = np.random.permutation(len(train_labels))
    shuffled_xs = train_features[index]
    shuffled_ys = train_labels[index]

    # Step through the shuffled rows in chunks of batch_size. Iterating over
    # start offsets never yields an empty batch; the previous
    # `len // 100 + 1` count did whenever the row count was a multiple of
    # 100, and an empty batch gives a NaN loss and NaN gradients.
    for start in range(0, len(shuffled_ys), batch_size):
        batch_xs = shuffled_xs[start:start + batch_size]
        batch_ys = shuffled_ys[start:start + batch_size]
        sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys})

    if i % 1000 == 0:
        # Report on the whole training set rather than only the last
        # mini-batch, which is small and was just trained on.
        loss_temp, train_acc_temp = sess.run(
            [loss, accuracy], feed_dict={x: train_features, y: train_labels})
        loss_train.append(loss_temp)
        train_acc.append(train_acc_temp)
        test_acc_temp = sess.run(accuracy, feed_dict={x: test_features, y: test_label})
        test_acc.append(test_acc_temp)
        print(loss_temp, train_acc_temp, test_acc_temp)

# ---- Visualise the training run ----
import matplotlib.pyplot as plt

# First chart: the recorded loss values as a solid black line.
loss_axes = plt.gca()
loss_axes.plot(loss_train, 'k-')
plt.show()

# Second chart: training accuracy (solid blue) against test accuracy
# (dashed red), with a title and a legend.
acc_axes = plt.gca()
for series, line_format, series_name in ((train_acc, 'b-', 'train_acc'),
                                         (test_acc, 'r--', 'test_acc')):
    acc_axes.plot(series, line_format, label=series_name)
acc_axes.set_title('train and test accuracy')
acc_axes.legend()
plt.show()

# ---- Write the final predictions as a submission CSV ----
# Run the thresholded prediction op on the preprocessed test features;
# the result is an (n_rows, 1) array of 0.0 / 1.0 values.
res = sess.run(pred, feed_dict={x: data_test})
# This is the last use of the session, so release its resources.
sess.close()

# PassengerId was dropped during preprocessing, so read it back from the
# raw test file; rows are in the same order as the predictions.
test = pd.read_csv(r'E:/Computer/Python/SRTP/KaggleLearn/Titanic/all/test.csv')
res_in_df = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': res.ravel().astype(int),      # flatten the column and store whole numbers
    },
    columns=['PassengerId', 'Survived'],
)
# index=False keeps the DataFrame's row index out of the file.
res_in_df.to_csv(r'E:/Computer/Python/SRTP/KaggleLearn/Titanic/all/Result.csv', index=False)

 

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值