Kaggle Titanic with TensorFlow: logistic regression

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
# tensor_forest lives in tf.contrib, so this script needs TensorFlow 1.x
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources

path = r"E:\learn\pc_code\kaggle\Titanic\all"
data_train = pd.read_csv(path + '/' + 'train.csv')
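
# Optional: a quick look at what needs imputing. In the standard Kaggle split,
# Age and Cabin have many NaNs (and Embarked a couple); the random-forest
# imputer below handles Age.
print(data_train.isnull().sum())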

def random_forest_pre(test_unknow_age, df):
    # Impute missing Age values with a TensorFlow random forest. Ages are
    # treated as integer class labels (0-99), so this is a 100-way classification.
    tf.reset_default_graph()
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].values    # rows with known Age form the training set
    unknown_age = age_df[age_df.Age.isnull()].values   # rows with missing Age get predicted
    train_x = known_age[:, 1:]
    train_y = known_age[:, 0]

    # Hyperparameters
    num_steps = 1000
    batch_size = 100
    num_classes = 100      # ages bucketed into integer classes 0-99
    num_features = 4       # Fare, Parch, SibSp, Pclass
    num_trees = 10
    max_nodes = 100000
    X = tf.placeholder(tf.float32, shape=[None, num_features])
    Y = tf.placeholder(tf.float32, shape=[None])
    hparams = tensor_forest.ForestHParams(num_classes=num_classes,
                                          num_features=num_features,
                                          num_trees=num_trees,
                                          max_nodes=max_nodes).fill()

    forest_graph = tensor_forest.RandomForestGraphs(params=hparams)
    train_op = forest_graph.training_graph(X, Y)
    loss_op = forest_graph.training_loss(X, Y)

    infer_op, _, _ = forest_graph.inference_graph(X)
    correct_pre = tf.equal(tf.argmax(infer_op, 1), tf.cast(Y, tf.int64))
    accuracy_op = tf.reduce_mean(tf.cast(correct_pre, tf.float32))
    # The forest needs both variable and shared-resource initialization
    init_vars = tf.group(tf.global_variables_initializer(),
                         resources.initialize_resources(resources.shared_resources()))
    sess = tf.Session()
    sess.run(init_vars)
    for i in range(num_steps):
        # Sample a fresh random mini-batch of known-age rows each step
        permutation = np.random.permutation(train_y.shape[0])
        batch_x = train_x[permutation, :][0:batch_size]
        batch_y = train_y[permutation][0:batch_size]
        _, l = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
        if i % 50 == 0 or i == 1:
            acc = sess.run(accuracy_op, feed_dict={X: train_x, Y: train_y})
            print('Step: %i Loss: %f Accuracy: %f' % (i, l, acc))

    # argmax over the 100 age classes gives the predicted (integer) age
    predictedAges = sess.run(tf.argmax(infer_op, 1), feed_dict={X: unknown_age[:, 1:]})
    pre_test_y = sess.run(tf.argmax(infer_op, 1), feed_dict={X: test_unknow_age[:, 1:]})
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    sess.close()
    return df, pre_test_y
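
# Aside: tf.contrib.tensor_forest was removed in TensorFlow 2.x. A minimal
# sketch of the same imputation using scikit-learn's RandomForestRegressor
# instead (an alternative, not called anywhere in this script):
def sklearn_age_impute(df):
    from sklearn.ensemble import RandomForestRegressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known = age_df[age_df.Age.notnull()].values
    rf = RandomForestRegressor(n_estimators=100, random_state=0)
    rf.fit(known[:, 1:], known[:, 0])   # predict Age from the other four columns
    df.loc[df.Age.isnull(), 'Age'] = rf.predict(age_df[age_df.Age.isnull()].values[:, 1:])
    return df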

data_test = pd.read_csv(path + '/' + 'test.csv')
test_age_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
test_unknow_age = test_age_df[test_age_df.Age.isnull()].values

pce_df, pre_test_y = random_forest_pre(test_unknow_age, data_train)
data_test.loc[(data_test.Age.isnull()), 'Age'] = pre_test_y   # fill the test set's missing ages with the same forest
pce_df.loc[(pce_df.Cabin.notnull()), 'Cabin'] = 'Yes'   # collapse Cabin to a has-cabin flag
pce_df.loc[(pce_df.Cabin.isnull()), 'Cabin'] = 'No'

# One-hot encode the categorical columns, then drop the raw versions
# (pce_df is data_train mutated in place, so the names are unified here)
dummies_Cabin = pd.get_dummies(pce_df['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(pce_df['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(pce_df['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(pce_df['Pclass'], prefix='Pclass')

pce_df = pd.concat([pce_df, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
pce_df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)


# Standard-scale the continuous features. (Series.reshape was removed from
# pandas, and fit_transform's second argument is y, so the old
# fit_transform(..., age_scale_param) call is dropped here.)
scaler = preprocessing.StandardScaler()
pce_df['Age_scaled'] = scaler.fit_transform(pce_df['Age'].values.reshape(-1, 1))
pce_df['Fare_scaled'] = scaler.fit_transform(pce_df['Fare'].values.reshape(-1, 1))
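
# Caveat: further down, the test set gets its own freshly fitted scaler, so
# train and test land on slightly different scales. A minimal sketch of the
# sounder fit-on-train, transform-both pattern (not wired into this pipeline):
def scale_like_train(train_col, test_col):
    s = preprocessing.StandardScaler().fit(train_col.values.reshape(-1, 1))
    return (s.transform(train_col.values.reshape(-1, 1)),
            s.transform(test_col.values.reshape(-1, 1)))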

train_df = pce_df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
Survived_y_onehot = pd.get_dummies(train_df['Survived'], prefix='Survived')   # one-hot labels, shape (n, 2)
train_Y = Survived_y_onehot.values
train_np = train_df.values
train_X = train_np[:, 1:]   # Survived is the first filtered column; the rest are features

def logistic_regression(pre_x, train_x, train_y, num_f):
    # Softmax (logistic) regression: train on (train_x, train_y), then return
    # class predictions for pre_x. (The unused display_step is dropped.)
    learning_rate = 0.002
    training_epochs = 1000
    batch_size = 80
    x = tf.placeholder(tf.float32, [None, num_f])
    y = tf.placeholder(tf.float32, [None, 2])
    W = tf.Variable(tf.random_normal([num_f, 2], mean=0.0, stddev=1.0, dtype=tf.float32))
    b = tf.Variable(tf.zeros([2]))
    predict = tf.nn.softmax(tf.matmul(x, W) + b)
    # Cross-entropy loss (see the numerically safer variant sketched after this function)
    loss = tf.reduce_mean(-tf.reduce_sum(y * tf.log(predict), reduction_indices=1))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            # Shuffle once per epoch, then walk through the mini-batches
            permutation = np.random.permutation(train_y.shape[0])
            avg_loss = 0.
            for i in range(len(train_y) // batch_size - 1):
                batch_x = train_x[permutation, :][i * batch_size:(i + 1) * batch_size]
                batch_y = train_y[permutation, :][i * batch_size:(i + 1) * batch_size]
                _, c = sess.run([optimizer, loss], feed_dict={x: batch_x, y: batch_y})
                avg_loss += c
            print('Epoch:', epoch + 1, 'Loss:', avg_loss)
        correct_pre = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pre, tf.float32))
        print('Training accuracy:', accuracy.eval({x: train_x, y: train_y}))
        return sess.run(tf.argmax(predict, 1), feed_dict={x: pre_x})
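
# Note: tf.log(predict) becomes -inf (and the gradients NaN) if any softmax
# output underflows to exactly 0. A sketch of the numerically safer loss,
# computed from the raw logits (tf.matmul(x, W) + b) instead of the softmax
# output; not used above:
def stable_cross_entropy(logits, labels):
    return tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits))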


# Apply the same preprocessing to the test set
data_test.loc[(data_test.Cabin.notnull()), 'Cabin'] = 'Yes'
data_test.loc[(data_test.Cabin.isnull()), 'Cabin'] = 'No'
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].median())   # test.csv has one missing Fare

dummies_Cabin_te = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked_te = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex_te = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass_te = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
data_test = pd.concat([data_test, dummies_Cabin_te, dummies_Embarked_te, dummies_Sex_te, dummies_Pclass_te], axis=1)
data_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
scaler = preprocessing.StandardScaler()   # refit on test; see the scale_like_train note above
data_test['Age_scaled'] = scaler.fit_transform(data_test['Age'].values.reshape(-1, 1))
data_test['Fare_scaled'] = scaler.fit_transform(data_test['Fare'].values.reshape(-1, 1))
test_df = data_test.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
test_np = test_df.values   # the test frame has no Survived column, so these are all features
test_y = logistic_regression(test_np, train_X, train_Y, train_X.shape[1])
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': test_y.astype(np.int32)})
result.to_csv(path + "/logistic_regression_predictions.csv", index=False)
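
# Optional sanity check before submitting: Kaggle's Titanic test set has 418
# rows, and the submission file needs exactly PassengerId and Survived.
print(result.shape)    # expect (418, 2)
print(result.head())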