import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
# Directory that holds the Kaggle Titanic CSV files.
path = r"E:\learn\pc_code\kaggle\Titanic\all"
# Training split; its missing 'Age' values are imputed further below.
data_train = pd.read_csv(f"{path}/train.csv")
def random_forest_pre(test_unknow_age, df):
    """Impute missing 'Age' values with a TF random-forest classifier.

    Trains on the rows of *df* whose Age is known (features: Fare, Parch,
    SibSp, Pclass), treating the integer age as a class label in
    [0, num_classes), then predicts ages for the rows where Age is missing —
    both in *df* itself and in the pre-extracted test matrix.

    Args:
        test_unknow_age: ndarray of test-set rows with missing Age, columns
            [Age, Fare, Parch, SibSp, Pclass]; column 0 (NaN Age) is skipped.
        df: training DataFrame; its missing 'Age' cells are filled in place.

    Returns:
        (df, pre_test_y): the DataFrame with Age imputed, and the predicted
        ages (int64 ndarray) for *test_unknow_age*.
    """
    tf.reset_default_graph()

    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].values    # rows with known Age -> training set
    unknown_age = age_df[age_df.Age.isnull()].values   # rows whose Age must be imputed
    train_x = known_age[:, 1:]
    train_y = known_age[:, 0]

    # Hyper-parameters.  Age is modelled as a 100-way classification.
    num_steps = 1000
    batch_size = 100
    num_classes = 100
    num_features = 4
    num_trees = 10
    max_nodes = 100000

    X = tf.placeholder(tf.float32, shape=[None, num_features])
    Y = tf.placeholder(tf.float32, shape=[None])

    hparams = tensor_forest.ForestHParams(num_classes=num_classes,
                                          num_features=num_features,
                                          num_trees=num_trees,
                                          max_nodes=max_nodes).fill()
    forest_graph = tensor_forest.RandomForestGraphs(params=hparams)
    train_op = forest_graph.training_graph(X, Y)
    loss_op = forest_graph.training_loss(X, Y)
    infer_op, _, _ = forest_graph.inference_graph(X)

    # Build prediction/accuracy ops ONCE, up front.  The original created a
    # fresh tf.argmax node on every sess.run call, growing the graph.
    pred_op = tf.argmax(infer_op, 1)
    correct_pre = tf.equal(pred_op, tf.cast(Y, tf.int64))
    accuracy_op = tf.reduce_mean(tf.cast(correct_pre, tf.float32))

    init_vars = tf.group(tf.global_variables_initializer(),
                         resources.initialize_resources(resources.shared_resources()))

    # Context manager guarantees the session is closed (the original leaked it).
    with tf.Session() as sess:
        sess.run(init_vars)
        for i in range(num_steps):
            # Random mini-batch of known-age rows.
            permutation = np.random.permutation(train_y.shape[0])
            batch_x = train_x[permutation, :][0:batch_size]
            batch_y = train_y[permutation][0:batch_size]
            _, l = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
            if i % 50 == 0 or i == 1:
                acc = sess.run(accuracy_op, feed_dict={X: train_x, Y: train_y})
                print('Step: %i Loss: %f Accuracy: %f' % (i, l, acc))
        predictedAges = sess.run(pred_op, feed_dict={X: unknown_age[:, 1:]})
        pre_test_y = sess.run(pred_op, feed_dict={X: test_unknow_age[:, 1:]})

    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, pre_test_y
# ---- Test set: impute its missing ages with a forest trained on data_train ----
data_test = pd.read_csv(path + '/' + 'test.csv')
test_age_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
test_unknow_age = test_age_df[test_age_df.Age.isnull()].values
pce_df, pre_test_y = random_forest_pre(test_unknow_age, data_train)
data_test.loc[(data_test.Age.isnull()), 'Age'] = pre_test_y

# ---- Feature engineering on the training frame ----
# Collapse Cabin to a has-cabin flag, then one-hot encode the categoricals.
pce_df.loc[(pce_df.Cabin.notnull()), 'Cabin'] = 'Yes'
pce_df.loc[(pce_df.Cabin.isnull()), 'Cabin'] = 'No'
dummies_Cabin = pd.get_dummies(pce_df['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
pce_df = pd.concat([pce_df, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
pce_df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

# Standardize Age and Fare.
# BUG FIX: pandas Series has no .reshape (removed from pandas) — go through
# .values first.  Also, the original passed the fitted scaler as the second
# positional (ignored `y`) argument of fit_transform; use transform() on the
# already-fitted scaler instead.
scaler = preprocessing.StandardScaler()
age_scale_param = scaler.fit(pce_df['Age'].values.reshape(-1, 1))
pce_df['Age_scaled'] = age_scale_param.transform(pce_df['Age'].values.reshape(-1, 1))
fare_scale_param = scaler.fit(pce_df['Fare'].values.reshape(-1, 1))
pce_df['Fare_scaled'] = fare_scale_param.transform(pce_df['Fare'].values.reshape(-1, 1))

# Keep only the model features; 'Survived' lands in column 0 of train_np,
# so train_X slices it off.
train_df = pce_df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
Survived_y_onehot = pd.get_dummies(train_df['Survived'], prefix='Survived')
train_Y = Survived_y_onehot.values
train_np = train_df.values
train_X = train_np[:, 1:]
def logi_ressgion(pre_x, test_x, test_y, num_f):
    """Train a 2-class softmax (logistic) regression and predict labels.

    Args:
        pre_x: feature matrix to predict on after training.
        test_x: TRAINING feature matrix (parameter names kept for
            caller compatibility, despite being misleading).
        test_y: one-hot TRAINING labels, shape (n, 2).
        num_f: number of input features.

    Returns:
        ndarray of predicted class indices (0/1) for *pre_x*.
    """
    tf.reset_default_graph()  # consistent with random_forest_pre

    learning_rate = 0.002
    training_epochs = 1000
    batch_size = 80

    x = tf.placeholder(tf.float32, [None, num_f])
    y = tf.placeholder(tf.float32, [None, 2])
    W = tf.Variable(tf.random_normal([num_f, 2], mean=0.0, stddev=1.0, dtype=tf.float32))
    b = tf.Variable(tf.zeros([2]))
    predict = tf.nn.softmax(tf.matmul(x, W) + b)

    # BUG FIX: clip before the log so a saturated softmax (predict == 0)
    # cannot yield NaN losses, which -sum(y * log(predict)) allowed.
    loss = tf.reduce_mean(
        -tf.reduce_sum(y * tf.log(tf.clip_by_value(predict, 1e-10, 1.0)),
                       reduction_indices=1))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    correct_pre = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pre, tf.float32))
    pred_op = tf.argmax(predict, 1)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            # Shuffle once per epoch; slicing the permuted arrays was
            # loop-invariant in the original, so hoist it out of the batch loop.
            permutation = np.random.permutation(test_y.shape[0])
            shuffled_x = test_x[permutation, :]
            shuffled_y = test_y[permutation, :]
            avg_loss = 0.
            # BUG FIX: the original iterated len//batch_size - 1 batches,
            # silently dropping one full batch every epoch.
            for i in range(len(test_y) // batch_size):
                batch_x = shuffled_x[i * batch_size:(i + 1) * batch_size]
                batch_y = shuffled_y[i * batch_size:(i + 1) * batch_size]
                _, c = sess.run([optimizer, loss], feed_dict={x: batch_x, y: batch_y})
                avg_loss += c
            print('Epoch:', epoch + 1, 'Loss:', avg_loss)
        print('Accuracy:', accuracy.eval({x: test_x, y: test_y}))
        return sess.run(pred_op, feed_dict={x: pre_x})
# ---- Feature engineering on the test frame (mirrors the training frame) ----
# NOTE(review): get_dummies on the test frame may produce a different column
# set than the training frame if a category is absent from one of them;
# ideally the two frames would be aligned/reindexed — verify on real data.
data_test.loc[(data_test.Cabin.notnull()), 'Cabin'] = 'Yes'
data_test.loc[(data_test.Cabin.isnull()), 'Cabin'] = 'No'
dummies_Cabin_te = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked_te = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex_te = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass_te = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
data_test = pd.concat([data_test, dummies_Cabin_te, dummies_Embarked_te,
                       dummies_Sex_te, dummies_Pclass_te], axis=1)
data_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
               axis=1, inplace=True)

# BUG FIX: Series.reshape was removed from pandas — use .values.reshape.
# Also call transform() on the fitted scaler instead of passing it as
# fit_transform's ignored `y` argument.
# NOTE(review): the test set is scaled with its own statistics rather than
# the training-set scaler's; reusing the training fit would be sounder.
scaler = preprocessing.StandardScaler()
age_scale_param = scaler.fit(data_test['Age'].values.reshape(-1, 1))
data_test['Age_scaled'] = age_scale_param.transform(data_test['Age'].values.reshape(-1, 1))
fare_scale_param = scaler.fit(data_test['Fare'].values.reshape(-1, 1))
data_test['Fare_scaled'] = fare_scale_param.transform(data_test['Fare'].values.reshape(-1, 1))

test_df = data_test.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
test_np = test_df.values

# BUG FIX: the original passed the undefined name `test_X` (NameError at
# runtime); the matrix to predict on is `test_np`, built just above.
test_y = logi_ressgion(test_np, train_X, train_Y, len(train_X[0]))

result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': test_y.astype(np.int32)})
result.to_csv(path + "/logistic_regression_predictions.csv", index=False)
# Kaggle / TensorFlow / Titanic / logistic regression
# (blog footer residue; originally published 2022-05-10 18:22:57)