逻辑回归实战 — Kaggle_Titanic

数据来源:https://www.kaggle.com/c/titanic

Training

import pandas
import numpy
import time
import matplotlib.pyplot as plt
%matplotlib inline

def _min_max_scale(column):
    """Rescale a numeric pandas Series linearly onto [0, 1].

    A column with no spread (all values equal, or all missing) has no range
    to divide by; its entries become 0.0 (missing stays missing) instead of
    triggering a division by zero.
    """
    low = column.min()
    span = column.max() - low
    if numpy.isnan(span) or span == 0:
        return (column - low).astype(float)
    return (column - low) / span


def prepareData(filename):
    """Load a Titanic CSV file and turn it into model-ready columns.

    Steps applied to the loaded frame:
      * 'Sex' is encoded as female -> 0, male -> 1.
      * 'Embarked' is encoded as S -> 1, C -> 2, Q -> 3; missing or
        unrecognised ports are replaced by the most frequent observed port.
      * 'Age' and 'Fare' have missing values replaced by the mean of the
        observed values, then are min-max scaled to [0, 1].
      * A constant column 'ones' is inserted first (the intercept term).

    Non-feature columns (PassengerId, Name, Ticket, Cabin) are left in place;
    callers drop them when building the design matrix.

    Args:
        filename: path of the CSV file to read.

    Returns:
        The preprocessed pandas DataFrame.
    """
    data = pandas.read_csv(filename)

    data['Sex'] = data['Sex'].map({'female':0, 'male':1})

    embarked = data['Embarked'].map({'S':1, 'C':2, 'Q':3})
    # mode() ignores missing entries, so the fill value is always a real port.
    ports = embarked.mode()
    if not ports.empty:
        embarked = embarked.fillna(ports.iloc[0])
    data['Embarked'] = embarked

    for name in ('Age', 'Fare'):
        column = data[name]
        # Series.mean() skips NaN, so missing entries do not drag the
        # imputed value towards zero.
        data[name] = _min_max_scale(column.fillna(column.mean()))

    data.insert(0, 'ones', 1)

    return data
def run(X, Y, theta, alpha, steps, model_path='titanic/model.txt'):
    """Fit logistic-regression weights with batch gradient descent.

    Every iteration's weights and cost are appended as one CSV row to
    `model_path`, below a header of the form 'theta_0,...,theta_k,cost'.

    Args:
        X: design matrix, one sample per row.
        Y: labels, one per row of X.
        theta: initial weights; the caller's array is not modified.
        alpha: learning rate.
        steps: number of gradient-descent updates to perform.
        model_path: file that receives the per-step log (overwritten).

    Returns:
        (costs, theta, time_spent): the cost before the first update followed
        by the cost after each update (steps + 1 values), the final weights,
        and the elapsed wall-clock time in seconds.
    """
    init_time = time.time()
    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array, and would fail on an integer-typed one.
    theta = numpy.array(theta, dtype=float)
    costs = [getCost(X, Y, theta)]

    with open(model_path, 'w') as f:
        header = ['theta_' + str(i) for i in range(len(theta))]
        f.write(','.join(header + ['cost']) + '\n')

        for _ in range(steps):
            theta -= alpha*getGradient(X, Y, theta)
            cost = getCost(X, Y, theta)
            costs.append(cost)
            f.write(','.join([str(item) for item in theta] + [str(cost)]) + '\n')

    time_spent = time.time()-init_time
    return costs, theta, time_spent
def getGradient(X, Y, theta):
    """Return the gradient of the mean logistic loss with respect to theta.

    Computes -(1/n) * X^T (Y - sigmoid(X . theta)) in one vectorized pass,
    so each sample's sigmoid is evaluated once rather than once per feature.

    Args:
        X: design matrix, one sample per row.
        Y: labels, one per row of X.
        theta: current weights, one per column of X.

    Returns:
        A 1-D numpy array holding one partial derivative per weight.
    """
    X = numpy.asarray(X, dtype=float)
    Y = numpy.asarray(Y, dtype=float)
    predicted = 1/(1+numpy.exp(-numpy.dot(X, theta)))
    return -numpy.dot(X.T, Y - predicted)/len(Y)
def getCost(X, Y, theta):
    """Return the mean negative log-likelihood of the logistic model.

    For scores z = X . theta the per-sample log-likelihood is
    y*z - log(1 + exp(z)). The log term is evaluated with numpy.logaddexp,
    which stays finite for large z where exp(z) itself would overflow.

    Args:
        X: design matrix, one sample per row.
        Y: labels, one per row of X.
        theta: weights, one per column of X.

    Returns:
        The average cost over all samples.
    """
    scores = numpy.dot(numpy.asarray(X, dtype=float), theta)
    labels = numpy.asarray(Y, dtype=float)
    log_likelihood = numpy.sum(labels*scores - numpy.logaddexp(0, scores))
    return -log_likelihood/len(Y)
def getAccuracy(train_X, train_Y, theta):
    """Return the fraction of samples whose predicted class equals its label.

    A sample is predicted as class 1 when sigmoid(theta . x) is at least 0.5
    and as class 0 otherwise.
    """
    predictions = [
        1 if 1/(1+numpy.exp(-numpy.dot(theta, row))) >= 0.5 else 0
        for row in train_X
    ]
    # Start the sum at 0.0 so the hit count is always a float.
    hits = sum(
        (1 for guess, label in zip(predictions, train_Y) if guess == label),
        0.0,
    )
    return hits/len(predictions)
# Load and preprocess the training set, then preview its first five rows.
train_data = prepareData('titanic/train.csv')
train_data.head(5)
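|   | ones | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 0.271174 | 1 | 0 | A/5 21171 | 0.014151 | NaN | 1 |
| 1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 0.472229 | 1 | 0 | PC 17599 | 0.139136 | C85 | 2 |
| 2 | 1 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 0.321438 | 0 | 0 | STON/O2. 3101282 | 0.015469 | NaN | 1 |
| 3 | 1 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 0.434531 | 1 | 0 | 113803 | 0.103644 | C123 | 1 |
| 4 | 1 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 0.434531 | 0 | 0 | 373450 | 0.015713 | NaN | 1 |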
# Keep only the columns the model uses; identifiers and free-text fields are
# dropped once and the result is split into labels and inputs.
train_features = train_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
train_Y = train_features['Survived'].values
train_X = train_features.drop(['Survived'], axis=1).values

# One random starting weight per input column (shape[1] also works when the
# matrix has a single row, unlike indexing a specific row).
theta = numpy.random.random(train_X.shape[1])
alpha = 0.001
steps = 10000

costs, theta, time_spent = run(train_X, train_Y, theta, alpha, steps)
accuracy = getAccuracy(train_X, train_Y, theta)

fig = plt.figure(figsize=(18,5))

# Left panel: cost over the whole run.
ax1 = fig.add_subplot(121)
ax1.plot(range(steps+1), costs)
ax1.set_title('Logistic Regression for Titanic Problem -- Time spent: %f\nAccuracy: %f' % (time_spent, accuracy))
ax1.set_xlabel('steps')
ax1.set_ylabel('cost')

# Right panel: zoom on the last 1000 recorded costs, including the final one
# (an end index of -1 would silently drop it).
ax2 = fig.add_subplot(122)
ax2.plot(range(steps+1)[-1000:], costs[-1000:])
ax2.set_xlabel('steps')
ax2.set_ylabel('cost')

costs vs. steps

Testing

# Load and preprocess the test set with the same pipeline as the training
# set, then preview its first five rows.
test_data = prepareData('titanic/test.csv')
test_data.head(5)
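|   | ones | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 892 | 3 | Kelly, Mr. James | 1 | 0.452723 | 0 | 0 | 330911 | 0.015282 | NaN | 3 |
| 1 | 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | 0 | 0.617566 | 1 | 0 | 363272 | 0.013663 | NaN | 1 |
| 2 | 1 | 894 | 2 | Myles, Mr. Thomas Francis | 1 | 0.815377 | 0 | 0 | 240276 | 0.018909 | NaN | 3 |
| 3 | 1 | 895 | 3 | Wirz, Mr. Albert | 1 | 0.353818 | 0 | 0 | 315154 | 0.016908 | NaN | 1 |
| 4 | 1 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | 0 | 0.287881 | 1 | 1 | 3101298 | 0.023984 | NaN | 1 |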
# Build the test design matrix from the same columns used for training.
test_X = test_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1).values

# Predict class 1 when the modelled probability is at least 0.5, else class 0.
Y_hat = [
    1 if 1/(1+numpy.exp(-numpy.dot(theta, x))) >= 0.5 else 0
    for x in test_X
]

results = pandas.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])
# index=False keeps the file to exactly the PassengerId and Survived columns;
# by default to_csv would also write the row index as an unnamed first column.
results.to_csv('titanic/results.csv', index=False)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值