数据来源:https://www.kaggle.com/c/titanic
Training
import pandas
import numpy
import time
import matplotlib.pyplot as plt
%matplotlib inline
def _min_max_scale(column):
    """Linearly rescale a numeric pandas Series onto the range [0, 1].

    Missing values are skipped when locating the minimum and maximum.  A
    column without any spread (max == min) maps to zeros rather than
    dividing by zero.
    """
    lowest = column.min()
    span = column.max() - lowest
    if not span > 0:
        return column * 0.0
    return (column - lowest) / span


def prepareData(filename):
    """Load a Titanic CSV and turn it into a numeric-feature DataFrame.

    Steps applied, in order:
      * ``Sex`` is encoded as female -> 0, male -> 1.
      * ``Embarked`` is encoded as S -> 1, C -> 2, Q -> 3; missing or
        unrecognised ports are replaced by the most frequent observed port.
      * ``Age`` and ``Fare`` have missing values replaced by the mean of
        the observed values and are then min-max scaled to [0, 1] using the
        minimum and maximum of this file only.
      * A constant column ``ones`` is inserted at position 0 so that the
        intercept can be learned as an ordinary coefficient.

    No column is removed here; callers drop the columns they do not want to
    feed to the model.

    Args:
        filename: path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: the preprocessed table.
    """
    data = pandas.read_csv(filename)

    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1})

    embarked = data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
    # mode() ignores missing entries, so the filler is always a real port
    # code; 0 is only used when the file has no known port at all.
    observed_ports = embarked.mode()
    fallback_port = observed_ports.iloc[0] if len(observed_ports) else 0
    data['Embarked'] = embarked.fillna(fallback_port)

    for column in ('Age', 'Fare'):
        # mean() averages the observed values only; zero-filling the gaps
        # before averaging would drag the replacement value down.
        filled = data[column].fillna(data[column].mean())
        data[column] = _min_max_scale(filled)

    data.insert(0, 'ones', 1)
    return data
def run(X, Y, theta, alpha, steps):
    """Fit logistic-regression coefficients with batch gradient descent.

    Each iteration applies ``theta <- theta - alpha * getGradient(X, Y, theta)``
    and appends one CSV row (every coefficient followed by the cost) to
    ``titanic/model.txt``.  The file is overwritten on every call and starts
    with a ``theta_0,...,cost`` header row.

    Args:
        X: feature rows, forwarded unchanged to getCost / getGradient.
        Y: labels, forwarded unchanged to getCost / getGradient.
        theta: initial coefficient vector.  It is copied, so the caller's
            array is not modified.
        alpha: learning rate (step size).
        steps: number of gradient updates to perform (an int).

    Returns:
        tuple: ``(costs, theta, time_spent)`` where ``costs`` holds
        ``steps + 1`` values (the initial cost first), ``theta`` is the final
        coefficient vector and ``time_spent`` is the elapsed wall-clock time
        in seconds, file writes included.
    """
    init_time = time.time()
    # Private float copy: an in-place update would overwrite the caller's
    # starting point and would fail outright for an integer-typed vector.
    theta = numpy.array(theta, dtype=float)
    costs = [getCost(X, Y, theta)]
    with open('titanic/model.txt', 'w') as f:
        header = ['theta_' + str(i) for i in range(len(theta))]
        f.write(','.join(header + ['cost']) + '\n')
        for _ in range(steps):
            theta = theta - alpha * getGradient(X, Y, theta)
            cost = getCost(X, Y, theta)
            costs.append(cost)
            row = [str(item) for item in theta]
            f.write(','.join(row + [str(cost)]) + '\n')
    time_spent = time.time() - init_time
    return costs, theta, time_spent
def getGradient(X, Y, theta):
    """Return the gradient of the mean logistic loss with respect to theta.

    The result is ``X.T . (sigmoid(X . theta) - Y) / len(Y)``, evaluated
    once for all coefficients instead of re-evaluating the sigmoid of every
    sample for every coefficient.

    Args:
        X: 2-D array-like of feature rows (one row per sample).
        Y: 1-D array-like of 0/1 labels, one per row of X.
        theta: 1-D array-like of coefficients, one per column of X.

    Returns:
        numpy.ndarray: float vector with one entry per coefficient.

    Raises:
        ZeroDivisionError: if Y is empty.
    """
    n_samples = len(Y)
    if n_samples == 0:
        raise ZeroDivisionError('getGradient needs at least one sample')
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    scores = numpy.dot(features, numpy.asarray(theta, dtype=float))
    # sigmoid(z) == exp(-log(1 + exp(-z))); logaddexp evaluates the log term
    # without overflowing for strongly negative scores.
    probabilities = numpy.exp(-numpy.logaddexp(0.0, -scores))
    return numpy.dot(features.T, probabilities - labels) / n_samples
def getCost(X, Y, theta):
    """Return the mean logistic loss (negative log-likelihood per sample).

    For scores ``z = X . theta`` the value is
    ``-(1 / len(Y)) * sum(y * z - log(1 + exp(z)))``.

    Args:
        X: 2-D array-like of feature rows (one row per sample).
        Y: 1-D array-like of 0/1 labels, one per row of X.
        theta: 1-D array-like of coefficients, one per column of X.

    Returns:
        numpy.float64: the average loss over all samples.

    Raises:
        ZeroDivisionError: if Y is empty.
    """
    n_samples = len(Y)
    if n_samples == 0:
        raise ZeroDivisionError('getCost needs at least one sample')
    features = numpy.asarray(X, dtype=float)
    labels = numpy.asarray(Y, dtype=float)
    scores = numpy.dot(features, numpy.asarray(theta, dtype=float))
    # logaddexp(0, z) == log(1 + exp(z)) but stays finite for large z, where
    # exp(z) alone would overflow and turn the whole cost into inf.
    log_likelihood = numpy.sum(labels * scores - numpy.logaddexp(0.0, scores))
    return -log_likelihood / n_samples
def getAccuracy(train_X, train_Y, theta):
    """Return the fraction of samples whose thresholded prediction equals its label.

    A sample is predicted as 1 when its sigmoid score
    ``1 / (1 + exp(-theta . x))`` is at least 0.5, otherwise as 0.

    Args:
        train_X: iterable of feature rows.
        train_Y: iterable of labels, paired with the rows of train_X in order.
        theta: coefficient vector used for the dot product with each row.

    Returns:
        float: number of matching predictions divided by the number of
        predictions made.
    """
    predictions = [
        1 if 1/(1+numpy.exp(-numpy.dot(theta, row))) >= 0.5 else 0
        for row in train_X
    ]
    # Count the positions where the predicted class agrees with the label.
    hits = sum(1 for guess, truth in zip(predictions, train_Y) if guess == truth)
    return float(hits) / len(predictions)
# Load the training CSV and run it through the preprocessing in prepareData.
train_data = prepareData('titanic/train.csv')
# Notebook preview: show the first five preprocessed rows.
train_data.head(5)
| | ones | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 0.271174 | 1 | 0 | A/5 21171 | 0.014151 | NaN | 1 |
| 1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 0.472229 | 1 | 0 | PC 17599 | 0.139136 | C85 | 2 |
| 2 | 1 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 0.321438 | 0 | 0 | STON/O2. 3101282 | 0.015469 | NaN | 1 |
| 3 | 1 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 0.434531 | 1 | 0 | 113803 | 0.103644 | C123 | 1 |
| 4 | 1 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 0.434531 | 0 | 0 | 373450 | 0.015713 | NaN | 1 |
# Remove the identifier / free-text columns once, then split what is left
# into the label vector and the feature matrix.
model_frame = train_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
train_Y = model_frame['Survived'].values
train_X = model_frame.drop(['Survived'], axis=1).values

# Learning rate, iteration count and a random starting point with one
# coefficient per feature column.
alpha = 0.001
steps = 10000
theta = numpy.random.random(len(train_X[1]))

costs, theta, time_spent = run(train_X, train_Y, theta, alpha, steps)
accuracy = getAccuracy(train_X, train_Y, theta)

fig = plt.figure(figsize=(18,5))
step_axis = range(steps+1)

# Left panel: cost over the whole run, titled with run time and training accuracy.
ax1 = fig.add_subplot(121)
ax1.plot(step_axis, costs)
ax1.set_title('Logistic Regression for Titanic Problem -- Time spent: %f\nAccuracy: %f' % (time_spent, accuracy))
ax1.set_xlabel('steps')
ax1.set_ylabel('cost')

# Right panel: zoom on the tail of the cost curve.
# NOTE(review): the [-1000:-1] slice leaves out the very last step -- confirm this is intended.
ax2 = fig.add_subplot(122)
ax2.plot(step_axis[-1000:-1], costs[-1000:-1])
ax2.set_xlabel('steps')
ax2.set_ylabel('cost')

Testing
# Load the test CSV and run it through the preprocessing in prepareData.
test_data = prepareData('titanic/test.csv')
# Notebook preview: show the first five preprocessed rows.
test_data.head(5)
| | ones | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 892 | 3 | Kelly, Mr. James | 1 | 0.452723 | 0 | 0 | 330911 | 0.015282 | NaN | 3 |
| 1 | 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | 0 | 0.617566 | 1 | 0 | 363272 | 0.013663 | NaN | 1 |
| 2 | 1 | 894 | 2 | Myles, Mr. Thomas Francis | 1 | 0.815377 | 0 | 0 | 240276 | 0.018909 | NaN | 3 |
| 3 | 1 | 895 | 3 | Wirz, Mr. Albert | 1 | 0.353818 | 0 | 0 | 315154 | 0.016908 | NaN | 1 |
| 4 | 1 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | 0 | 0.287881 | 1 | 1 | 3101298 | 0.023984 | NaN | 1 |
# Feature matrix for the test passengers: the identifier / free-text columns
# are removed, every remaining column is fed to the model.
test_X = test_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1).values

# Sigmoid score for every passenger at once; a score of at least 0.5 is
# predicted as survived (1), anything else as not survived (0).
probabilities = 1/(1+numpy.exp(-numpy.dot(test_X, theta)))
Y_hat = [1 if p >= 0.5 else 0 for p in probabilities]

results = pandas.DataFrame(Y_hat, columns=['Survived'])
results.insert(0, 'PassengerId', test_data['PassengerId'])
# index=False keeps the file to exactly the PassengerId and Survived columns;
# without it the DataFrame index is written as an extra unnamed first column.
results.to_csv('titanic/results.csv', index=False)
本文介绍了一个使用逻辑回归算法预测泰坦尼克号乘客生存情况的项目。通过对原始数据进行预处理,包括缺失值填充、特征编码等步骤,并采用梯度下降法训练模型,最终在训练集上达到一定准确率。
1227

被折叠的 条评论
为什么被折叠?



