# 机器学习逻辑回归模型总结——从原理到sklearn实践

## 0x00 基本原理

$$y \in \{0, 1, 2, 3, \dots, n\}$$

显然不能使用线性回归拟合。

$$h_\theta(x) = \frac{1}{1 + e^{-\theta^T x}}$$

$$P(y=0 \mid x;\theta) + P(y=1 \mid x;\theta) = 1$$

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\frac{1}{2}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$$

## 0x01 算法实现

function g = sigmoid(z)
% SIGMOID Element-wise logistic function g = 1 ./ (1 + exp(-z)).
%   z : scalar, vector, or matrix
%   g : same size as z, values in (0, 1)
% The original preallocated g with zeros(size(z)) and then immediately
% overwrote it; the preallocation is dead code and has been removed.
g = 1 ./ (1 + exp(-z));
end

Cost Function的实现：

function [J, grad] = costFunction(theta, X, y)
% COSTFUNCTION Logistic regression cross-entropy cost and gradient.
%   theta : (n x 1) parameter vector
%   X     : (m x n) design matrix, one example per row
%   y     : (m x 1) labels in {0, 1}
%   J     : scalar negative log-likelihood averaged over the m examples
%   grad  : (n x 1) gradient of J with respect to theta
%
% Fixes vs. original:
%   - removed "temp = temp(:,size(temp,2))": sigmoid(X*theta) is already an
%     (m x 1) column vector, so that line was a no-op;
%   - grad was never preallocated, so the element-wise loop grew it as a
%     (1 x n) row vector, mismatching theta's shape; the vectorized form
%     below returns an (n x 1) column consistent with theta.
m = length(y);

% Hypothesis for every example, (m x 1).
h = sigmoid(X * theta);

% Cross-entropy: -(1/m) * sum(y*log(h) + (1-y)*log(1-h)).
J = (1/m) * sum(-y .* log(h) - (1 - y) .* log(1 - h));

% Vectorized gradient: grad(j) = (1/m) * sum((h - y) .* X(:,j)).
grad = (1/m) * (X' * (h - y));
end

function p = predict(theta, X)
% PREDICT Binary class labels for a trained logistic regression model.
%   theta : (n x 1) parameter vector
%   X     : (m x n) design matrix, one example per row
%   p     : (m x 1) vector of 0/1 predictions
%
% sigmoid is strictly increasing with sigmoid(0) = 0.5, so the original
% per-example test sigmoid(X(i,:)*theta) >= 0.5 is equivalent to the
% linear test X*theta >= 0; this replaces the loop with one vectorized
% comparison. double() keeps the original numeric (not logical) output.
% Also removed the stray ";" after the closing "end".
p = double(X * theta >= 0);
end

## 0x03 sklearn库实践

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, roc_curve, auc

skiprows=[2], names=['score1','score2','result'])
score_data = data.loc[:,['score1','score2']]
result_data = data.result

p = 0
for i in xrange(10):
x_train, x_test, y_train, y_test = \
train_test_split(score_data, result_data, test_size = 0.2)
model = LogisticRegression(C=1e9)
model.fit(x_train, y_train)
predict_y = model.predict(x_test)
p += np.mean(predict_y == y_test)

# 绘制图像
pos_data = data[data.result == 1].loc[:,['score1','score2']]
neg_data = data[data.result == 0].loc[:,['score1','score2']]

h = 0.02
x_min, x_max = score_data.loc[:, ['score1']].min() - .5, score_data.loc[:, ['score1']].max() + .5
y_min, y_max = score_data.loc[:, ['score2']].min() - .5, score_data.loc[:, ['score2']].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])

# 绘制边界和散点
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
plt.scatter(x=pos_data.score1, y=pos_data.score2, color='black', marker='o')
plt.scatter(x=neg_data.score1, y=neg_data.score2, color='red', marker='*')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()

# 模型表现
precision, recall, thresholds = precision_recall_curve(y_test, answer)
print(classification_report(y_test, report, target_names = ['neg', 'pos']))
print("average precision:", p/100)  

               precision    recall  f1-score   support

neg       0.88      0.88      0.88         8
pos       0.92      0.92      0.92        12

avg / total       0.90      0.90      0.90        20
('average precision:', 0.089999999999999997)

09-30

05-13 2万+
07-05 1万+
06-12 141
06-16 8万+
07-05 116
06-14 2996
09-07 1643