Logistic Regression to do Binary Classification

最新推荐文章于 2021-09-16 11:25:57 发布

小小程序师

最新推荐文章于 2021-09-16 11:25:57 发布

阅读量1.4k

点赞数 1

分类专栏： DM & ML Python 文章标签： machine learning python theano

本文链接：https://blog.csdn.net/geniusluzh/article/details/41898929

版权

Python 同时被 2 个专栏收录

13 篇文章 0 订阅

订阅专栏

DM & ML

11 篇文章 0 订阅

订阅专栏

使用python的theano编写Logistic Regression进行二分类学习，使用到的数据集可以到这里下载。

我们知道Logistic Regression是在一个多元线性函数的基础上加了一个非线性函数，常用的非线性函数是Sigmoid函数。加上sigmoid之后的输出我们认为是对应分类为1的概率，所以需要学习的参数就是线性加权的系数和截距（bias）。

h(x) = wx + b

g(x) = 1 / ( 1 + exp(-h(x)) ) = 1 / ( 1 + exp( -wx-b ) )

那么对应的分类为1的概率可表示为：

p(y=1 | x; w, b) = g(x)

那么对于一个带有已知标签 y（取值为0或1）的样本，其概率可以统一表示为：

p(y | x; w, b) = g(x)^y (1 - g(x))^(1-y)

于是训练的目标就是最大化训练数据的似然函数：将上面的概率对所有训练样本连乘，就得到拟合训练数据的似然函数。但是连乘在计算和数值精度上都有问题，所以通常对似然函数取对数。对单个样本取对数的结果是：

log(p) = ylog(g(x)) + (1-y)log(1-g(x))

这个式子正是交叉熵取负号后的形式，将它对所有训练数据累加就得到最后的log似然。在前面加一个负号就是负log似然，参数求解就是寻找使这个负log似然最小的参数。常采用的方法是梯度下降。

下面贴上一份借助python theano实现的二分类Logistic Regression，最后输出的是在训练数据上的错误率，有兴趣的同学可以看看。注意代码中的代价函数是对每个batch取平均的负log似然，并额外加了一个L2正则项（系数0.01）。代码中使用到的训练数据可以到这里下载。

# -*- coding: utf-8 -*-
"""
Created on Sun Nov 16 21:37:43 2014
@author: BrightHush
Example for Logistic Regression
"""

import time

import numpy

import theano
import theano.tensor as T

# Module-level handle to NumPy's global random module; LogisticRegression
# draws its initial weight vector from it via rng.randn.
rng = numpy.random

class LogisticRegression(object):
    """Binary logistic-regression model built as a Theano symbolic graph.

    The model computes ``p(y=1 | x) = 1 / (1 + exp(-(x . w) - b))`` and
    predicts class 1 whenever that probability exceeds 0.5.

    Attributes set by the constructor:
        w: shared weight vector of length ``n_in``.
        b: shared scalar bias.
        p_given_x: symbolic probability of class 1.
        y_given_x: symbolic thresholded prediction (``p_given_x > 0.5``).
        params: list ``[w, b]`` of the trainable shared variables.
    """

    def __init__(self, input, n_in):
        """Create the parameters and the symbolic prediction expressions.

        :param input: symbolic Theano variable holding the examples; it is
            combined with the weights through ``T.dot(input, w)`` (the
            caller in this file passes a ``T.matrix``).
        :param n_in: number of input features, i.e. the length of ``w``.
        """
        # Weights start from standard-normal noise drawn from the
        # module-level ``rng``.
        self.w = theano.shared(
            value=rng.randn(n_in),
            name='w',
            borrow=True)

        self.b = theano.shared(value=.10, name='b')

        # Sigmoid of the affine score input.w + b.
        self.p_given_x = 1 / (1 + T.exp(-T.dot(input, self.w) - self.b))
        self.y_given_x = self.p_given_x > 0.5

        self.params = [self.w, self.b]

    def negative_log_likelihood(self, y, l2_reg=0.01):
        """Return the regularised training cost as a symbolic scalar.

        The cost is the mean per-example negative log-likelihood plus
        ``l2_reg`` times the squared L2 norm of the weights (the bias is
        not penalised).

        :param y: symbolic 0/1 labels aligned with the model input.
        :param l2_reg: weight of the L2 penalty; the default 0.01 matches
            the value that used to be hard-coded here.
        """
        nll = (-y * T.log(self.p_given_x)
               - (1 - y) * T.log(1 - self.p_given_x))
        return nll.mean() + l2_reg * (self.w ** 2).sum()

    def errors(self, y):
        """Return the symbolic fraction of examples whose prediction != y.

        :param y: symbolic 0/1 labels aligned with the model input.
        """
        return T.mean(T.neq(self.y_given_x, y))

def generate_data():
    """Load the training set and wrap it for use in Theano functions.

    The features and labels come from ``read_data()``. Both arrays are
    stored in Theano shared variables with dtype ``theano.config.floatX``;
    the labels are then exposed through a symbolic cast to ``int32``.

    :return: tuple ``(x_shared, y_int)`` where ``x_shared`` is the shared
        feature matrix and ``y_int`` is the symbolic int32 view of the
        shared label vector.
    """
    # The random placeholder dataset that used to be generated here was
    # always overwritten by the file contents, so it has been removed.
    x, y = read_data()

    x_shared = theano.shared(
        numpy.asarray(x, dtype=theano.config.floatX),
        borrow=True)
    y_shared = theano.shared(
        numpy.asarray(y, dtype=theano.config.floatX),
        borrow=True)
    return x_shared, T.cast(y_shared, 'int32')

def sgd_optimization(learning_rate=0.13, n_epochs=1000, batch_size=100):
    """Train the logistic-regression model with mini-batch gradient descent.

    After every epoch the error rate on the training set is computed and
    printed together with the best error seen so far; the elapsed time in
    seconds is printed at the end.

    :param learning_rate: step size of the gradient-descent updates.
    :param n_epochs: number of passes over the training set.
    :param batch_size: number of examples per mini-batch. Only complete
        batches are used, so up to ``batch_size - 1`` trailing examples
        are left out of both training and evaluation.
    :raises ValueError: if the dataset holds fewer than ``batch_size``
        examples, because no complete mini-batch could be formed.
    """
    train_x, train_y = generate_data()
    n_examples, n_features = train_x.get_value(borrow=True).shape
    # Explicit floor division keeps the batch count an integer under both
    # Python 2 and Python 3 division rules.
    n_batches = n_examples // batch_size
    if n_batches == 0:
        raise ValueError(
            'batch_size %d is larger than the %d available examples'
            % (batch_size, n_examples))

    index = T.lscalar()

    x = T.matrix('x')
    y = T.ivector('y')

    lr = LogisticRegression(x, n_features)
    cost = lr.negative_log_likelihood(y)

    print('compile function test_model...')
    test_model = theano.function(
        inputs=[index],
        outputs=lr.errors(y),
        givens={
            x: train_x[index * batch_size: (index + 1) * batch_size],
            y: train_y[index * batch_size: (index + 1) * batch_size]
        })

    # Plain gradient descent on both parameters.
    g_w = T.grad(cost=cost, wrt=lr.w)
    g_b = T.grad(cost=cost, wrt=lr.b)
    updates = [(lr.w, lr.w - learning_rate * g_w),
               (lr.b, lr.b - learning_rate * g_b)]

    print('complie function train_model...')
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_x[index * batch_size: (index + 1) * batch_size],
            y: train_y[index * batch_size: (index + 1) * batch_size]
        })

    best_train_error = numpy.inf
    # time.time() replaces time.clock(), which is platform dependent and
    # no longer exists in Python 3.8+.
    start_time = time.time()
    for epoch in range(n_epochs):
        for minibatch_index in range(n_batches):
            train_model(minibatch_index)

        # Mean of the per-batch error rates over the training set.
        train_error = numpy.mean(
            [test_model(i) for i in range(n_batches)])
        if best_train_error > train_error:
            best_train_error = train_error

        print('epoch %d, best_train_error %lf, train_error %lf'
              % (epoch, best_train_error, train_error))
    end_time = time.time()
    print('cost %d' % (end_time - start_time))


def read_data(path='titanic.dat', skiprows=8):
    """Load a comma-separated dataset and min-max scale its features.

    Every column except the last is treated as a feature; the last column
    is the label, where ``-1.0`` maps to class 0 and any other value maps
    to class 1. Each feature is rescaled to the range [0, 1].

    :param path: file to load. The default is resolved relative to the
        current working directory, so it works on Windows and POSIX alike.
    :param skiprows: number of leading lines to skip before the data
        (the default 8 is the value that used to be hard-coded).
    :return: tuple ``(x, y)`` with ``x`` a 2-D float array of scaled
        features and ``y`` a 1-D integer array of 0/1 labels.
    """
    print('load data...')
    # ndmin=2 keeps a single-row file two-dimensional.
    data = numpy.loadtxt(path, delimiter=',', skiprows=skiprows, ndmin=2)
    x = data[:, :-1]
    y = numpy.where(data[:, -1] == -1.0, 0, 1)
    print('%d examples, %d columns every row'
          % (data.shape[0], data.shape[1]))

    # Normalize the features to [0, 1].
    feature_min = x.min(0)
    feature_range = x.max(0) - feature_min
    # A constant column has zero range; divide by 1 instead so it becomes
    # all zeros rather than NaN.
    feature_range[feature_range == 0] = 1.0
    x = (x - feature_min) / feature_range
    print('%s %s' % (x.min(0), x.max(0)))

    return x, y


# Script entry point: run the training loop with its default
# hyper-parameters when the file is executed directly.
if __name__ == '__main__':
    sgd_optimization()