逻辑回归Python代码

最新推荐文章于 2024-08-21 09:48:53 发布

冯校

最新推荐文章于 2024-08-21 09:48:53 发布

阅读量4.8k

点赞数 1

分类专栏：机器学习

本文链接：https://blog.csdn.net/xyfengbo/article/details/51799206

版权

机器学习专栏收录该内容

2 篇文章 0 订阅

订阅专栏

在工程应用中，我们得到一组二维数据（每行依次为特征 x1、特征 x2 和类别标签 0/1），如下所示：

-0.017612   14.053064   0
-1.395634   4.662541    1
-0.752157   6.538620    0
-1.322371   7.152853    0
0.423363    11.054677   0
0.406704    7.067335    1
0.667394    12.741452   0
-2.460150   6.866805    1
0.569411    9.548755    0
-0.026632   10.427743   0
0.850433    6.920334    1
1.347183    13.175500   0
1.176813    3.167020    1
-1.781871   9.097953    0
-0.566606   5.749003    1
0.931635    1.589505    1
-0.024205   6.151823    1
-0.036453   2.690988    1
-0.196949   0.444165    1
1.014459    5.754399    1
1.985298    3.230619    1
-1.693453   -0.557540   1
-0.576525   11.778922   0
-0.346811   -1.678730   1
-2.124484   2.672471    1
1.217916    9.597015    0
-0.733928   9.098687    0
-3.642001   -1.618087   1
0.315985    3.523953    1
1.416614    9.619232    0
-0.386323   3.989286    1
0.556921    8.294984    1
1.224863    11.587360   0
-1.347803   -2.406051   1
1.196604    4.951851    1
0.275221    9.543647    0
0.470575    9.332488    0
-1.889567   9.542662    0
-1.527893   12.150579   0
-1.185247   11.309318   0
-0.445678   3.297303    1
1.042222    6.105155    1
-0.618787   10.320986   0
1.152083    0.548467    1
0.828534    2.676045    1
-1.237728   10.549033   0
-0.683565   -2.166125   1
0.229456    5.921938    1
-0.959885   11.555336   0
0.492911    10.993324   0
0.184992    8.721488    0
-0.355715   10.325976   0
-0.397822   8.058397    0
0.824839    13.730343   0
1.507278    5.027866    1
0.099671    6.835839    1
-0.344008   10.717485   0
1.785928    7.718645    1
-0.918801   11.560217   0
-0.364009   4.747300    1
-0.841722   4.119083    1
0.490426    1.960539    1
-0.007194   9.075792    0
0.356107    12.447863   0
0.342578    12.281162   0
-0.810823   -1.466018   1
2.530777    6.476801    1
1.296683    11.607559   0
0.475487    12.040035   0
-0.783277   11.009725   0
0.074798    11.023650   0
-1.337472   0.468339    1
-0.102781   13.763651   0
-0.147324   2.874846    1
0.518389    9.887035    0
1.015399    7.571882    0
-1.658086   -0.027255   1
1.319944    2.171228    1
2.056216    5.019981    1
-0.851633   4.375691    1
-1.510047   6.061992    0
-1.076637   -3.181888   1
1.821096    10.283990   0
3.010150    8.401766    1
-1.099458   1.688274    1
-0.834872   -1.733869   1
-0.846637   3.849075    1
1.400102    12.628781   0
1.752842    5.468166    1
0.078557    0.059736    1
0.089392    -0.715300   1
1.825662    12.693808   0
0.197445    9.744638    0
0.126117    0.922311    1
-0.679797   1.220530    1
0.677983    2.556666    1
0.761349    10.693862   0
-2.168791   0.143632    1
1.388610    9.341997    0
0.317029    14.739025   0

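下面对这组数据进行逻辑回归分类。先将上面的数据保存为 data/data.txt（后面的 loadFile 函数会从该路径读取）。

新建一个 LogRegresTest.py（文件名的大小写需与后文的 `from LogRegresTest import *` 保持一致），如下所示：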

# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
import time

def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + e^(-x)), evaluated element-wise.

    Works on a scalar as well as on a NumPy array or matrix; for array
    input the result has the same shape as ``x``.
    """
    decay = np.exp(-x)
    return 1.0 / (1 + decay)

def train_logRegres(train_x, train_y, opts):
    """Train logistic-regression weights with one of three optimizers.

    Args:
        train_x: 2-D np.matrix with one row per sample and one column per
            feature. The ``*`` operator is used as matrix multiplication,
            so a plain ndarray is not suitable.
        train_y: np.matrix whose column 0 holds the label of each sample.
        opts: dict with the keys
            'alpha'        -- step size for 'gradDescent' and
                              'stocGradDescent',
            'maxIter'      -- number of passes over the data,
            'optimizeType' -- 'gradDescent', 'stocGradDescent' or
                              'smoothStocGradDescent'.

    Returns:
        The trained weights, one row per feature (initialised to ones).

    Raises:
        NameError: if opts['optimizeType'] is not a supported optimizer.
    """
    startTime = time.time()
    numSamples, numFeatures = np.shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    optimizeType = opts['optimizeType']

    # Validate before training so a bad option is reported even when
    # maxIter is 0 (the loop below would never reach the check).
    supportedTypes = ('gradDescent', 'stocGradDescent', 'smoothStocGradDescent')
    if optimizeType not in supportedTypes:
        raise NameError('Not support optimize method type!')

    weights = np.ones((numFeatures, 1))

    for k in range(maxIter):
        if optimizeType == 'gradDescent':
            # Batch update: one step using the error of every sample.
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif optimizeType == 'stocGradDescent':
            # One step per sample, visiting the samples in file order.
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        else:  # 'smoothStocGradDescent'
            # Visit every sample exactly once per pass, in random order.
            # A real list is required: a Python 3 range cannot be mutated.
            remaining = list(range(numSamples))
            for i in range(numSamples):
                # The step size decays with the iteration count but never
                # drops below 0.01; it replaces opts['alpha'] here.
                alpha = 4.0 / (1.0 + k + i) + 0.01
                pick = int(np.random.uniform(0, len(remaining)))
                # `pick` is a position inside `remaining`; map it to the
                # actual sample row and retire that row for this pass.
                sampleIndex = remaining.pop(pick)
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error

    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
    print(weights)
    return weights

def test_LogRegres(weights, test_x, test_y):
    """Return the fraction of samples that the weights classify correctly.

    A sample is predicted as class 1 when sigmoid(x * weights) > 0.5 and
    is compared against the truthiness of its label (non-zero means 1).

    Args:
        weights: trained weights, one row per feature.
        test_x: 2-D np.matrix, one row per sample (``*`` is used as matrix
            multiplication).
        test_y: np.matrix whose column 0 holds one label per sample.

    Returns:
        Accuracy as a float between 0 and 1.

    Raises:
        ZeroDivisionError: if test_x has no rows.
    """
    numSamples = np.shape(test_x)[0]
    # Score every sample in one matrix product instead of a Python loop
    # (the original loop also relied on the Python 2-only `xrange`).
    probabilities = np.asarray(sigmoid(test_x * weights))[:, 0]
    predicted = probabilities > 0.5
    actual = np.asarray(test_y)[:, 0] != 0
    matchCount = int(np.count_nonzero(predicted == actual))
    accuracy = float(matchCount) / numSamples
    return accuracy

def showLogRegres(weights, train_x, train_y):
    """Plot the samples and the decision line defined by the weights.

    Class 0 samples are drawn as red circles and class 1 samples as blue
    circles; the line w0 + w1*x1 + w2*x2 = 0 is drawn in green.

    Args:
        weights: three weights (w0, w1, w2), e.g. a (3, 1) matrix.
        train_x: 2-D matrix with exactly three columns; columns 1 and 2
            are used as the plot coordinates.
        train_y: matrix whose column 0 holds the label of each sample.

    Returns:
        1 when train_x does not have three columns (nothing is drawn);
        otherwise None after the plot window has been shown.
    """
    numSamples, numFeatures = np.shape(train_x)
    if numFeatures != 3:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    # Work on plain arrays so boolean masks and scalar reductions behave
    # the same for np.matrix and ndarray inputs.
    features = np.asarray(train_x)
    labels = np.asarray(train_y)[:, 0].astype(int)
    x1 = features[:, 1]
    x2 = features[:, 2]

    # draw all samples, one plot call per class
    isClass0 = labels == 0
    isClass1 = labels == 1
    plt.plot(x1[isClass0], x2[isClass0], 'or')
    plt.plot(x1[isClass1], x2[isClass1], 'ob')

    # draw the classify line between the smallest and largest x1
    min_x = x1.min()
    max_x = x1.max()
    # Convert each weight to a Python float explicitly; calling float() on
    # a one-element array is deprecated in recent NumPy releases.
    w0, w1, w2 = (float(w) for w in np.asarray(weights).ravel())
    y_min_x = (-w0 - w1 * min_x) / w2
    y_max_x = (-w0 - w1 * max_x) / w2
    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


再新建一个主程序文件（例如 logRegresMain.py），在其中定义 loadFile 和 logRegresMain 两个函数，如下：

# -*- coding: utf-8 -*-
import numpy as np
from LogRegresTest import *


def loadFile():
    """Load the training set from 'data/data.txt'.

    Each non-blank line holds three whitespace-separated numbers: two
    feature values followed by the label. A constant 1.0 is prepended to
    the features so the model is y = w0 + x1*w1 + x2*w2 (three weights).

    Returns:
        A pair (train_x, train_y) of np.matrix objects: train_x has one
        row [1.0, x1, x2] per sample and train_y is a column of labels.
    """
    train_x = []
    train_y = []
    # The context manager closes the file even if a line fails to parse.
    with open('data/data.txt') as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            if not lineArr:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            train_y.append(float(lineArr[2]))
    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    return np.asmatrix(train_x), np.asmatrix(train_y).transpose()



def logRegresMain():
    """Run the whole demo: load the data, train, evaluate and plot.

    The training set is also used as the test set, so the printed
    accuracy is the accuracy on the training data.
    """
    ## step 1: load data
    print("step 1: loading data...")
    train_x, train_y = loadFile()
    test_x = train_x
    test_y = train_y

    ## step 2: training
    print("step 2: training...")
    alpha = 0.01
    maxIter = 200
    # One of: gradDescent, stocGradDescent, smoothStocGradDescent
    optimizeType = 'gradDescent'

    opts = {'alpha': alpha, 'maxIter': maxIter, 'optimizeType': optimizeType}
    optimalWeights = train_logRegres(train_x, train_y, opts)

    ## step 3: testing
    print("step 3: testing...")
    accuracy = test_LogRegres(optimalWeights, test_x, test_y)

    ## step 4: show the result
    print("step 4: show the result...")
    print('The classify accuracy is: %.3f%%' % (accuracy * 100))
    showLogRegres(optimalWeights, train_x, train_y)


# Run the demo only when this file is executed directly as a script.
if __name__ == "__main__":
    logRegresMain()
本代码支持 gradDescent、stocGradDescent、smoothStocGradDescent 这三种优化算法。
在步长为 0.01、迭代次数为 200 的情况下，stocGradDescent 的分类效果最好。