机器学习初学代码(一) LogisticRegression

### LogReg.py:
# -*- coding: utf-8 -*-
# author: Xin Chen


# *****************************************************
# 参考:
# http://blog.csdn.net/zouxy09/article/details/20319673
# https://zhuanlan.zhihu.com/p/21627018?refer=uqer2015
# *****************************************************


from numpy import *
import matplotlib.pyplot as plt
import time

# calculate the sigmoid function
def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^(-inX)).

    ``inX`` may be a scalar or a numpy array/matrix; the function is
    applied element-wise.
    """
    denominator = 1 + exp(-inX)
    return 1.0 / denominator


# train a logistic regression model using one of several optimization algorithms
# input: train_x is a mat datatype, each row stands for one sample
#        train_y is a mat datatype too, each row is the corresponding label
#        opts holds the optimization options: step size, maximum number of
#        iterations and the optimizer type
def trainLogRegres(train_x, train_y, opts):
    """Fit logistic-regression weights with the optimizer chosen in ``opts``.

    Args:
        train_x: numpy matrix of samples, one row per sample. No intercept
            column is added here; include one in ``train_x`` if needed.
        train_y: numpy matrix of labels, one row per sample (column 0 is
            the label).
        opts: dict with the keys
            'alpha'        -- step size (ignored by 'smoothStocGradDescent',
                              which uses its own decaying step, and by
                              'newton'),
            'maxIter'      -- number of passes over the training data,
            'optimizeType' -- 'gradDescent', 'stocGradDescent',
                              'smoothStocGradDescent' or 'newton'.

    Returns:
        The weights as a (numFeatures x 1) column.

    Raises:
        NameError: if ``opts['optimizeType']`` is not one of the supported
            names (detected on the first iteration).
    """
    # measure the training time
    startTime = time.time()

    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    optimizeType = opts['optimizeType']
    weights = ones((numFeatures, 1))

    for k in range(maxIter):
        if optimizeType == 'gradDescent':
            # batch gradient ascent on the log-likelihood
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif optimizeType == 'stocGradDescent':
            # one update per sample, in the stored order
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif optimizeType == 'smoothStocGradDescent':
            # Visit every sample once per pass in random order, with a step
            # size that decays over time, to reduce cyclic fluctuations.
            # A real list is required so that entries can be deleted.
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                # randIndex is a position in dataIndex; the sample to use is
                # the one stored there. Features and label must both come
                # from that same sample.
                sampleIndex = dataIndex[randIndex]
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                del dataIndex[randIndex]  # do not reuse the sample in this pass
        elif optimizeType == 'newton':
            output = sigmoid(train_x * weights)
            gradient = train_x.transpose() * (train_y - output)
            # Negative Hessian of the log-likelihood: X^T * diag(p(1-p)) * X.
            # Scaling the rows of X by p(1-p) avoids building the n x n
            # diagonal matrix.
            curvature = multiply(output, 1 - output)
            hessian = train_x.transpose() * multiply(train_x, curvature)
            weights = weights + linalg.inv(hessian) * gradient
        else:
            raise NameError('Not support optimize method type!')
    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
    return weights

# test your trained Logistic Regression model on a given test set

def testLogRegres(weights, test_x, test_y):
    """Return the classification accuracy of ``weights`` on a test set.

    A sample is predicted positive when its sigmoid output exceeds 0.5, and
    the prediction is compared with the truth value of its label.

    Args:
        weights: column of model weights (numFeatures x 1).
        test_x: numpy matrix of samples, one row per sample.
        test_y: numpy matrix of labels, one row per sample (column 0 is
            the label; any non-zero value counts as positive).

    Returns:
        Fraction of correctly classified samples, as a float in [0, 1].

    Raises:
        ZeroDivisionError: if ``test_x`` has no rows.
    """
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    # range() instead of xrange() so the loop runs on Python 2 and 3 alike
    for i in range(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    return float(matchCount) / numSamples


# show your trained logistic regression model (only available with 2-D data)
# 只有样本X是二维的时候才适用

def showLogRegres(weights, train_x, train_y, xlabel='X1', ylabel='X2'):
    """Plot the samples and the learned decision boundary for 2-D data.

    Args:
        weights: the three model weights; the boundary drawn is
            weights[0] + weights[1] * x1 + weights[2] * x2 = 0.
        train_x: numpy matrix with exactly three columns; columns 1 and 2
            are plotted as the two axes (column 0 is not plotted).
        train_y: numpy matrix of labels, one row per sample; label 0 is
            drawn in red, label 1 in blue, other labels are not drawn.
        xlabel: label of the horizontal axis.
        ylabel: label of the vertical axis.

    Returns:
        1 if ``train_x`` does not have three columns (nothing is drawn),
        otherwise None after the figure has been shown.
    """
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    # Work on plain arrays so boolean masks select rows directly.
    features = asarray(train_x)
    labels = asarray(train_y)[:, 0].astype(int)

    # draw all samples, one scatter call per class
    for label, color in ((0, 'red'), (1, 'blue')):
        mask = labels == label
        plt.scatter(features[mask, 1], features[mask, 2], marker="o", c=color)

    # draw the classification line between the smallest and largest x1
    min_x = features[:, 1].min()
    max_x = features[:, 1].max()
    # Flatten to 1-D so each weight is a scalar instead of a 1-element array.
    w = asarray(weights, dtype=float).ravel()
    y_min_x = (-w[0] - w[1] * min_x) / w[2]
    y_max_x = (-w[0] - w[1] * max_x) / w[2]
    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


#### watermelon3alpha.py:

# -*- coding: utf-8 -*-
# author: Xin Chen

import numpy as np
import pandas as pd
import LogReg

# Data taken from Zhou Zhihua's book, p. 89.
data = {
    "index": list(range(1, 18)),
    "density": [0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437, 0.666, 0.243, 0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719],
    "sugar": [0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211, 0.091, 0.267, 0.057, 0.099, 0.161, 0.198, 0.370, 0.042, 0.103],
    "label": [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
}
data = pd.DataFrame(data)
# Constant column so the first weight acts as the intercept.
data["constant"] = 1.0
# np.asmatrix instead of np.mat: np.mat was removed in NumPy 2.0.
X = np.asmatrix(data[["constant", "density", "sugar"]].values)
Y = np.asmatrix(data["label"].values).T  # column of labels, one row per sample

opts = {'alpha': 0.1, 'maxIter': 1000, 'optimizeType': 'stocGradDescent'}
# 'optimizeType': 'gradDescent', 'stocGradDescent', 'smoothStocGradDescent', 'newton'
optimalWeights = LogReg.trainLogRegres(X, Y, opts)
# Single-argument print(...) calls produce the same output on Python 2 and 3.
print('optimalWeights=\n%s' % (optimalWeights,))

# Accuracy is measured on the training data itself.
accuracy = LogReg.testLogRegres(optimalWeights, X, Y)
print('The classify accuracy is: %.3f%%' % (accuracy * 100))
LogReg.showLogRegres(optimalWeights, X, Y, xlabel='density', ylabel='sugar')


 

转载于:https://my.oschina.net/u/3590872/blog/1217814

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值