import numpy as np
from matplotlib import pyplot as plt


def load_dataset():
    """Load the toy 2-D dataset from ``testSet.txt``.

    Each line holds two whitespace-separated feature values and an integer
    class label. A constant 1.0 is prepended to every feature row as the
    intercept (bias) term.

    :return: (feature rows as ``[1.0, x1, x2]`` lists, list of int labels)
    """
    features, labels = [], []
    with open('testSet.txt', 'r') as fh:
        for raw in fh:
            cols = raw.split()
            features.append([1.0, float(cols[0]), float(cols[1])])
            labels.append(int(cols[2]))
    return features, labels


def sigmoid(x):
    """Numerically safe logistic function ``1 / (1 + e**-x)``.

    Clipping the argument prevents the overflow RuntimeWarning that
    ``np.exp(-x)`` emits for large-magnitude negative inputs; within
    [-500, 500] the result is unchanged to double precision.

    :param x: scalar, ndarray or np.matrix of raw scores
    :return: values in (0, 1), same container type as the input
    """
    x = np.clip(x, -500, 500)
    return 1.0 / (1.0 + np.exp(-x))


def grad_ascent(data_list, label_list, alpha=0.001, max_cycles=500):
    """Full-batch gradient ascent for logistic regression.

    Uses plain 2-D ndarrays and ``@`` instead of the deprecated
    ``np.matrix``; the arithmetic is identical.

    :param data_list: feature rows (first column is the 1.0 bias term)
    :param label_list: 0/1 class labels, one per row
    :param alpha: learning rate (default matches the original hard-coded value)
    :param max_cycles: number of full-batch iterations (default as before)
    :return: (n, 1) ndarray of learned weights
    """
    data_arr = np.asarray(data_list, dtype=float)                   # (m, n)
    label_arr = np.asarray(label_list, dtype=float).reshape(-1, 1)  # (m, 1)
    _, n = data_arr.shape
    weight = np.ones((n, 1))
    for _ in range(max_cycles):
        h = sigmoid(data_arr @ weight)       # predicted probabilities, (m, 1)
        error = label_arr - h                # ascent direction per sample
        weight = weight + alpha * (data_arr.T @ error)
    return weight


def plot_best_fit(weight):
    """Scatter the two classes of testSet.txt and draw the decision line.

    The boundary is where ``w0 + w1*x1 + w2*x2 = 0``, i.e.
    ``x2 = (-w0 - w1*x1) / w2``.

    :param weight: (3, 1) weight matrix/array from a training routine
    """
    data_list, label_list = load_dataset()
    points = np.array(data_list)
    labels = np.array(label_list)
    pos = labels == 1
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(points[pos, 1], points[pos, 2], s=10, c='red', marker='s')
    ax.scatter(points[~pos, 1], points[~pos, 2], s=10, c='green')
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weight[0, 0] - weight[1, 0] * x) / weight[2, 0]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def random_grad_ascent(data_list, label_list):
    """Stochastic gradient ascent: one weight update per training sample.

    The original body repeated the *full-batch* update m times — nothing
    stochastic, just ``grad_ascent`` with fewer cycles. This version performs
    the intended per-sample updates, one pass over the data.

    :param data_list: feature rows (first column is the 1.0 bias term)
    :param label_list: 0/1 class labels, one per row
    :return: (n, 1) ndarray of learned weights
    """
    data_arr = np.asarray(data_list, dtype=float)
    label_arr = np.asarray(label_list, dtype=float)
    m, n = data_arr.shape
    alpha = 0.01
    weight = np.ones((n, 1))
    for i in range(m):
        h = sigmoid(float(data_arr[i] @ weight))   # scalar prediction for row i
        error = label_arr[i] - h
        weight = weight + alpha * error * data_arr[i].reshape(n, 1)
    return weight


def random_grad_ascent1(data_list, label_list, num=150):
    """Improved stochastic gradient ascent with a decaying learning rate.

    Each of the ``num`` passes visits every sample exactly once in random
    order (sampling without replacement), and ``alpha`` shrinks as training
    progresses but never reaches zero.

    :param data_list: feature rows (first column is the 1.0 bias term)
    :param label_list: 0/1 class labels, one per row
    :param num: number of full passes over the data
    :return: (n, 1) np.matrix of learned weights
    """
    data_mat = np.matrix(data_list)
    label_mat = np.matrix(label_list).transpose()
    m, n = data_mat.shape
    weight = np.ones((n, 1))
    for i in range(num):
        # Must be a list: range() does not support item deletion, so the
        # original (commented-out) without-replacement sampling never worked.
        data_index = list(range(m))
        for j in range(m):
            # Decaying step size: large early steps, constant 0.01 floor.
            alpha = 4 / (1.0 + i + j) + 0.01
            rand_index = int(np.random.uniform(0, len(data_index)))
            sample = data_index[rand_index]
            h = sigmoid(data_mat[sample] * weight)
            # Bug fix: the label must come from the same row as the features;
            # the original indexed label_mat[rand_index], which diverges from
            # the feature row once deletion shrinks data_index.
            error = label_mat[sample] - h
            weight = weight + alpha * data_mat[sample].transpose() * error
            del data_index[rand_index]
    return weight


def classify_vector(x, weight):
    """Predict the class (1.0 or 0.0) for feature vector ``x``.

    The original ``sum(x * weight)`` only produced a scalar because
    ``weight`` happened to be an (n, 1) np.matrix; with a plain (n, 1)
    ndarray the product broadcasts to (n, n) and the comparison raises.
    An explicit dot product over flattened vectors works for np.matrix,
    2-D ndarray and flat ndarray weights alike.

    :param x: length-n feature vector (bias term included)
    :param weight: trained weights, any (n, 1) or flat container
    :return: 1.0 if the predicted probability exceeds 0.5, else 0.0
    """
    w = np.asarray(weight, dtype=float).ravel()
    score = float(np.dot(np.asarray(x, dtype=float).ravel(), w))
    prob = sigmoid(score)
    return 1.0 if prob > 0.5 else 0.0


def colic_test():
    """Train on horseColicTraining.txt and return the error rate on
    horseColicTest.txt.

    Each data line holds 21 tab-separated features followed by a class label.

    :return: fraction of misclassified test vectors
    """
    def _parse(line):
        # First 21 tab-separated columns are features; column 21 is the label.
        cols = line.strip().split("\t")
        return [float(v) for v in cols[:21]], cols[21]

    train_set, train_label = [], []
    with open('horseColicTraining.txt', 'r') as fh:
        for line in fh:
            feats, lab = _parse(line)
            train_set.append(feats)
            train_label.append(float(lab))
    train_weight = random_grad_ascent1(data_list=train_set, label_list=train_label)

    error_count = 0
    num_test_vec = 0.0
    with open('horseColicTest.txt', 'r') as fh:
        for line in fh:
            num_test_vec += 1.0
            feats, lab = _parse(line)
            if int(classify_vector(np.array(feats), train_weight)) != int(lab):
                error_count += 1
    return error_count / num_test_vec


def multi_test():
    """Run colic_test ten times and print the summed and mean error rate."""
    num_test = 10
    error_sum = sum(colic_test() for _ in range(num_test))
    print(f"num_test={num_test}, error_sum={error_sum},error_rate={error_sum / num_test}")


if __name__ == '__main__':
    data_list, label_list = load_dataset()
    # Train with each gradient-ascent variant in turn and visualise the
    # decision boundary it produces, then evaluate on the horse-colic data.
    trainers = (
        grad_ascent,
        random_grad_ascent,
        lambda d, l: random_grad_ascent1(d, l, num=150),
    )
    for trainer in trainers:
        plot_best_fit(trainer(data_list, label_list))
    multi_test()
# Other logistic regression examples, including implementations built on
# mainstream machine-learning frameworks, are available at:
# https://gitee.com/navysummer/machine-learning/tree/master/logistic