【python】实现logistic regression

数据使用的是 UCI 机器学习库中的 Breast Cancer(乳腺癌)数据集。
Breast Cancer 数据集

# -coding: utf-8
import numpy as np
import random

# Ordered category values for the nine attribute columns (record fields 1..9).
# A value's position inside its list is the numeric code it is replaced with.
# NOTE(review): the attribute names in the trailing comments are inferred from
# the values and the UCI breast-cancer description -- confirm against the data.
ages = ['{}-{}'.format(low, low + 9) for low in range(10, 100, 10)]   # age band
menos = ['lt40', 'ge40', 'premeno']                                   # menopause
tumos = ['{}-{}'.format(low, low + 4) for low in range(0, 60, 5)]     # tumor size
# Regular 3-wide bands up to 33-35; the last band (36-39) is 4 wide.
invs = ['{}-{}'.format(low, low + 2) for low in range(0, 36, 3)] + ['36-39']
caps = ['yes', 'no']                                                  # node caps
degs = [str(grade) for grade in (1, 2, 3)]                            # malignancy degree
bres = ['left', 'right']                                              # breast side
quads = ['left_up', 'left_low', 'right_up', 'right_low', 'central']   # quadrant
irrs = ['yes', 'no']                                                  # irradiation

# Read the raw data file; every line is split on whitespace into its tokens.
# The `with` block closes the file even if reading fails part-way through.
# NOTE(review): the path is machine-specific -- adjust it before running elsewhere.
with open(r"C:\Users\65465\Documents\data\breast-cancer.txt") as f:
    data_list = [line.split() for line in f]

# Flatten the per-line token lists into one list of tokens ...
result = [token for tokens in data_list for token in tokens]

# ... then split every token on commas: `ala` holds one list of string fields
# per record (field 0 is the class label, fields 1..9 the attributes).
ala = [token.split(",") for token in result]

# Convert every field of every record in `ala` to a number, in place.
#
# Pass 1 replaces the class label by 0/1 and each known categorical value by
# its position in the matching category list, while collecting per-column
# sums and counts of the known codes.
# Pass 2 replaces each missing value ('?') by the mean code of its column.

# Class label (field 0) -> numeric target.
label_codes = {'no-recurrence-events': 0, 'recurrence-events': 1}

# Record field index -> ordered category list for that field.
column_categories = {
    1: ages,
    2: menos,
    3: tumos,
    4: invs,
    5: caps,
    6: degs,
    7: bres,
    8: quads,
    9: irrs,
}

# Per column: category value -> integer code (its index in the list).
column_codes = {
    col: {value: code for code, value in enumerate(categories)}
    for col, categories in column_categories.items()
}

code_sums = dict.fromkeys(column_codes, 0)      # sum of known codes per column
known_counts = dict.fromkeys(column_codes, 0)   # number of known values per column

for ele in ala:
    # Labels that are neither of the two expected strings are left untouched.
    ele[0] = label_codes.get(ele[0], ele[0])
    for col, codes in column_codes.items():
        code = codes.get(ele[col])
        if code is None:
            # '?' (missing) or an unexpected value: leave as-is for now.
            continue
        ele[col] = code
        code_sums[col] += code
        known_counts[col] += 1

# Mean code over the rows where the value is actually known. Dividing by the
# total number of rows instead would bias the mean towards 0 whenever a column
# has missing entries. A column with no known value at all falls back to 0.0.
column_means = {
    col: (code_sums[col] / known_counts[col]) if known_counts[col] else 0.0
    for col in column_codes
}

for ele in ala:                # fill in missing values with the column mean
    for col, mean_code in column_means.items():
        if ele[col] == '?':
            ele[col] = mean_code

# Randomly draw 95 records as the training set.
alasam = random.sample(ala, 95)

# The evaluation set is the complete data set (it includes the training rows).
# Field 0 is the label; fields 1..9 are the nine encoded attributes. The slice
# 1:10 keeps all nine -- stopping at index 8 would silently drop the last
# attribute even though it was encoded above.
test_labell = [ele[0] for ele in ala]
test_attrl = [ele[1:10] for ele in ala]

data_labell = [ele[0] for ele in alasam]
data_attrl = [ele[1:10] for ele in alasam]

# Convert the lists to matrices; labels become column vectors.
# np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
test_attr = np.asmatrix(test_attrl)
test_label = np.asmatrix(test_labell).transpose()
data_attr = np.asmatrix(data_attrl)
data_label = np.asmatrix(data_labell).transpose()


# Training-set dimensions, read from the feature matrix itself.
sample_count, feature_count = np.shape(data_attr)

# Weight column vector, all ones: one weight per feature plus one for the
# constant (bias) term.
w = np.ones((feature_count + 1, 1))

# Append a column of ones to the features so the last weight acts as the bias.
a = np.ones((sample_count, 1))
data_attr = np.c_[data_attr, a]

# Gradient-descent step size.
n = 1e-4

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise.

    Args:
        z: a scalar, ndarray or matrix of real values.

    Returns:
        Value(s) in [0, 1] with the same shape/type NumPy gives for ``z``.
    """
    # For very negative z, exp(-z) overflows to inf; 1 / (1 + inf) is still
    # the correct limit 0.0, so only the overflow warning/error is suppressed.
    with np.errstate(over="ignore"):
        return 1.0 / (1 + np.exp(-z))

def test(dataset, labelset, w):
    """Return the fraction of rows in ``dataset`` that ``w`` classifies correctly.

    A column of ones is appended to the features (bias term), ``sigmoid`` is
    applied to ``data . w``, and a row is predicted as class 1 when its score
    is strictly greater than 0.5, otherwise as class 0.

    Args:
        dataset: 2-D array-like of numeric features, one row per sample.
        labelset: the 0/1 labels, one per row (list, array or column matrix).
        w: weight column vector with one entry per feature plus a final
            entry for the bias.

    Returns:
        The accuracy as a float between 0 and 1.

    Raises:
        ValueError: if ``dataset`` is empty or the number of labels does not
            match the number of rows.
    """
    # Plain ndarrays are used throughout (np.mat was removed in NumPy 2.0).
    data = np.atleast_2d(np.asarray(dataset, dtype=float))
    if data.size == 0:
        raise ValueError("dataset must contain at least one row")
    rows = data.shape[0]

    labels = np.asarray(labelset).reshape(-1)
    if labels.shape[0] != rows:
        raise ValueError("labelset must contain exactly one label per dataset row")

    # Append the constant-1 column that pairs with the bias weight.
    data = np.c_[data, np.ones((rows, 1))]

    # Scores from the current weights, flattened to one value per row.
    scores = np.asarray(sigmoid(np.dot(data, np.asarray(w, dtype=float)))).reshape(-1)

    # > 0.5 is the positive class; <= 0.5 is the negative class.
    predictions = (scores > 0.5).astype(int)

    # Number of correct predictions over number of rows.
    rightcount = int(np.count_nonzero(predictions == labels))
    return rightcount / rows

# Train by batch gradient descent until the accuracy reported by test()
# reaches the value requested by the user.
rightrate = 0
dest = input('Please input the value of final right rate: ')
target_rate = float(dest)          # parse once; raises ValueError on bad input

# Cast the training features once instead of on every iteration.
train_features = data_attr.astype(float)

# Safety cap: without it the loop never ends when the requested accuracy
# cannot be reached (e.g. a value above 1 or above what the model can fit).
max_rounds = 1000000
rounds = 0
while rightrate < target_rate and rounds < max_rounds:
    # Predicted probabilities under the current weights.
    c = sigmoid(np.dot(train_features, w))

    # Gradient step: w <- w - n * X^T (prediction - label).
    b = c - data_label
    change = np.dot(np.transpose(train_features), b)
    w = w - change * n

    # Re-evaluate and update the accuracy.
    rightrate = test(test_attr, test_label, w)
    rounds += 1

if rightrate < target_rate:
    print('Stopped after', rounds, 'iterations without reaching', target_rate)


# Final evaluation: score all evaluation rows once with the trained weights.
# (The scores do not depend on the row being visited, so no per-row loop is
# needed around this computation.)
a = np.ones((len(test_attr), 1))
data = np.c_[np.asarray(test_attr, dtype=float), a]   # add the bias column
y = np.asarray(sigmoid(np.dot(data, w)))
b = np.shape(y)[0]

# Count correct predictions: > 0.5 is the positive class, <= 0.5 the negative
# class -- the same rule test() applies, so a score of exactly 0.5 is not
# silently counted as wrong for both classes.
right = 0
for score, label in zip(y.reshape(-1), test_labell):
    predicted = 1 if score > 0.5 else 0
    if label == predicted:
        right += 1

# Report the accuracy just computed (not the value left over from training).
right_rate = right / b if b else 0.0
print('Rightrate is ', right_rate)

输出结果为正确率。也可设置为输出该例为正确还是错误。

参考资料:
机器学习 对数几率回归模型(Python实现)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值