本文使用的数据是 UCI 机器学习数据库中的 Breast Cancer(乳腺癌)数据集。
Breast Cancer 数据集
# -*- coding: utf-8 -*-
import numpy as np
import random
# Category vocabularies for the nine attribute columns. The position of a
# value inside its list is the integer code that replaces it when the rows
# are encoded further down.
ages = ['{}-{}'.format(low, low + 9) for low in range(10, 100, 10)]
menos = ['lt40', 'ge40', 'premeno']
tumos = ['{}-{}'.format(low, low + 4) for low in range(0, 60, 5)]
# The final bucket is one value wider (36-39), so it is appended literally.
invs = ['{}-{}'.format(low, low + 2) for low in range(0, 36, 3)] + ['36-39']
caps = ['yes', 'no']
degs = [str(grade) for grade in (1, 2, 3)]
bres = ['left', 'right']
quads = [
    'left_up',
    'left_low',
    'right_up',
    'right_low',
    'central',
]
irrs = ['yes', 'no']
# Load the raw data file. Each line is split on whitespace, so data_list
# holds one list of tokens per line (a blank line yields an empty list).
# NOTE: the path below is machine-specific and must exist on the host.
# The context manager guarantees the file is closed even if reading fails.
with open(r"C:\Users\65465\Documents\data\breast-cancer.txt") as f:
    data_list = [line.split() for line in f]
# Flatten the per-line token lists into a single sequence of tokens.
result = [token for tokens in data_list for token in tokens]
# Split every token on commas: ala holds one list of field strings per token.
ala = [token.split(",") for token in result]
# Convert every field of every row in ala to a numeric code, in place, and
# accumulate the sum of the codes assigned in each attribute column (the sums
# are used afterwards to fill in missing values).
# Column 0 is the class label; columns 1-9 are the categorical attributes.
label_codes = {'no-recurrence-events': 0, 'recurrence-events': 1}
# Vocabulary of each attribute column, listed in column order (1..9).
column_vocabularies = [ages, menos, tumos, invs, caps, degs, bres, quads, irrs]
# One value -> code lookup per column, built once instead of scanning the
# vocabulary list for every field.
code_tables = [
    {value: code for code, value in enumerate(vocabulary)}
    for vocabulary in column_vocabularies
]
column_totals = [0] * len(column_vocabularies)
for ele in ala:
    # Unrecognised labels are left untouched.
    ele[0] = label_codes.get(ele[0], ele[0])
    for offset, table in enumerate(code_tables):
        column = offset + 1
        code = table.get(ele[column])
        # Values outside the vocabulary (e.g. the '?' placeholder) keep
        # their original text and do not contribute to the column total.
        if code is not None:
            ele[column] = code
            column_totals[offset] += code
# Expose the per-column totals under the names the rest of the script uses.
suma, summ, sumt, sumi, sumc, sumd, sumb, sumq, sumir = column_totals
# Fill in missing values: every '?' in attribute columns 1-9 is replaced by
# the mean code of that column.
column_totals = [suma, summ, sumt, sumi, sumc, sumd, sumb, sumq, sumir]
column_means = []
for offset, total in enumerate(column_totals):
    column = offset + 1
    # The totals only include rows whose value was known, so the mean must be
    # taken over those rows; dividing by len(ala) would bias it towards zero.
    known = sum(1 for row in ala if row[column] != '?')
    column_means.append(total / known if known else 0.0)
for ele in ala:
    for offset, mean in enumerate(column_means):
        column = offset + 1
        if ele[column] == '?':
            ele[column] = mean
# Randomly draw 95 rows (without replacement) as the training set.
# random.sample raises ValueError if ala has fewer than 95 rows.
alasam = random.sample(ala, 95)
# Attribute columns fed to the model. All nine encoded attributes (columns
# 1..9) are used; stopping at column 8 would silently drop the last one.
attribute_columns = range(1, 10)
# NOTE(review): the evaluation set is the full data set, so it also contains
# the training rows -- confirm this overlap is intended.
test_labell = [row[0] for row in ala]
test_attrl = [[row[i] for i in attribute_columns] for row in ala]
# Training labels and attributes, taken from the sampled rows.
data_labell = [row[0] for row in alasam]
data_attrl = [[row[i] for i in attribute_columns] for row in alasam]
# Convert the Python lists to matrices. np.asmatrix is used because the
# np.mat alias no longer exists in NumPy 2.0 and later.
test_attr = np.asmatrix(test_attrl)
test_label = np.asmatrix(test_labell).transpose()  # column vector of labels
data_attr = np.asmatrix(data_attrl)
data_label = np.asmatrix(data_labell).transpose()  # column vector of labels
# Initial parameter vector w: one weight per attribute plus one constant term.
w = np.ones((len(data_attrl[0]) + 1, 1))
# Append a column of ones to the training attributes so that the last entry
# of w acts as the constant term.
a = np.ones((len(data_attrl), 1))
data_attr = np.c_[data_attr, a]
# Step size used for each gradient-descent update.
n = 0.0001
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Args:
        z: A scalar, NumPy array or NumPy matrix.

    Returns:
        Values in [0, 1] with the same shape as ``z``.
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))) == exp(-logaddexp(0, -z)).
    # logaddexp never forms exp(-z) directly, so large negative inputs do not
    # overflow (the naive form emits a RuntimeWarning there).
    return np.exp(-np.logaddexp(0, -z))
def test(dataset, labelset, w):
    """Return the fraction of rows in ``dataset`` that ``w`` classifies correctly.

    A column of ones is appended to ``dataset`` (for the constant term in
    ``w``), the sigmoid of ``dataset . w`` is computed, and a row is predicted
    as class 1 when that value is greater than 0.5 and as class 0 otherwise.

    Args:
        dataset: Attribute rows (matrix or nested list), one row per sample.
        labelset: One 0/1 label per row, as a list or a column matrix.
        w: Parameter column vector with one more entry than the number of
            attribute columns.

    Returns:
        The accuracy as a float in [0, 1].

    Raises:
        ValueError: If ``dataset`` has no rows.
    """
    if len(dataset) == 0:
        raise ValueError("dataset must contain at least one row")
    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.0+.
    data = np.asmatrix(dataset)
    bias = np.ones((len(dataset), 1))
    data = np.c_[data, bias]
    # Scores produced by the trained parameters.
    y = sigmoid(np.dot(data, w))
    # Flatten both sides so list labels and column-matrix labels compare alike.
    predicted = (np.asarray(y).reshape(-1) > 0.5).astype(int)
    labels = np.asarray(labelset).reshape(-1)
    rightcount = int(np.count_nonzero(predicted == labels))
    return rightcount / len(dataset)
# Train w by gradient descent until the accuracy on the evaluation set
# reaches the target entered by the user.
rightrate = 0
dest = input('Please input the value of final right rate: ')
# Parse the target once instead of on every loop test.
target_rate = float(dest)
# The training attributes do not change, so cast them to float once.
features = data_attr.astype(float)
# Upper bound on the number of updates: without it the loop never ends when
# the requested rate cannot be reached (for example any value above 1).
max_iterations = 1000000
iteration = 0
while rightrate < target_rate and iteration < max_iterations:
    # Predictions under the current parameters.
    predicted = sigmoid(np.dot(features, w))
    # Gradient-descent update: w <- w - n * X^T (prediction - label).
    residual = predicted - data_label
    gradient = np.dot(np.transpose(features), residual)
    w = w - gradient * n
    # Re-evaluate the accuracy with the updated parameters.
    rightrate = test(test_attr, test_label, w)
    iteration += 1
if rightrate < target_rate:
    print('Stopped after', iteration, 'iterations without reaching the target right rate')
# Final evaluation of the trained parameters on the evaluation set.
# The accuracy is computed once with the shared test() helper, so the 0.5
# threshold is applied the same way as during training (a score of exactly
# 0.5 counts as class 0), and the freshly computed value is what gets printed.
right_rate = test(test_attr, test_labell, w)
print('Rightrate is ', right_rate)
程序最终输出的是模型在测试集上的正确率;也可以改为逐个样本输出其预测结果是正确还是错误。
参考资料:
机器学习 对数几率回归模型(Python实现)