数据使用的是 UCI 机器学习数据库中的 Breast Cancer 数据集。
Breast+Cancer数据
首先对数据进行基本处理。使用list进行一系列操作完成txt文件中数据的读取,并按类别(no-recurrence-events / recurrence-events)取出需要使用的部分。因为数据量不大,这里就没有再把数据划分为训练集和测试集,而是直接使用了全部的数据来进行学习。
简便起见,使用的是朴素贝叶斯分类,并运用了拉普拉斯修正。
# -coding: utf-8
# from collections import Counter
# Category tables: one list per attribute, holding every value the script
# recognises for it.  The position of a value in its list is the index used
# for the matching count and probability lists further down.

# age (decade bands)
ages = [
    '10-19', '20-29', '30-39', '40-49', '50-59',
    '60-69', '70-79', '80-89', '90-99',
]
# menopause
menos = [
    'lt40', 'ge40', 'premeno',
]
# tumor-size
tumos = [
    '0-4', '5-9', '10-14', '15-19', '20-24', '25-29',
    '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
]
# inv-nodes
invs = [
    '0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20',
    '21-23', '24-26', '27-29', '30-32', '33-35', '36-39',
]
# node-caps
caps = [
    'yes', 'no',
]
# deg-malig
degs = [
    '1', '2', '3',
]
# breast
bres = [
    'left', 'right',
]
# breast-quad
quads = [
    'left_up', 'left_low', 'right_up', 'right_low', 'central',
]
# irradiat
irrs = [
    'yes', 'no',
]
# Read the data file and split every line into whitespace-separated tokens.
# data_list ends up with one list of tokens per line of the file (an empty
# list for a blank line).
# NOTE(review): the path is absolute and machine-specific; adjust it to
# wherever breast-cancer.txt lives before running.
# The `with` block closes the file even if reading raises.
with open(r"C:\Users\65465\Documents\data\breast-cancer.txt") as f:
    data_list = [raw_line.split() for raw_line in f]
# Flatten the per-line token lists into one list of raw record strings.
result = [token for tokens in data_list for token in tokens]

# Split every record on commas: ala[k] is the list of fields of record k,
# with the class label at index 0 and the attribute values after it.
ala = [record.split(",") for record in result]
# Partition the parsed records by their class label (field 0).  Records whose
# label is neither of the two known classes are left out of both lists.
no = []
yes = []
for record in ala:
    label = record[0]
    if label == 'no-recurrence-events':
        no.append(record)
    elif label == 'recurrence-events':
        yes.append(record)
# c = Counter(x for sublist in no for x in sublist)
# print(dict(c))
# Per-class occurrence counters, one slot per category of each attribute.
# The lengths mirror the category tables: age 9, menopause 3, tumor-size 12,
# inv-nodes 13, node-caps 2, deg-malig 3, breast 2, breast-quad 5, irradiat 2.
no_age = [0] * 9
yes_age = [0] * 9
no_meno = [0] * 3
yes_meno = [0] * 3
no_tumo = [0] * 12
yes_tumo = [0] * 12
no_inv = [0] * 13
yes_inv = [0] * 13
no_cap = [0] * 2
yes_cap = [0] * 2
no_deg = [0] * 3
yes_deg = [0] * 3
no_bre = [0] * 2
yes_bre = [0] * 2
no_quad = [0] * 5
yes_quad = [0] * 5
no_irr = [0] * 2
yes_irr = [0] * 2
# Tally attribute values over the no-recurrence records.
# Each entry is (field index in the record, category table, counter to bump).
no_tallies = (
    (1, ages, no_age),
    (2, menos, no_meno),
    (3, tumos, no_tumo),
    (4, invs, no_inv),
    (5, caps, no_cap),
    (6, degs, no_deg),
    (7, bres, no_bre),
    (8, quads, no_quad),
    (9, irrs, no_irr),
)
for record in no:
    for column, categories, counter in no_tallies:
        value = record[column]
        # A value that is not in the category table is not counted at all.
        if value in categories:
            counter[categories.index(value)] += 1
# Tally attribute values over the recurrence records.
# Each entry is (field index in the record, category table, counter to bump).
yes_tallies = (
    (1, ages, yes_age),
    (2, menos, yes_meno),
    (3, tumos, yes_tumo),
    (4, invs, yes_inv),
    (5, caps, yes_cap),
    (6, degs, yes_deg),
    (7, bres, yes_bre),
    (8, quads, yes_quad),
    (9, irrs, yes_irr),
)
for record in yes:
    for column, categories, counter in yes_tallies:
        value = record[column]
        # A value that is not in the category table is not counted at all.
        if value in categories:
            counter[categories.index(value)] += 1
# Disabled debugging output.  The triple-quoted text below is a bare string
# expression, so none of the print calls inside it run; remove the two '''
# delimiters to dump the per-class counts for every attribute.
'''
print('The order is in the index.(breast-cancer.names)')
print('Number of no-recurrence-events age:', no_age)
print('Number of no-recurrence-events menopause:', no_meno)
print('Number of no-recurrence-events tumor-size:', no_tumo)
print('Number of no-recurrence-events inv-nodes:', no_inv)
print('Number of no-recurrence-events node-caps:', no_cap)
print('Number of no-recurrence-events deg-malig:', no_deg)
print('Number of no-recurrence-events breast:', no_bre)
print('Number of no-recurrence-events breast-quad:', no_quad)
print('Number of no-recurrence-events irradiat:', no_irr)
print('Number of recurrence-events age:', yes_age)
print('Number of recurrence-events menopause:', yes_meno)
print('Number of recurrence-events tumor-size:', yes_tumo)
print('Number of recurrence-events inv-nodes:', yes_inv)
print('Number of recurrence-events node-caps:', yes_cap)
print('Number of recurrence-events deg-malig:', yes_deg)
print('Number of recurrence-events breast:', yes_bre)
print('Number of recurrence-events breast-quad:', yes_quad)
print('Number of recurrence-events irradiat:', yes_irr)
'''
# Probability estimates start here.
# Class priors with Laplace smoothing: add 1 to each class count and 2 (the
# number of classes) to the total.
n_no = len(no)
n_yes = len(yes)
total = n_no + n_yes
smoothed_total = total + 2
p_yes = (n_yes + 1) / smoothed_total
p_no = (n_no + 1) / smoothed_total
# Conditional probability of every attribute value given the class, with
# Laplace smoothing.
def _laplace(counts, class_size):
    """Return (count + 1) / (class_size + number of categories) per category."""
    denominator = class_size + len(counts)
    return [(count + 1) / denominator for count in counts]


# NOTE(review): the denominator is the full class size, so a record whose
# value was not tallied for an attribute still contributes to it.
p_age_no, p_age_yes = _laplace(no_age, len(no)), _laplace(yes_age, len(yes))
p_meno_no, p_meno_yes = _laplace(no_meno, len(no)), _laplace(yes_meno, len(yes))
p_tumo_no, p_tumo_yes = _laplace(no_tumo, len(no)), _laplace(yes_tumo, len(yes))
p_inv_no, p_inv_yes = _laplace(no_inv, len(no)), _laplace(yes_inv, len(yes))
p_cap_no, p_cap_yes = _laplace(no_cap, len(no)), _laplace(yes_cap, len(yes))
p_deg_no, p_deg_yes = _laplace(no_deg, len(no)), _laplace(yes_deg, len(yes))
p_bre_no, p_bre_yes = _laplace(no_bre, len(no)), _laplace(yes_bre, len(yes))
p_quad_no, p_quad_yes = _laplace(no_quad, len(no)), _laplace(yes_quad, len(yes))
p_irr_no, p_irr_yes = _laplace(no_irr, len(no)), _laplace(yes_irr, len(yes))
# Test sample: either typed in field by field or picked from five stored
# records.  Whichever path is taken, the nine in_* variables are always
# bound afterwards; an unrecognised choice stops the script with a clear
# message instead of failing later with a NameError.

# Prompts for manual entry, in attribute order.
_FIELD_PROMPTS = (
    "age: ",
    "menopause: ",
    "tumor-size: ",
    "inv-nodes: ",
    "node-caps: ",
    "deg-malig: ",
    "breast: ",
    "breast-quad: ",
    "irradiat: ",
)

# Stored samples in the same attribute order; the trailing comment on each
# entry is the full record, including its class label.
_STORED_SAMPLES = {
    # no-recurrence-events,40-49,premeno,20-24,3-5,no,2,right,left_low,no
    '1': ('40-49', 'premeno', '20-24', '3-5', 'no', '2', 'right', 'left_low', 'no'),
    # no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
    '2': ('30-39', 'premeno', '30-34', '0-2', 'no', '3', 'left', 'left_low', 'no'),
    # no-recurrence-events,20-29,premeno,35-39,0-2,no,2,right,right_up,no
    '3': ('20-29', 'premeno', '35-39', '0-2', 'no', '2', 'right', 'right_up', 'no'),
    # recurrence-events,30-39,premeno,30-34,9-11,no,2,right,left_up,yes
    '4': ('30-39', 'premeno', '30-34', '9-11', 'no', '2', 'right', 'left_up', 'yes'),
    # no-recurrence-events,60-69,ge40,10-14,0-2,no,1,left,left_up,no
    '5': ('60-69', 'ge40', '10-14', '0-2', 'no', '1', 'left', 'left_up', 'no'),
}

# Named input_mode rather than `str` so the builtin is not shadowed.
input_mode = input("请选择从键盘输入数据 或 读取已存储数据\n0 for 键盘输入,1 for 读取 :")
if input_mode == '0':
    # tuple() consumes the generator in order, so the prompts appear in
    # attribute order.
    sample = tuple(input(prompt) for prompt in _FIELD_PROMPTS)
elif input_mode == '1':
    sample_id = input("请选择第几个数据(1-5):")
    if sample_id not in _STORED_SAMPLES:
        raise SystemExit("无效的数据编号: %r (应为 1-5)" % sample_id)
    sample = _STORED_SAMPLES[sample_id]
else:
    raise SystemExit("无效的选择: %r (应为 0 或 1)" % input_mode)

(in_age, in_meno, in_tumo, in_inv, in_cap,
 in_deg, in_bre, in_quad, in_irr) = sample
# Unnormalised posterior score of each class: start from the class prior and
# multiply in the conditional probability of every attribute value.
op_no = p_no
op_yes = p_yes

# Each entry is (observed value, category table, P(value | no), P(value | yes)).
likelihood_tables = (
    (in_age, ages, p_age_no, p_age_yes),
    (in_meno, menos, p_meno_no, p_meno_yes),
    (in_tumo, tumos, p_tumo_no, p_tumo_yes),
    (in_inv, invs, p_inv_no, p_inv_yes),
    (in_cap, caps, p_cap_no, p_cap_yes),
    (in_deg, degs, p_deg_no, p_deg_yes),
    (in_bre, bres, p_bre_no, p_bre_yes),
    (in_quad, quads, p_quad_no, p_quad_yes),
    (in_irr, irrs, p_irr_no, p_irr_yes),
)
for observed, categories, given_no, given_yes in likelihood_tables:
    # A value missing from its category table contributes no factor to
    # either score.
    if observed in categories:
        position = categories.index(observed)
        op_no *= given_no[position]
        op_yes *= given_yes[position]
# Report the class with the larger score; equal scores are reported as a tie.
if op_no > op_yes:
    verdict = "该样例贝叶斯判断为 no-recurrence-events"
elif op_no < op_yes:
    verdict = "该样例贝叶斯判断为 recurrence-events"
else:
    verdict = "该样例贝叶斯判断为 no-recurrence-events 或 recurrence-events 均可"
print(verdict)
其中,结果检验的方式有两种。可以选择从键盘输入数据,也可以选择读取预设的五组数据。
下面选择读取存储的第一组数据:
该组数据确实为no-recurrence-events。