【python】实现贝叶斯分类算法

数据使用的是 UCI 机器学习数据库中的 Breast Cancer 数据集。
Breast Cancer 数据

首先对数据进行基本处理。使用list进行一系列操作完成txt文件中数据的读取,并按类别取出需要使用的部分。因为数据量不大,这里没有再把数据划分为训练集和测试集,而是直接使用了全部的数据来进行学习。

简便起见,使用的是朴素贝叶斯分类,并运用了拉普拉斯修正。

# -*- coding: utf-8 -*-
# from collections import Counter

# Category values of the nine attributes, in the order the attributes appear
# in each record (columns 1-9; column 0 is the class label). The position of
# a value inside its list is the index used by the count and probability
# tables further down.

# age: ten-year bands '10-19' ... '90-99'
ages = ['%d-%d' % (low, low + 9) for low in range(10, 100, 10)]
# menopause
menos = ['lt40', 'ge40', 'premeno']
# tumor-size: five-unit bands '0-4' ... '55-59'
tumos = ['%d-%d' % (low, low + 4) for low in range(0, 60, 5)]
# inv-nodes: three-unit bands '0-2' ... '33-35', plus the wider last band
invs = ['%d-%d' % (low, low + 2) for low in range(0, 36, 3)] + ['36-39']
# node-caps
caps = ['yes', 'no']
# deg-malig
degs = ['1', '2', '3']
# breast
bres = ['left', 'right']
# breast-quad
quads = ['left_up', 'left_low', 'right_up', 'right_low', 'central']
# irradiat
irrs = ['yes', 'no']

# Read the data file. Every line is split on whitespace, so data_list holds
# one list of tokens per line (an empty list for a blank line).
# The context manager closes the file even if reading raises.
# NOTE(review): the path is machine-specific; point it at your local copy
# of breast-cancer.txt.
with open(r"C:\Users\65465\Documents\data\breast-cancer.txt") as f:
    data_list = [line.split() for line in f]

# Flatten the per-line token lists into one list of record strings ...
result = [token for tokens in data_list for token in tokens]

# ... then split each record on commas. Every entry of `ala` is one record
# as a list of fields: field 0 is compared against the class labels below,
# fields 1-9 against the attribute category lists.
ala = [record.split(",") for record in result]

# Partition the records by class label (field 0). Records whose label is
# neither of the two values end up in neither list.
no = [record for record in ala if record[0] == 'no-recurrence-events']
yes = [record for record in ala if record[0] == 'recurrence-events']

# c = Counter(x for sublist in no for x in sublist)
# print(dict(c))

# Per-class occurrence counters, one slot per category value of each
# attribute (same order as the category lists). `no_*` is filled from the
# no-recurrence-events records, `yes_*` from the recurrence-events records.
# Each counter is its own list object, so they can be updated independently.
no_age, yes_age = [0] * 9, [0] * 9        # age
no_meno, yes_meno = [0] * 3, [0] * 3      # menopause
no_tumo, yes_tumo = [0] * 12, [0] * 12    # tumor-size
no_inv, yes_inv = [0] * 13, [0] * 13      # inv-nodes
no_cap, yes_cap = [0] * 2, [0] * 2        # node-caps
no_deg, yes_deg = [0] * 3, [0] * 3        # deg-malig
no_bre, yes_bre = [0] * 2, [0] * 2        # breast
no_quad, yes_quad = [0] * 5, [0] * 5      # breast-quad
no_irr, yes_irr = [0] * 2, [0] * 2        # irradiat

def _tally_attributes(records, counters):
    """Count how often each category value occurs in *records*.

    records: parsed rows; column 0 is the class label and columns 1..9 hold
        the attribute values, in the same order as *counters*.
    counters: one ``(categories, counts)`` pair per attribute column;
        ``counts[k]`` is incremented for every row whose value in that
        column equals ``categories[k]``.

    A value that is not listed in ``categories`` is left uncounted.
    Raises IndexError if a row has fewer columns than *counters* requires.
    """
    for row in records:
        for column, (categories, counts) in enumerate(counters, start=1):
            value = row[column]
            if value in categories:
                counts[categories.index(value)] += 1


# Fill the counters of the no-recurrence-events class.
_tally_attributes(no, [
    (ages, no_age),
    (menos, no_meno),
    (tumos, no_tumo),
    (invs, no_inv),
    (caps, no_cap),
    (degs, no_deg),
    (bres, no_bre),
    (quads, no_quad),
    (irrs, no_irr),
])

# Fill the counters of the recurrence-events class.
_tally_attributes(yes, [
    (ages, yes_age),
    (menos, yes_meno),
    (tumos, yes_tumo),
    (invs, yes_inv),
    (caps, yes_cap),
    (degs, yes_deg),
    (bres, yes_bre),
    (quads, yes_quad),
    (irrs, yes_irr),
])

# Disabled debug output: the print calls below sit inside a bare string
# literal that is neither assigned nor printed, so they never execute.
# Remove the surrounding triple quotes to dump the per-class category counts.
'''
print('The order is in the index.(breast-cancer.names)')
print('Number of no-recurrence-events age:', no_age)
print('Number of no-recurrence-events menopause:', no_meno)
print('Number of no-recurrence-events tumor-size:', no_tumo)
print('Number of no-recurrence-events inv-nodes:', no_inv)
print('Number of no-recurrence-events node-caps:', no_cap)
print('Number of no-recurrence-events deg-malig:', no_deg)
print('Number of no-recurrence-events breast:', no_bre)
print('Number of no-recurrence-events breast-quad:', no_quad)
print('Number of no-recurrence-events irradiat:', no_irr)

print('Number of recurrence-events age:', yes_age)
print('Number of recurrence-events menopause:', yes_meno)
print('Number of recurrence-events tumor-size:', yes_tumo)
print('Number of recurrence-events inv-nodes:', yes_inv)
print('Number of recurrence-events node-caps:', yes_cap)
print('Number of recurrence-events deg-malig:', yes_deg)
print('Number of recurrence-events breast:', yes_bre)
print('Number of recurrence-events breast-quad:', yes_quad)
print('Number of recurrence-events irradiat:', yes_irr)
'''

# Probability estimates start here.
# Class priors with Laplace smoothing: one pseudo-count per class, hence
# +1 in the numerator and +2 (the number of classes) in the denominator.
n_no, n_yes = len(no), len(yes)
total = n_no + n_yes
p_no = (n_no + 1) / (total + 2)
p_yes = (n_yes + 1) / (total + 2)

# Conditional probability of every category value given the class.
def _smoothed_probabilities(counts, class_size):
    """Return Laplace-smoothed conditional probabilities for one attribute.

    counts: ``counts[k]`` is the number of samples of the class that take
        the k-th category value of the attribute.
    class_size: total number of samples in that class.

    One pseudo-count is added per category, so the denominator grows by the
    number of categories. That number is taken from ``len(counts)`` instead
    of being repeated as a literal for each attribute.
    """
    n_categories = len(counts)
    return [(count + 1) / (class_size + n_categories) for count in counts]


p_age_no = _smoothed_probabilities(no_age, len(no))
p_age_yes = _smoothed_probabilities(yes_age, len(yes))

p_meno_no = _smoothed_probabilities(no_meno, len(no))
p_meno_yes = _smoothed_probabilities(yes_meno, len(yes))

p_tumo_no = _smoothed_probabilities(no_tumo, len(no))
p_tumo_yes = _smoothed_probabilities(yes_tumo, len(yes))

p_inv_no = _smoothed_probabilities(no_inv, len(no))
p_inv_yes = _smoothed_probabilities(yes_inv, len(yes))

p_cap_no = _smoothed_probabilities(no_cap, len(no))
p_cap_yes = _smoothed_probabilities(yes_cap, len(yes))

p_deg_no = _smoothed_probabilities(no_deg, len(no))
p_deg_yes = _smoothed_probabilities(yes_deg, len(yes))

p_bre_no = _smoothed_probabilities(no_bre, len(no))
p_bre_yes = _smoothed_probabilities(yes_bre, len(yes))

p_quad_no = _smoothed_probabilities(no_quad, len(no))
p_quad_yes = _smoothed_probabilities(yes_quad, len(yes))

p_irr_no = _smoothed_probabilities(no_irr, len(no))
p_irr_yes = _smoothed_probabilities(yes_irr, len(yes))


# Test sample: either typed in attribute by attribute, or picked from five
# stored records. Field order of each stored record:
# (age, menopause, tumor-size, inv-nodes, node-caps, deg-malig, breast,
#  breast-quad, irradiat). The trailing comment gives the class label the
# record carries in the data, for checking the prediction by hand.
_PRESET_SAMPLES = {
    '1': ('40-49', 'premeno', '20-24', '3-5', 'no', '2', 'right', 'left_low', 'no'),   # no-recurrence-events
    '2': ('30-39', 'premeno', '30-34', '0-2', 'no', '3', 'left', 'left_low', 'no'),    # no-recurrence-events
    '3': ('20-29', 'premeno', '35-39', '0-2', 'no', '2', 'right', 'right_up', 'no'),   # no-recurrence-events
    '4': ('30-39', 'premeno', '30-34', '9-11', 'no', '2', 'right', 'left_up', 'yes'),  # recurrence-events
    '5': ('60-69', 'ge40', '10-14', '0-2', 'no', '1', 'left', 'left_up', 'no'),        # no-recurrence-events
}

# `choice` replaces a variable that used to be called `str` and shadowed
# the builtin for the rest of the script.
choice = input("请选择从键盘输入数据 或 读取已存储数据\n0 for 键盘输入,1 for 读取 :").strip()
if choice == '0':
    # Surrounding whitespace is stripped so a stray space does not make a
    # value fail to match its category.
    in_age = input("age: ").strip()
    in_meno = input("menopause: ").strip()
    in_tumo = input("tumor-size: ").strip()
    in_inv = input("inv-nodes: ").strip()
    in_cap = input("node-caps: ").strip()
    in_deg = input("deg-malig: ").strip()
    in_bre = input("breast: ").strip()
    in_quad = input("breast-quad: ").strip()
    in_irr = input("irradiat: ").strip()
elif choice == '1':
    sample_no = input("请选择第几个数据(1-5):").strip()
    if sample_no not in _PRESET_SAMPLES:
        # Stop here with a clear message; otherwise the in_* variables stay
        # undefined and the scoring step fails with a NameError.
        raise SystemExit("无效的数据编号: %r (请输入 1-5)" % sample_no)
    (in_age, in_meno, in_tumo, in_inv, in_cap,
     in_deg, in_bre, in_quad, in_irr) = _PRESET_SAMPLES[sample_no]
else:
    # Same reasoning: no sample was selected, so nothing can be classified.
    raise SystemExit("无效的选择: %r (请输入 0 或 1)" % choice)



# Unnormalised posterior score of each class: start from the class prior and
# multiply in the conditional probability of every attribute value of the
# sample. A value that is not one of the known categories contributes no
# factor to either score.
op_no = p_no
op_yes = p_yes

for sample_value, categories, cond_no, cond_yes in (
    (in_age, ages, p_age_no, p_age_yes),
    (in_meno, menos, p_meno_no, p_meno_yes),
    (in_tumo, tumos, p_tumo_no, p_tumo_yes),
    (in_inv, invs, p_inv_no, p_inv_yes),
    (in_cap, caps, p_cap_no, p_cap_yes),
    (in_deg, degs, p_deg_no, p_deg_yes),
    (in_bre, bres, p_bre_no, p_bre_yes),
    (in_quad, quads, p_quad_no, p_quad_yes),
    (in_irr, irrs, p_irr_no, p_irr_yes),
):
    if sample_value in categories:
        position = categories.index(sample_value)
        op_no *= cond_no[position]
        op_yes *= cond_yes[position]


# Report the class with the larger score; on an exact tie report that either
# class is acceptable.
if op_no == op_yes:
    print("该样例贝叶斯判断为 no-recurrence-events 或 recurrence-events 均可")
elif op_no > op_yes:
    print("该样例贝叶斯判断为 no-recurrence-events")
else:
    print("该样例贝叶斯判断为 recurrence-events")

其中,结果检验的方式有两种。可以选择从键盘输入数据,也可以选择读取预设的五组数据。
下面选择读取存储的第一组数据:
检验
该组数据确实为no-recurrence-events。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值