频繁集
创建表格
#!/usr/bin/python
#coding=utf-8
#
import xlwt
import numpy as np
# Thresholds for the frequent-itemset analysis. They are not read in this
# step, which only builds the input workbook.
support = 0.5
confidence = 0.9

# Create a UTF-8 encoded workbook with a single worksheet.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')

# worksheet.write takes (row, column, value).
# Column 0 holds the transaction IDs, one transaction per row.
transaction_ids = ['001', '002', '003', '004', '005']
for row, transaction_id in enumerate(transaction_ids):
    worksheet.write(row, 0, transaction_id)

# Each item has its own column. A cell is filled with the item's letter only
# in the rows (transactions) that contain the item and is left unwritten
# otherwise. Every cell is written exactly once.
item_layout = [
    # (item, column, rows containing the item)
    ('A', 1, range(len(transaction_ids))),
    ('B', 2, (1, 3)),
    ('C', 3, (2, 3, 4)),
    ('E', 4, (2, 4)),
]
for item, column, rows in item_layout:
    for row in rows:
        worksheet.write(row, column, item)

# Save the data set under a relative file name.
workbook.save('Frequent binomial set_189094340.xls')
结果:
读取表格
import xlrd
import xlwt
# coding=utf-8
# Open the data-set workbook.
dataset_path = r'C:\Users\yangwenrui_only\PycharmProjects\pythonProject\Frequent binomial set_189094340.xls'
data = xlrd.open_workbook(dataset_path)

# sheet_names() returns the list of sheet names in the workbook.
sheet_names = data.sheet_names()
print("sheets:", sheet_names, sep="")

# Fetch the sheet by name (data.sheet_by_index(0) would select it by
# position instead).
table = data.sheet_by_name('My Worksheet')

# Sheet dimensions.
row_count, column_count = table.nrows, table.ncols
print("总行数:", row_count, sep="")
print("总列数:", column_count, sep="")

# col_values(start) / row_values(start) return a whole column / row as a
# list; here the ID column (column 0) is printed.
id_column = table.col_values(0)
print("整列值:", id_column, sep="")

# Read a single cell. cell() takes (row, column) indexes.
# NOTE(review): the variable name and the printed label talk about "B3" /
# "third row, second column", but the indexes used are row 3, column 2 —
# confirm which cell was intended.
cell_B3 = table.cell(3, 2).value
print("第三行第二列的值:" + cell_B3)
结果:
一项频繁集判断
import xlrd
import xlwt
import numpy as np
# coding=utf-8
# Open the data-set workbook (same file name the table-creation script saves).
data = xlrd.open_workbook(r'C:\Users\yangwenrui_only\PycharmProjects\pythonProject\Frequent binomial set_189094340.xls')

# Workbook and sheet that will receive the single-item statistics.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('output')

# Diagnostics: sheet names, sheet dimensions, the ID column and one cell.
print("sheets:" + str(data.sheet_names()))
table = data.sheet_by_name('My Worksheet')
print("总行数:" + str(table.nrows))
print("总列数:" + str(table.ncols))
print("整列值:" + str(table.col_values(0)))
# cell() takes (row, column) indexes.
cell_B3 = table.cell(3, 2).value
print("第三行第二列的值:" + cell_B3)

# Parameters.
support = 0.5
confidence = 0.9
# One transaction per row, so the transaction count is the sheet's row count.
# It used to be hard-coded as 5 under the name `sum`, shadowing the builtin.
transaction_count = table.nrows

# Single-item counting: item k of itemset_one lives in column k + 1, and a
# transaction contains the item when that cell holds the item's letter.
itemset_one = ['A', 'B', 'C', 'E']
item_counts = [
    table.col_values(column).count(item)
    for column, item in enumerate(itemset_one, start=1)
]

# Output layout: column 0 = item, column 1 = count, column 2 = support.
for row, item in enumerate(itemset_one):
    worksheet.write(row, 0, item)
for row, count in enumerate(item_counts):
    worksheet.write(row, 1, count)
for row, count in enumerate(item_counts):
    # Guard against an empty sheet so the division cannot fail.
    item_support = count / transaction_count if transaction_count else 0.0
    worksheet.write(row, 2, item_support)
print("表格输出完成")

# Save the statistics under a relative file name.
workbook.save('Output189094340.xls')
二项频繁集判断
先打开上一步生成的输出文件:
# Input workbooks: the transaction data set and the single-item statistics.
dataset_path = r'C:\Users\yangwenrui_only\PycharmProjects\pythonProject\Frequent binomial set_189094340.xls'
statistics_path = r'C:\Users\yangwenrui_only\PycharmProjects\pythonProject\Output189094340.xls'
data = xlrd.open_workbook(dataset_path)
data1 = xlrd.open_workbook(statistics_path)

# List the sheets of the data-set workbook.
sheet_names = data.sheet_names()
print("sheets:", sheet_names, sep="")

# Look both sheets up by name.
table = data.sheet_by_name('My Worksheet')
table1 = data1.sheet_by_name('output')

# Fresh workbook and sheet that will receive this step's output.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('output')
再根据Apriori原理筛选掉非频繁集的超集:
import xlrd
import xlwt
import numpy as np
# coding=utf-8
# Open the transaction data set and the single-item statistics workbook.
data = xlrd.open_workbook(r'C:\Users\yangwenrui_only\PycharmProjects\pythonProject\Frequent binomial set_189094340.xls')
data1 = xlrd.open_workbook(r'C:\Users\yangwenrui_only\PycharmProjects\pythonProject\Output189094340.xls')

# New workbook and sheet for this step's output.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('output')

# Diagnostics: sheet names, sheet dimensions, the ID column and one cell.
print("sheets:" + str(data.sheet_names()))
table = data.sheet_by_name('My Worksheet')
table1 = data1.sheet_by_name('output')
print("总行数:" + str(table.nrows))
print("总列数:" + str(table.ncols))
print("整列值:" + str(table.col_values(0)))
# cell() takes (row, column) indexes.
cell_B3 = table.cell(3, 2).value
print("第三行第二列的值:" + cell_B3)

# Parameters.
support = 0.5
confidence = 0.9
# Transaction count = number of rows in the data sheet (was hard-coded as 5).
# The name `sum` shadows the builtin, but it is kept because the
# result-writing code that follows this step divides by `sum`.
sum = table.nrows

# Single-item counting: item k of itemset_one lives in column k + 1, and a
# transaction contains the item when that cell holds the item's letter.
itemset_one = ['A', 'B', 'C', 'E']
item_counts = [
    table.col_values(column).count(item)
    for column, item in enumerate(itemset_one, start=1)
]

# Output layout: column 0 = item, column 1 = count, column 2 = support.
for row, item in enumerate(itemset_one):
    worksheet.write(row, 0, item)
for row, count in enumerate(item_counts):
    worksheet.write(row, 1, count)
for row, count in enumerate(item_counts):
    # Guard against an empty sheet so the division cannot fail.
    worksheet.write(row, 2, count / sum if sum else 0.0)
print("表格输出完成")

# Keep only the items whose support in the previous output (column 2 of
# table1) reaches the threshold. Each survivor is written to column 3 and
# collected directly. The earlier version read the survivors back from
# column 3 of table1, which is only present if a previous run had already
# written it, so a first run failed with an IndexError.
itemset_two = []
for row in range(len(itemset_one)):
    if table1.cell(row, 2).value >= support:
        frequent_item = table1.cell(row, 0).value
        worksheet.write(row, 3, frequent_item)
        if frequent_item != "":
            print(frequent_item)
            itemset_two.append(frequent_item)
print(itemset_two)

# Number of frequent single items (every collected entry is non-empty).
m = len(itemset_two)
print("有", m, "个符合")

# Save, replacing the statistics workbook under a relative file name.
workbook.save('Output189094340.xls')
筛选频繁二项集
# Build candidate two-item sets from itemset_two and count matching rows.
# NOTE(review): this fragment has lost all leading indentation, so the nesting
# of the loops and branches below cannot be determined from the text. Restore
# the indentation before running it.
# m, itemset_two, worksheet and table are used here without being defined in
# this fragment; they have to come from the code that precedes it.
# Count the non-empty entries of itemset_two into m.
# n is assigned but not read anywhere in this fragment.
n = 2
for i in range(len(itemset_two)):
if itemset_two[i] != "":
m = m+1
print("有",m,"个符合")
# Build the two-item combinations.
# i and j index into itemset_two; each combination is the concatenation
# itemset_two[i] + itemset_two[i+j], appended to itemset_two_output and
# written into column 4 of the output sheet.
i = 0
j = 1
itemset_two_num = 0
itemset_two_output=[]
while i<m:
# Branch for m <= 2: one combination is appended, itemset_two_num is set to 0
# and the loop is left.
if m<=2:
itemset_two_output.append(itemset_two[i]+itemset_two[i+j])
worksheet.write(i, 4, itemset_two_output[i])
itemset_two_num = 0
break
# Branch for i+j still inside the first m entries.
# NOTE(review): itemset_two[i + j] is indexed after j is incremented without
# a visible bounds check — confirm it cannot run past the end of the list.
elif i+j<=m-1:
while itemset_two[i+j] !="":
itemset_two_output.append(itemset_two[i]+itemset_two[i+j])
print(itemset_two_output)
worksheet.write(i,4,itemset_two_output[i])
j = j+1
if itemset_two[i + j] =="":
break
i = i+1
j = 1
itemset_two_num += 1
print(itemset_two_num)
if i==m:
break
# Debug output of the generated combinations.
print(itemset_two_output)
# Process the two-item sets.
# A, B, C, E and condition are assigned here but not read again within this
# fragment; pop is only incremented.
A=B=C=E=0
id = ['001', '002','003', '004', '005']
k = 0
pop = 0
once =1
condition=0
# Prints "ok" when cells (3,1) and (3,3) equal element [0] and element [1]
# of the first combination.
if table.cell(3,1).value == itemset_two_output[0][0] and table.cell(3,3).value == itemset_two_output[0][1]:
print("ok")
# NOTE(review): k is not modified anywhere in this fragment — confirm where it
# is advanced so that this loop terminates.
while k == itemset_two_num:
# For each row index i in range(len(id)), two scans look for a cell equal to
# element [0] and element [1] of combination k. The column indexes j / t
# are also bounded by len(id).
for i in range(len(id)):
condition1 = False
condition2 = False
print(condition1)
print(condition2)
print("functionI:", i)
for j in range(len(id)):
print("i:",i)
print("j:",j)
if table.cell(i, j).value == itemset_two_output[k][0]:
print("success1")
condition1=True
break
for t in range(len(id)):
print("i:",i)
print("t:",t)
if table.cell(i, t).value == itemset_two_output[k][1]:
print("success2")
condition2=True
break
# Both elements were found.
# NOTE(review): whether the `once` increment belongs inside this `if` cannot
# be told from the de-indented text.
if condition1 == True and condition2 == True:
print("情况:",condition1,condition2)
pop = pop+1
once = once + 1
结果:
结果判断
# Result evaluation for the two-item set with index k.
# NOTE(review): this fragment has lost its indentation and uses once, i, k,
# sum, worksheet, table1, support and confidence without defining them; its
# position inside the surrounding loops must be confirmed before running.
if once==i+1 :
# NOTE(review): the trailing "太阳" on the next line is not valid Python and
# looks like a stray paste artifact — confirm and remove it.
k=k+1太阳
# Row k-1 of the output sheet receives: column 5 = once - 2,
# column 6 = (once - 2) / sum, column 7 = that ratio divided by the value in
# column 2 of table1 at row k-1.
worksheet.write(k - 1, 5, once - 2)
worksheet.write(k -1,6,(once-2)/sum)
worksheet.write(k-1,7,((once-2)/sum)/table1.cell(k-1,2).value)
# The check reads columns 6 and 7 from table1, not from the worksheet that
# was just written. NOTE(review): confirm table1 already holds these columns.
if table1.cell(k-1,6).value>=support and table1.cell(k-1,7).value>=confidence:
worksheet.write(k-1,8,"该关系可用")
由于置信度设置为 0.9,该关系不满足条件,因此结果未显示: