1.测试数据生成
def dataInit(rows=21, path='data.csv', seed=None):
    """Generate random four-dimensional test records and save them as CSV.

    Each record holds one value drawn from each of the dimensions
    a (5 values), b (4 values), c (3 values) and d (2 values), in that
    order. The records are written through a DataFrame with its default
    integer column labels, so the file starts with the header line
    ``0,1,2,3``. The file is encoded as ``utf_8_sig`` (UTF-8 with a BOM).

    Args:
        rows: Number of records to generate. Defaults to 21.
        path: Destination of the CSV file. Defaults to ``'data.csv'``.
        seed: Optional seed for reproducible output. When ``None`` the
            shared ``random`` module generator is used, so any seeding done
            by the caller still applies.

    Returns:
        None. The result is the file written to ``path``.
    """
    a = ['a1', 'a2', 'a3', 'a4', 'a5']
    b = ['b1', 'b2', 'b3', 'b4']
    c = ['c1', 'c2', 'c3']
    d = ['d1', 'd2']
    dimensions = (a, b, c, d)

    # A private generator keeps a seeded run from disturbing global state.
    rng = random if seed is None else random.Random(seed)

    # One draw per dimension, in a, b, c, d order, for every record.
    records = [
        [rng.choice(values) for values in dimensions]
        for _ in range(rows)
    ]
    DataFrame(records).to_csv(path, index=False, encoding="utf_8_sig")
# Example of the generated data rows (the header line written by to_csv is not shown)
'''
a1,b3,c1,d2
a2,b4,c1,d2
a4,b2,c1,d2
a1,b2,c1,d2
a1,b2,c1,d1
a5,b1,c1,d1
a5,b4,c3,d2
a5,b3,c2,d2
a2,b1,c3,d1
a1,b3,c3,d1
a4,b3,c1,d2
a3,b3,c1,d1
a5,b4,c2,d1
a2,b4,c1,d2
a1,b2,c3,d2
a2,b3,c2,d1
a5,b4,c3,d1
a3,b2,c1,d1
a3,b1,c2,d1
a1,b1,c2,d2
a3,b2,c3,d1
'''
2.数据预处理
# Load the generated records; read_csv takes the first line of the file as
# the column header, so only the lines after it become data rows.
dataList = pd.read_csv('data.csv')

# Number of dimensions (columns) in the data set.
lenN = dataList.columns.size

# valueOfAll[j] holds the distinct values of dimension j in order of first
# appearance, e.g. [['a1', 'a2', ...], ['b1', 'b2', ...], ...].
# Series.unique() replaces a per-cell scan that looked every cell up twice.
valueOfAll = [dataList.iloc[:, j].unique().tolist() for j in range(lenN)]

# numOfALL[j] is the number of distinct values of dimension j,
# e.g. [5, 4, 3, 2].
numOfALL = [len(values) for values in valueOfAll]

print(valueOfAll)
print(numOfALL)
3.BUC算法实现
def BUC(tempList, n, curN, min_sup=3):
    """Print every value combination whose support reaches ``min_sup``.

    Works bottom-up over dimensions ``curN`` to ``n - 1``. Each value of
    dimension ``curN`` is appended to ``tempList``; when the extended
    combination occurs in at least ``min_sup`` rows it is printed and then
    extended with the following dimensions, otherwise that branch is
    abandoned. Once every value has been tried, dimension ``curN`` is
    skipped altogether and the search continues from the next dimension, so
    that e.g. AC is visited after AB, ABC, ABCD and ABD.

    Reads the module-level ``numOfALL``, ``valueOfAll`` and ``dataList`` and
    calls ``count`` to obtain the support of a combination.

    Args:
        tempList: Combination built so far. It is mutated during the search
            and restored to its incoming content before returning.
        n: Total number of dimensions.
        curN: Index of the dimension to expand next.
        min_sup: Minimum number of matching rows for a combination to be
            reported. Defaults to 3.

    Returns:
        None. Results are printed as ``<combination> :<count>``.
    """
    if curN == n:
        return  # every dimension has been considered

    for i in range(numOfALL[curN]):
        tempList.append(valueOfAll[curN][i])
        # Evaluate the support once and reuse it for the test and the output.
        support = count(tempList, dataList)
        if support >= min_sup:
            print("%s :%d" % (str(tempList), support))
            # Forward min_sup so a caller-supplied threshold is honoured at
            # every depth instead of falling back to the default.
            BUC(tempList, n, curN + 1, min_sup)
        # Always undo the append, whether or not the branch was expanded.
        tempList.pop()

    # Leave this dimension out and partition on the next one.
    BUC(tempList, n, curN + 1, min_sup)
def count(list, data):
    """Return how many rows of ``data`` contain every value in ``list``.

    A row matches when each item of ``list`` is equal to some cell of that
    row, whichever column the cell is in. An empty ``list`` therefore
    matches every row.

    Args:
        list: Values to look for. The name shadows the builtin; it is kept
            unchanged so existing keyword callers continue to work.
        data: pandas DataFrame whose rows are searched.

    Returns:
        The number of matching rows as an int.
    """
    number = 0
    # Materialize each row once rather than once per searched value.
    for row in data.itertuples(index=False, name=None):
        if all(item in row for item in list):
            number += 1
    return number
if __name__ == '__main__':
    # Start from an empty combination at dimension 0 and search across
    # every column of the loaded data (lenN) rather than a hard-coded 4,
    # so the run also works when the CSV has a different number of columns.
    BUC([], lenN, 0)