对读者借书的书目进行关联规则处理,通过关联规则来查找读者借书之间的关系.
首先获取读者证号、索书号列表,由于部分数据的索书号为空,或者出现异常值,所以需要对索书号进行数据清洗:
1、数据清洗
"""
数据要求:read_num, book_id
"""
pf = pd.read_csv('new_data.csv', encoding='gbk')
# print pf.head()
"""
处理缺失数据、异常数据
"""
data = pf[['read_num', 'book_id']].copy()
print data
print len(data) # 182508
print '-----------------------------------------'
# 删除空值
data = data.dropna() # dropna()函数返回一个包含非空数据和索引值的Series
print data
print len(data) # 182427
print '-----------------------------------------'
# 重复判断
data_is_duplicate = data.duplicated()
# print data_is_duplicate
print '-----------------------------------------'
# 去除重复
data = data.drop_duplicates()
print data
print len(data)
print '-----------------------------------------'
# 异常值处理,去除book_id中的异常值,由于book_id的值全部为大写字母
data = data[(data['book_id'] >= 'A') & (data['book_id'] <= 'Z')]
print data
print len(data)
2、标签算法
数据集中的索书号全部都是英文字母,在进行数据分析的过程中必须全部转化为数字,所以这里使用标签算法将所有字母全部转换为数字,便于数据分析的处理。
"""
算法:获取标签
"""
def add_label(s):
    """Map each distinct value in s to a small integer label.

    Labels are assigned in order of first appearance, starting at 1, so
    equal elements always receive equal labels: ['A','B','A'] -> [1, 2, 1].
    Returns [] for an empty input, like the original.

    Fix: the original re-scanned a growing prefix list on every element
    (O(n^2) with list.index / max); a dict lookup gives the same labels
    in O(n).
    """
    seen = {}  # value -> label, in order of first appearance
    labels = []
    for item in s:
        if item not in seen:
            # first sighting: next unused label is len(seen) + 1,
            # which equals max(labels) + 1 in the original
            seen[item] = len(seen) + 1
        labels.append(seen[item])
    return labels
"""
注意:索书号的首位都是大写字母,出现非大写字母时,就将那一条数据删除,减少误差
"""
read_num = data['read_num'].tolist()
book_id = data['book_id'].tolist()
book_id = add_label(book_id)
# print book_id[:10]
# print read_num[:10]
# 转换成二维数组
new_aprior = []
new_aprior.append(read_num)
new_aprior.append(book_id)
m = np.array(new_aprior).T
# print 'm:', m
# print m[1][0]
# print list(m[1])
3、多值处理
数据处理的过程中,机器学习算法的应用必须满足一定的条件,对于算法的输入数据必须满足格式要求,对于关联规则而言,就要将每个顾客的购物放到同一个list中,才能对所有顾客的购物篮进行关联规则算法的应用。
"""
算法:多值处理
"""
# 字典多值处理
res = {} # 多值字典
for item in m:
k = item[0]
if not res.has_key(k): # 给定的键在字典中,就返回true,否则返回false
res[item[0]] = []
res[item[0]].append(item[1])
# print res
print len(res)
# print res
# 将字典的值全部加到一个新的list中
new = []
for item in res:
if res.has_key(item):
new.append(res.get(item)) # 获取键值
# print new # 得到了每个同学借书的书目 (list)
new_array = np.array(new)
print new_array
print '---------------------------------------------------------------'
4、关联规则
关联规则算法的使用,在最小支持度和最小置信度的选取上一定要进行衡量,值选择太大了,就会导致没有结果产生,太小了,产生的结果就没有任何意义了;这里最小支持度为0.1,最小置信度为0.2
最小支持度: 一个项集出现的概率,A、B两件商品同时购买的概率,A、B两本书同时被借出的概率。
最小置信度:购买A商品的基础上,购买B商品的概率,借出A书的基础上,借出B书的概率。
"""
对学生的借书的书目进行关联规则处理,通过关联规则来查找学生借书之间的关系
"""
# def loadDataSet(): # 加载数据集
# return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def createC1(dataSet):  # build the set of all candidate 1-itemsets in the data
    """Return the sorted list of candidate 1-itemsets found in dataSet.

    Each distinct item becomes a size-one frozenset (frozensets are
    hashable, so they can later serve as dict keys for support counts).

    Fix: wrap map() in list() — under Python 3 map() returns a one-shot
    iterator, and scanD() iterates the candidate list once per
    transaction, which would silently skip every pass after the first.
    list(map(...)) is identical to map(...) under Python 2.
    """
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])  # collect each distinct item once, e.g. [1,3,4,2,5]
    C1.sort()
    return list(map(frozenset, C1))
def scanD(D, Ck, minSupport):
    """Filter the candidate itemsets Ck down to those meeting minSupport.

    D          -- list of transactions, each a set of items
    Ck         -- list of candidate k-itemsets (frozensets)
    minSupport -- minimum fraction of transactions an itemset must occur in

    Returns (retList, supportData): retList holds the frequent itemsets,
    supportData maps every candidate that occurred at least once to its
    support.

    Fix: dict.get() counting replaces the Python-2-only dict.has_key()
    branch (as the original's own comment suggested).
    """
    ssCnt = {}
    for tid in D:  # for every transaction in the data set
        for can in Ck:  # for every candidate itemset
            if can.issubset(tid):  # candidate occurs in this transaction: count it
                # get(can, 0) + 1 covers both first and repeat sightings
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems  # fraction of transactions containing key
        if support >= minSupport:
            retList.insert(0, key)  # record itemsets that clear the threshold
        supportData[key] = support
    return retList, supportData
def aprioriGen(Lk, k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two (k-1)-itemsets are merged only when their first k-2 elements
    (sorted) agree: with k=3, {0,1} and {0,2} join into {0,1,2}, while
    {0,1} and {1,2} do not — this prefix rule keeps each candidate from
    being produced more than once.
    """
    candidates = []
    count = len(Lk)
    for i in range(count):
        prefix_i = sorted(list(Lk[i])[:k - 2])
        for j in range(i + 1, count):
            prefix_j = sorted(list(Lk[j])[:k - 2])
            # matching (k-2)-prefixes: union yields a k-itemset
            if prefix_i == prefix_j:
                candidates.append(Lk[i] | Lk[j])
    return candidates
def apriori(dataSet, minSupport=0.1):
    """Run the Apriori algorithm over dataSet.

    Returns (L, supportData): L[k-1] is the list of frequent k-itemsets
    and supportData maps each counted itemset to its support.

    Fix: materialise D with list() — under Python 3 map() yields a
    one-shot iterator, yet D is re-scanned by scanD() on every round,
    so all rounds after the first would see an empty data set.
    list(map(...)) is identical to map(...) under Python 2.
    """
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]  # L accumulates every frequent n-itemset level, starting with the 1-itemsets
    k = 2
    while (len(L[k - 2]) > 0):  # grow level by level until a round yields nothing
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)  # merge this round's supports into the shared dict
        L.append(Lk)
        k += 1
    return L, supportData
# dataSet = loadDataSet() # load the toy 4-transaction demo set instead, for debugging
dataSet = new  # one list of borrowed-book labels per reader
print dataSet
C1 = createC1(dataSet) # candidate 1-itemsets: every distinct item in the data
print "所有候选1项集C1:\n", C1 # e.g. [1,2,3,4,5]
D = map(set, dataSet)  # each basket as a set, for fast subset tests
print "数据集D:\n", D # format: [set(1,3,4), set(2,3,5), set(1,2,3,5), set(2,5)]
L1, supportData0 = scanD(D, C1, 0.1) # frequent 1-itemsets meeting min support 0.1
print "符合最小支持度的频繁1项集L1:\n", L1
L, suppData = apriori(dataSet) # all frequent itemsets at the default min support
print "所有符合最小支持度的项集L:\n", L
print "频繁2项集:\n", aprioriGen(L[0], 2)
L, suppData = apriori(dataSet, minSupport=0.1)
print "所有符合最小支持度为0.1的项集L:\n", L
L, suppData = apriori(dataSet, minSupport=0.2)
print "所有符合最小支持度为0.2的项集L:\n", L
print '-----------------------------------------------------------'
def generateRules(L, supportData, minConf=0.1):
    """Derive association rules from the frequent itemsets in L.

    L           -- output of apriori(): L[i] holds the frequent (i+1)-itemsets
    supportData -- itemset -> support map, also from apriori()
    minConf     -- minimum confidence a rule must reach

    Returns a list of (antecedent, consequent, confidence) tuples.
    L[0] (the 1-itemsets) can produce no rules, so iteration starts at L[1].
    """
    rules = []
    for level in range(1, len(L)):
        for freqSet in L[level]:
            # single-element consequent candidates: {0,1,2} -> [{0},{1},{2}]
            singletons = [frozenset([item]) for item in freqSet]
            if level > 1:
                # 3-itemsets and larger: grow consequents recursively
                rulesFromConseq(freqSet, singletons, supportData, rules, minConf)
            else:
                # 2-itemsets: confidence-test the singleton consequents directly
                calcConf(freqSet, singletons, supportData, rules, minConf)
    return rules
def calcConf(freqSet, H, supportData, br1, minConf=0.1):
    """Keep the consequents in H whose rule meets minConf.

    For each candidate consequent, the rule (freqSet - conseq) -> conseq
    has confidence support(freqSet) / support(freqSet - conseq).
    Qualifying rules are printed, appended to br1 as
    (antecedent, consequent, confidence) tuples, and the consequent is
    collected into the returned prunedH for further rule growth.

    Fix: print is invoked as a function with one pre-formatted string —
    identical output under Python 2 (%s uses str(), exactly like the
    print statement) and valid under Python 3.
    """
    prunedH = []
    for conseq in H:
        # conf({2}) = s({0,1,2}) / s({0,1,2} - {2})
        conf = supportData[freqSet] / supportData[freqSet - conseq]
        if conf >= minConf:
            print("%s ——> %s conf: %s" % (freqSet - conseq, conseq, conf))
            br1.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.1):
    """Recursively grow rule consequents for one frequent itemset.

    H holds candidate consequents of size m; they are confidence-tested,
    then the survivors are merged into (m+1)-sized consequents and the
    function recurses while the antecedent would stay non-empty.

    Fix: the original jumped straight from size-m consequents to
    aprioriGen(H, m+1) without ever calling calcConf on H itself, so
    rules with single-item consequents (e.g. {A,B} -> {C} from a
    3-itemset) were never emitted — a known defect of this textbook
    implementation.
    """
    m = len(H[0])  # current consequent size
    if len(freqSet) > m:  # antecedent must remain non-empty
        # test the current consequents first, keeping only the confident ones
        Hmp1 = calcConf(freqSet, H, supportData, br1, minConf)
        if len(Hmp1) > 1 and len(freqSet) > (m + 1):
            Hmp1 = aprioriGen(Hmp1, m + 1)  # build (m+1)-item consequents from survivors
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
# Mine with min support 0.1, then report rules at two confidence levels
L, suppData = apriori(dataSet, minSupport=0.1) # frequent itemsets meeting min support
rules = generateRules(L, suppData, minConf=0.2) # rules at min confidence 0.2
print rules
print '------------------------------------------------------------------------'
rules = generateRules(L, suppData, minConf=0.5)
print rules
Output:
符合最小支持度的频繁1项集L1:
[frozenset([12]), frozenset([6]), frozenset([4]), frozenset([14]), frozenset([8]), frozenset([11]), frozenset([13]), frozenset([10]), frozenset([5]), frozenset([7]), frozenset([9])]
所有符合最小支持度的项集L:
[[frozenset([12]), frozenset([6]), frozenset([4]), frozenset([14]), frozenset([8]), frozenset([11]), frozenset([13]), frozenset([10]), frozenset([5]), frozenset([7]), frozenset([9])], [frozenset([10, 6]), frozenset([9, 6]), frozenset([6, 7])], []]
频繁2项集:
[frozenset([12, 6]), frozenset([4, 12]), frozenset([12, 14]), frozenset([8, 12]), frozenset([11, 12]), frozenset([12, 13]), frozenset([10, 12]), frozenset([12, 5]), frozenset([12, 7]), frozenset([9, 12]), frozenset([4, 6]), frozenset([14, 6]), frozenset([8, 6]), frozenset([11, 6]), frozenset([13, 6]), frozenset([10, 6]), frozenset([5, 6]), frozenset([6, 7]), frozenset([9, 6]), frozenset([4, 14]), frozenset([8, 4]), frozenset([11, 4]), frozenset([4, 13]), frozenset([10, 4]), frozenset([4, 5]), frozenset([4, 7]), frozenset([9, 4]), frozenset([8, 14]), frozenset([11, 14]), frozenset([13, 14]), frozenset([10, 14]), frozenset([5, 14]), frozenset([14, 7]), frozenset([9, 14]), frozenset([8, 11]), frozenset([8, 13]), frozenset([8, 10]), frozenset([8, 5]), frozenset([8, 7]), frozenset([8, 9]), frozenset([11, 13]), frozenset([10, 11]), frozenset([11, 5]), frozenset([11, 7]), frozenset([9, 11]), frozenset([10, 13]), frozenset([13, 5]), frozenset([13, 7]), frozenset([9, 13]), frozenset([10, 5]), frozenset([10, 7]), frozenset([9, 10]), frozenset([5, 7]), frozenset([9, 5]), frozenset([9, 7])]
所有符合最小支持度为0.1的项集L:
[[frozenset([12]), frozenset([6]), frozenset([4]), frozenset([14]), frozenset([8]), frozenset([11]), frozenset([13]), frozenset([10]), frozenset([5]), frozenset([7]), frozenset([9])], [frozenset([10, 6]), frozenset([9, 6]), frozenset([6, 7])], []]
所有符合最小支持度为0.2的项集L:
[[frozenset([6]), frozenset([10]), frozenset([7]), frozenset([9])], []]
-----------------------------------------------------------
frozenset([6]) ——> frozenset([10]) conf: 0.283821263482
frozenset([10]) ——> frozenset([6]) conf: 0.518143459916
frozenset([6]) ——> frozenset([9]) conf: 0.263790446841
frozenset([9]) ——> frozenset([6]) conf: 0.522269676632
frozenset([7]) ——> frozenset([6]) conf: 0.658473105842
frozenset([6]) ——> frozenset([7]) conf: 0.350847457627
[(frozenset([6]), frozenset([10]), 0.28382126348228043), (frozenset([10]), frozenset([6]), 0.5181434599156117), (frozenset([6]), frozenset([9]), 0.2637904468412943), (frozenset([9]), frozenset([6]), 0.5222696766320928), (frozenset([7]), frozenset([6]), 0.6584731058415269), (frozenset([6]), frozenset([7]), 0.3508474576271186)]
------------------------------------------------------------------------
frozenset([10]) ——> frozenset([6]) conf: 0.518143459916
frozenset([9]) ——> frozenset([6]) conf: 0.522269676632
frozenset([7]) ——> frozenset([6]) conf: 0.658473105842
[(frozenset([10]), frozenset([6]), 0.5181434599156117), (frozenset([9]), frozenset([6]), 0.5222696766320928), (frozenset([7]), frozenset([6]), 0.6584731058415269)]