The Firefox Temporal Defect Dataset
这篇论文是加拿大学者去年夏天从Bugzilla上Firefox的bug反馈讨论组1998~2014年的反馈数据中挖掘得到的一些成果。说是“成果”可能有些抬高了它,其实就是发现了在bug反馈的过程中,开发者之间存在着某种比较频繁的交流模式。具体这种模式是什么样的,在后文会有提及。这与我目前所做的课题很相似,所以写一篇博文记录一下。
首先展示一下数据形式,数据为CSV格式,即逗号分隔符形式,这种形式的好处是直观简洁,易于开发者快速使用,数据形式如下图1:
数据集说明:A列:bug_id,B列:该行为的提出时间,C列:这是一种什么行为,D列:该行为距离bug提出经过了多少天。
每一种行为都由一个字母所表示。
具体的字母含义如下图2所示:
其中相同类型的行为我用同一种颜色来表示,这些行为一般在一个bug中只出现几个,比如:在图1中第1~4行,bug_id=322067的这个bug,只有四种行为就将bug解决了,而bug_id=322075的bug只通过两个行为,一天之内就解决了,可见不同bug的难度不同,处理的过程也是不一样的。
原作者做了什么?
由于这些行为是有时间顺序的,原作者将这些行为进行了基于时间序列的频繁模式挖掘,挖掘出了一些频繁出现的行为模式,如下图3所示:
我们可以看到,二元组存在的行为模式的频度很高,举例:N-C模式,表示先发生了N行为再发生了C行为,它们是有严格的发生次序的,在16年中一共发生了1952次。而五元组N-C-W-C-Z形式同样是有严格的发生次序的,出现了143次。我估计一定还会有六元、七元组的时间序列模式,只不过频度会越来越少而已。
用GSP算法就能轻松求解这些频繁模式,算法参见:http://blog.csdn.net/chixujohnny/article/details/47335911
我要做什么?
我所要做的是使用一种扩展的时间序列频繁模式挖掘,扩展出更多的模式来,比如:二元组C-C,三元组C-C-C,五元组C-C-C-C-C模式,就可以化简为(C)模式,表示多次出现了行为C,具体算法参见:http://blog.csdn.net/chixujohnny/article/details/50569430
这种模式之外我还要再加一种“乱序”的模式,具体算法还没有实现,如果实现了我会在下面加上链接。
预期结果?
对于最终的结果,我不确定能挖掘出里程碑式意义的东西来,但是做学术好歹也要先尝试一下。
预期的结果是,对于我所探索出的扩展型时间序列频繁模式挖掘算法,能够挖掘出想要的结果,并且该模式具有较高的频度,我们才能认为这种模式是比较频繁的(如果频度太低就没什么意义了,基本上等于白挖了)
好了差不多就这样,先写算法吧,有结果我会在下面继续更新:
3.5更新1.3版本源码:
原本我以为不可以剪枝的,经过钻研发现还是有枝可剪的,即min_support == 0的分支(汗),虽然看似这个剪枝减不了多少时间复杂度,但是之前我都是用随机序列测试代码的,这次放上了1w多的数据,原本有生之年算不出的数据居然能算出来了,大笑。但是没有加入时间阈值概念,所以说还是不够完善,时间阈值版本将在2.x版本发布
不多说了上代码,分享学习:Pycharm直接run就行
#coding:utf-8
__author__ = 'ChiXu_15s103144_HIT'
import copy
import sys
import csv
#----------------------------------------------------------#
# 计算Frequent_1 #
#----------------------------------------------------------#
def freq1(dataSet, freq_num):
    """Return the symbols that occur in at least ``freq_num`` sequences.

    A symbol is counted at most once per sequence, no matter how often it
    repeats inside that sequence.

    Args:
        dataSet: Iterable of sequences (e.g. strings of one-letter codes).
        freq_num: Minimum number of sequences a symbol must appear in.

    Returns:
        The frequent symbols as a sorted list, so the result (and every
        candidate order derived from it) is identical from run to run
        instead of depending on set iteration order.
    """
    from collections import Counter

    support = Counter()
    for sequence in dataSet:
        # set() makes each sequence vote once per distinct symbol.
        support.update(set(sequence))

    freq_1 = sorted(
        symbol for symbol, count in support.items() if count >= freq_num
    )
    print('频繁1项集为: %s' % freq_1)
    return freq_1
#----------------------------------------------------------#
# 计算Frequent_more #
#----------------------------------------------------------#
def freq_more(dataSet, freq_num, freq_1):
# Level-wise search for frequent sequences longer than one symbol.
#   dataSet  : the sequences handed to frequentNum() for support counting
#   freq_num : minimum support a candidate needs to be reported as frequent
#   freq_1   : the frequent single symbols used to seed and extend candidates
# Findings are written to stdout; there is no return statement in this
# function, and freqItem is appended to but never read here.
# NOTE(review): indentation was lost in this copy of the file, so the nesting
# described below is inferred from statement order and the original inline
# comments - confirm against a runnable copy.
x = []
queue = []
itemAppear = []     # candidates (as joined strings) seen so far
itemAppearNum = []  # support of the candidate at the same index in itemAppear
freqItem = []
# Seed the queue with one single-element list per frequent symbol.
for a in freq_1:
x.append(a)
queue.append(x)# queue = [['A'],['B'],['D'],['F']]
x = []
while queue != []: # handle the repeated ("multiple") forms first
queueDemo = extendMember(queue, freq_1) # extend every queue member by one symbol, example:[['A', 'A'], ['A', 'B'], ['A', 'D'], ['B', 'A'], ['B', 'B'], ['B', 'D'], ['D', 'A'], ['D', 'B'], ['D', 'D']]
queue = []
for item in queueDemo: # item:['M','W','M']
itemBrief = combinToString(item)
lenth = 3 # look for repeated units of length at most 3
for i in range(lenth):
itemBrief = Brief(itemBrief, i+1) # i+1 is the unit length to look for; itemBrief: 'D(C)(AB)'
itemFreqNum = frequentNum(dataSet, item)
if '(' in itemBrief: # an abbreviated form exists (and it is not a freq1 item)
itemNoBracket = noBracket(itemBrief) # the abbreviated form with brackets removed
if itemFreqNum == 0:
# support 0: print a progress dot and move on to the next candidate
sys.stdout.write('.')
continue
elif len(itemNoBracket) == 1: # the abbreviation collapses to a single symbol (freq1)
freqItem.append(itemNoBracket)
print ('')
sys.stdout.write('遇到频繁1项集 :%s' %combinToString(itemBrief))
continue
elif itemNoBracket in itemAppear:
# The bracket-free form was recorded earlier: record this candidate too
# and add the two supports together before comparing with freq_num.
itemAppear.append(combinToString(item))
itemAppearNum.append(itemFreqNum)
if itemFreqNum + itemAppearNum[itemAppear.index(itemNoBracket)] >= freq_num:
print ('')
sys.stdout.write('缩略 频繁项:%s -> %s' %(combinToString(item), itemBrief))
else:
sys.stdout.write('.')
else: # no abbreviated form
itemAppear.append(combinToString(item))
itemAppearNum.append(itemFreqNum)
if itemFreqNum >= freq_num:
print ('')
sys.stdout.write('无缩略 频繁项:%s' %combinToString(item))
elif itemFreqNum == 0:
sys.stdout.write('.')
continue
else: # 0 < itemFreqNum < freq_num
sys.stdout.write('.')
pass
if itemFreqNum >0: # candidates with support 0 are not extended further
queue.append(item)
#----------------------------------------------------------#
# 将queue成员进行扩展 #
#----------------------------------------------------------#
def extendMember(queue, freq_1):
    """Append every symbol of ``freq_1`` to every member of ``queue``.

    Each member is joined into a string, the symbol is concatenated, and the
    result is split back into a list of single characters.

    Args:
        queue: List of candidates, each a list of strings.
        freq_1: Symbols to append.

    Returns:
        A new list with ``len(queue) * len(freq_1)`` candidates, ordered by
        queue member first and symbol second.
    """
    return [
        list(''.join(prefix) + symbol)
        for prefix in queue
        for symbol in freq_1
    ]
#----------------------------------------------------------#
# 计算item频度 ##
#----------------------------------------------------------#
def frequentNum(dataSet, item):
    """Count the sequences of ``dataSet`` that contain ``item`` in order.

    ``item`` matches a sequence when its symbols can be found there from
    left to right, each one strictly after the previous match; gaps between
    matches are allowed.

    Args:
        dataSet: Iterable of sequences (e.g. strings of one-letter codes).
        item: List of symbols, e.g. ``['A', 'B', 'B', 'D']``.

    Returns:
        Number of sequences matched. An empty ``item`` matches every
        sequence.
    """
    matched = 0
    for sequence in dataSet:
        cursor = iter(sequence)
        # ``symbol in cursor`` consumes the iterator up to and including the
        # first hit, so the next symbol is only searched for after it.
        if all(symbol in cursor for symbol in item):
            matched += 1
    return matched
#----------------------------------------------------------#
# 如果 item 出现过则返回TRUE ##
#----------------------------------------------------------#
def frequentNumOnlyAppear(dataSet, item):
    """Tell whether ``item`` occurs, in order, in at least one sequence.

    The symbols of ``item`` must be found from left to right in a sequence,
    each strictly after the previous match; gaps are allowed.

    Args:
        dataSet: Iterable of indexable sequences (e.g. strings).
        item: List of symbols, e.g. ``['A', 'B', 'B', 'D']``.

    Returns:
        True as soon as one sequence matches, False when none does
        (including when ``dataSet`` is empty).
    """
    for sequence in dataSet:
        size = len(sequence)
        position = 0
        for symbol in item:
            # Advance to the next occurrence of ``symbol``.
            while position < size and sequence[position] != symbol:
                position += 1
            if position == size:
                break  # ran off the end: this sequence cannot match
            position += 1  # continue searching after the match
        else:
            return True
    return False
#----------------------------------------------------------#
# Brief #
#----------------------------------------------------------#
def Brief(item, lenth): # item: 'D(C)ABABAB'  return: 'D(C)(AB)'  lenth: length of each repeated unit
# Abbreviate runs of a repeated unit of `lenth` symbols inside `item` by
# writing the unit once in parentheses, and return the resulting string.
# Parts of `item` that are already parenthesised are carried through: they are
# stored lower-cased while working and re-wrapped in parentheses at the end.
# NOTE(review): indentation was lost in this copy of the file; the step
# comments below follow the nesting suggested by statement order and the
# original inline comments - confirm against a runnable copy.
itemSplit = []
breakFlag = 0 # flag for leaving the while loop early
groupNewDemoList = [] # all candidate abbreviations; the shortest one is taken as the best
finalString = []
minItem = ''
# Step 1: cut `item` into segments outside and inside parentheses.
if '(' in item:
key = 0 # a cursor into item
itemLen = len(item)
while key < itemLen:
string = ''
while key < itemLen and item[key] != '(':
string += item[key]
key += 1
if string != '':
itemSplit.append(string)
key += 1
string = ''
while key < itemLen and item[key] != ')':
# text found between '(' and ')' is stored lower-cased; the final loop
# treats a lower-case first character as "needs parentheses"
string += item[key].lower()
key += 1
if string != '':
itemSplit.append(string)
key += 1
else:
itemSplit.append(item)
# ['DCCCABABAB']
# Step 2: for each segment try every grouping start offset flag = 0..lenth-1.
# NOTE(review): groupNewDemoList is created once above and never cleared in
# this function, so candidates from earlier segments are still in the list
# when the shortest one is picked for a later segment - confirm intended.
for x in itemSplit:
flag = 0 # start offset used when grouping
while flag<lenth:
if len(x) >= 2*lenth: # only then is looking for repeated units meaningful
alphabetList = splitToAlphabet(x)
alphabetList.append('')
alphabetList.insert(0, '') # ['','A','B','A','B','A','B','']
groupNewDemo = []
group = makeGroup(alphabetList, lenth, flag) # group in twos or threes etc.; flag is the start position of the grouping
longestNum = longestItemNum(group, lenth) # how many groups of that size there are
if longestNum == 1: # a single group cannot repeat, break
breakFlag = 1 # flag for leaving the while loop early
break
if longestNum > 1:
groupNew = copy.deepcopy(group)
j = flag + 1
while j<len(groupNew)-1:
if groupNew[j]==groupNew[j+1] and groupNew[j]!=groupNew[j-1] and len(groupNew[j])==len(groupNew[j+1])==lenth: # start of a run: insert '('
groupNew.insert(j, '(')
j += 2
elif groupNew[j]!=groupNew[j+1] and groupNew[j]==groupNew[j-1] and len(groupNew[j])==len(groupNew[j-1])==lenth: # end of a run: insert ')'
groupNew.insert(j+1, ')')
j += 2
else:
j += 1
# example: groupNew = ['','A','(','BD','BD',')','F','']
sign = 1
if '(' in groupNew:
while sign<len(groupNew)-1: # while groupNew still holds unprocessed entries
if groupNew[sign]!='(':
groupNewDemo.append(groupNew[sign])
sign += 1
else: # reached a '('
# emit '(', one copy of the unit and ')', then jump to where ')' was
groupNewDemo.append('(')
groupNew.remove('(')
groupNewDemo.append(groupNew[sign])
groupNewDemo.append(')')
positionBracket = groupNew.index(')')
groupNew.remove(')')
sign = positionBracket
# NOTE(review): the loop variable i is unused; groupNew[sign] is appended on
# every iteration - confirm whether groupNew[i] was intended.
for i in range(sign, len(groupNew)-1):
groupNewDemo.append(groupNew[sign]) # groupNewDemo = ['A','(','BD',')','F']
groupNewDemoList.append(combinToString(groupNewDemo)) # ['(AB)','A(BA)B']
# keep the shortest candidate string collected so far
minLen = 10000
ti = 0
while ti < len(groupNewDemoList):
if len(groupNewDemoList[ti]) < minLen:
minLen = len(groupNewDemoList[ti])
minItem = groupNewDemoList[ti]
ti += 1
else:
ti += 1
else:
breakFlag=1
break
else:
break
if breakFlag == 1:
break
flag += 1
if minItem != '':
finalString.append(minItem)
minItem = ''
else:
finalString.append(x)
minItem = ''
# finalString = ['D','C','(AB)']  below -> 'D(C)(AB)'
# Step 3: join the segments, re-wrapping the lower-cased ones in parentheses.
# NOTE(review): finalString[flag][0] raises IndexError for an empty segment
# (which looks reachable when item is '') - confirm.
flag = 0 # cursor
final = '' # the return value
while flag < len(finalString):
if finalString[flag][0].islower() == True: # lower case: wrap in parentheses
final = final + '(' + finalString[flag].upper() + ')'
flag += 1
else: # upper case, or a bracket
final += finalString[flag]
flag += 1
#print(final)
return final # 'D(C)ABABAB'
#----------------------------------------------------------#
# 将 itemBrief 中的括号全部去掉 #
#----------------------------------------------------------#
def noBracket(itemBrief):
    """Return ``itemBrief`` with every '(' and ')' removed.

    Args:
        itemBrief: Abbreviated pattern string such as ``'D(C)(AB)'``.

    Returns:
        The remaining characters joined in their original order.
    """
    brackets = ('(', ')')
    return ''.join(symbol for symbol in itemBrief if symbol not in brackets)
#----------------------------------------------------------#
# 计算item在全转成大写的特殊列表中存在的次数 #
#----------------------------------------------------------#
def changeSpecialNum(changeSpecial, item):
    """Return how many times ``item`` occurs in ``changeSpecial``.

    Args:
        changeSpecial: Container offering ``count`` (list or string).
        item: Value to look for.

    Returns:
        The occurrence count, 0 when ``item`` is absent.
    """
    # count() already yields 0 for a missing item, so no membership test
    # is needed beforehand.
    return changeSpecial.count(item)
#----------------------------------------------------------#
# 将字符串分解为字母 ##
#----------------------------------------------------------#
def splitToAlphabet(item):
    """Split a sequence into a new list of its elements.

    Args:
        item: A string (split into single characters) or another sequence.

    Returns:
        A fresh list holding the elements of ``item`` in order.
    """
    return list(item)
#----------------------------------------------------------#
# 将字母合成成字符串 ##
#----------------------------------------------------------#
def combinToString(briefItemList):
    """Concatenate the strings of ``briefItemList`` into one string.

    Args:
        briefItemList: Iterable of strings (a list of symbols, or a string).

    Returns:
        The concatenation in order; ``''`` for an empty input.
    """
    return ''.join(briefItemList)
#----------------------------------------------------------#
# 将字符串进行分组 # alphabetList=['','A','B','B','B','D','']
#----------------------------------------------------------#
def makeGroup(alphabetList, num, flag):
    """Group the symbols of a sentinel-padded list into chunks of ``num``.

    ``alphabetList`` carries an empty-string sentinel at both ends, e.g.
    ``['', 'A', 'B', 'B', 'B', 'D', '']``. The first ``flag`` symbols are
    kept single, the following symbols are merged ``num`` at a time, and
    whatever is left over is kept single again.

    Unlike the previous implementation, the caller's list is never
    modified: it used to pop and re-insert the sentinels in place, which
    left the list truncated if an error occurred in between.

    Args:
        alphabetList: Sentinel-padded list of single symbols.
        num: Chunk size; with ``num == 1`` the input list itself is
            returned unchanged.
        flag: Index (among the symbols, sentinels excluded) where chunking
            starts.

    Returns:
        A sentinel-padded list, e.g. ``['', 'AB', 'BB', 'D', '']`` for the
        example above with ``num=2`` and ``flag=0``.
    """
    if num == 1:
        return alphabetList

    symbols = alphabetList[1:-1]  # drop the sentinels without mutating input
    grouped = ['']
    grouped.extend(symbols[:flag])

    position = flag
    while len(symbols) - position >= num:
        grouped.append(''.join(symbols[position:position + num]))
        position += num

    grouped.extend(symbols[position:])  # leftovers stay single
    grouped.append('')
    return grouped
#----------------------------------------------------------#
# 两两一组或三三一组的组数有多少 #
#----------------------------------------------------------#
def longestItemNum(group, lenth):
    """Count the entries of ``group`` that have the greatest length.

    Args:
        group: Sentinel-padded list of strings as built by the grouping
            step, e.g. ``['', 'AB', 'BB', 'D', '']``.
        lenth: Unit length used for grouping. For ``lenth == 1`` the result
            is simply ``len(group) - 2`` (every entry but the two
            sentinels).

    Returns:
        Number of entries whose length equals the maximum length found in
        ``group``; 0 for an empty ``group``.
    """
    if lenth == 1:
        return len(group) - 2

    sizes = [len(entry) for entry in group]
    if not sizes:
        return 0
    return sizes.count(max(sizes))
# main
def _read_action_sequences(csv_path):
    """Read the activity CSV and return one action string per bug.

    Column 0 of every row is used as the bug id and column 2 as the action
    code. Consecutive rows that share a bug id are concatenated, in file
    order, into a single string. Blank rows are skipped.
    """
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(csv_path, 'r', newline='')
    else:
        csvfile = open(csv_path, 'rb')

    sequences = []
    previous_id = None
    with csvfile:  # closed even if a row is malformed
        for row in csv.reader(csvfile):
            if not row:
                continue
            if row[0] != previous_id:  # a new bug id starts a new sequence
                sequences.append('')
                previous_id = row[0]
            sequences[-1] += row[2]
    return sequences


def _read_threshold(prompt):
    """Prompt on stdin and return the answer as an int.

    Python 2's ``input()`` evaluates whatever is typed; reading the raw line
    and converting it with ``int`` avoids executing user input.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    return int(read_line(prompt))


def main():
    """Load the dataset, split it by sequence length and mine patterns.

    The CSV path may be given as the first command-line argument; without
    one the original hard-coded location is used.
    """
    default_path = '/Users/John/Desktop/temporal_activity.csv'
    csv_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    max_bugs = 10000   # only the first 10000 bugs are mined
    max_length = 16    # longer sequences are set aside

    print('文件路径为:%s' % csv_path)
    datasetActions = _read_action_sequences(csv_path)
    print(datasetActions)
    print('数据预处理完毕!Bug总数: %d' % len(datasetActions))

    # Slicing starts at the first bug and copes with fewer than max_bugs
    # entries; the old loop reused a stale index and assumed 10000 rows.
    datasetActions_less = []
    datasetActions_more = []
    for actions in datasetActions[:max_bugs]:
        if len(actions) <= max_length:
            datasetActions_less.append(actions)
        else:
            datasetActions_more.append(actions)
    print('长度不超过%d的社交行为有:%d' % (max_length, len(datasetActions_less)))
    print('长度超过%d的社交行为有:%d' % (max_length, len(datasetActions_more)))

    freq1_num = _read_threshold('请输入freq1最小支持度: ')
    freq_1 = freq1(datasetActions_less, freq1_num)
    freqMore_num = _read_threshold('请输入freq_more最小支持度: ')
    freq_more(datasetActions_less, freqMore_num, freq_1)


if __name__ == '__main__':
    main()