(Reposter's note: while working on a data mining course project on coauthor mining in DBLP, I had no idea where to start until I found this article by Tian Jun, which helped a great deal. I am reposting it for my own future review and for anyone else who needs it. I could not figure out how to contact the author to ask for permission before reposting; my sincere apologies if the author sees this, and I will take it down on request.)
Task description:
The goal of this article is to find authors in the DBLP database who frequently write papers together, while getting familiar with a classic frequent-itemset mining algorithm (FP-Growth) and making some improvements and optimizations to it. The experiment code is written in Python and was run both locally (Windows 8) and on a Hadoop cluster (resources were limited, so the cluster consists of three virtual-machine nodes). All code for this article can be downloaded from https://github.com/findmyway/DBLP-Coauthor
Task breakdown:
- Extract author information from the DBLP dataset
- Build an author ID index and encode the file
- Analyze the scale of the data
- Build the FP-Tree and derive frequent itemsets from it
- Analyze the frequent-itemset mining results
- Feasibility analysis of a parallel FP-Growth algorithm
- Implement FP-Growth on the Hadoop platform
Extracting author information from the DBLP dataset
First, download the DBLP dataset from the official site http://dblp.uni-trier.de/xml/. Only dblp.xml.gz is needed; after decompression you get a dblp.xml file of over 1 GB, which is fairly large. Opening it in vim shows that all author information appears inside the following elements: 'article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis', 'www'.
Here the file is parsed with Python's built-in XML parser (note that the SAX interface is used, since the file is far too large to load as a DOM). The source is below. The core idea: while the parser is inside an author element, flag is set to 1 and the element's text is written to the output file; when the author element ends, a comma separator is written and flag is reset to 0; when one of the paper-level elements listed above ends, a newline is written. The result is the file authors.txt.
import codecs
from xml.sax import handler, make_parser

paper_tag = ('article','inproceedings','proceedings','book',
             'incollection','phdthesis','mastersthesis','www')

class mHandler(handler.ContentHandler):
    def __init__(self, result):
        self.result = result
        self.flag = 0

    def startDocument(self):
        print 'Document Start'

    def endDocument(self):
        print 'Document End'

    def startElement(self, name, attrs):
        if name == 'author':
            self.flag = 1

    def endElement(self, name):
        if name == 'author':
            self.result.write(',')
            self.flag = 0
        if (name in paper_tag):
            self.result.write('\r\n')

    def characters(self, chrs):
        if self.flag:
            self.result.write(chrs)

def parserDblpXml(source, result):
    handler = mHandler(result)
    parser = make_parser()
    parser.setContentHandler(handler)
    parser.parse(source)

if __name__ == '__main__':
    source = codecs.open('dblp.xml', 'r', 'utf-8')
    result = codecs.open('authors.txt', 'w', 'utf-8')
    parserDblpXml(source, result)
    result.close()
    source.close()
Building the author ID index
Read the authors.txt file produced in step 1, assign each distinct author name an integer ID in order of first appearance, store the names in authors_index.txt (one name per line, so the line number is the ID), and write the encoded coauthor lists to authors_encoded.txt.
import codecs

source = codecs.open('authors.txt', 'r', 'utf-8')
result = codecs.open('authors_encoded.txt', 'w', 'utf-8')
index = codecs.open('authors_index.txt', 'w', 'utf-8')
index_dic = {}
name_id = 0
## build an index_dic: key -> authorName, value -> [id, count]
for line in source:
    name_list = line.split(',')
    for name in name_list:
        if not (name == '\r\n'):
            if name in index_dic:
                index_dic[name][1] += 1
            else:
                index_dic[name] = [name_id, 1]
                index.write(name + u'\r\n')
                name_id += 1
            result.write(str(index_dic[name][0]) + u',')
    result.write('\r\n')
source.close()
result.close()
index.close()
(Watch the encoding here, otherwise you will run into rather annoying UnicodeError exceptions; that is why codecs.open with utf-8 is used throughout.)
Analyzing the scale of the data
Next we look at how many papers each author in the DBLP dataset has published, i.e. how many authors published exactly 1 paper, how many published 2 papers, ..., how many published n papers, and plot the distribution.
(Figure: zoomed-in view of the support range from 40 to 200.)
The plot shows that at a support of 40 the number of authors is close to 1000, and that the author count roughly halves for every further increase of 20 in support. To keep the amount of computation manageable, the support threshold for the first run should not be chosen too small, while to avoid getting too few results it should not be too large either; a threshold between 40 and 60 is a reasonable starting point (40 is used in the experiments that follow).
view_data.py
from matplotlib.font_manager import FontProperties
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
import codecs
import matplotlib.pyplot as plt
import numpy as np

data = codecs.open('authors_encoded.txt', 'r', 'utf-8')
word_counts = {}
maxCounts = 0
for line in data:
    line = line.split(',')
    for word in line[0:-1]:
        word_counts[word] = word_counts.get(word, 0) + 1
        if word_counts[word] > maxCounts:
            maxCounts = word_counts[word]
            maxKey = word
xMax = maxCounts
data.close()

bins = {}
for k, v in word_counts.iteritems():
    bins[v] = bins.get(v, 0) + 1
y = []
for i in range(40, 200):
    y.append(bins.get(i, 0))

plt.plot(y, '-')
plt.grid()
plt.yticks(range(0, 1000, 100))
plt.xticks(range(0, 160, 20), range(40, 200, 20))
plt.xlabel(u'支持度', fontproperties=font)
plt.ylabel(u'对应支持度下的作者个数', fontproperties=font)
plt.title(u'作者数量与支持度之间的对应关系', fontproperties=font)
plt.show()
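As a complement to the plot, here is a minimal sketch (reusing the same authors_encoded.txt produced in step 2; the candidate thresholds below are only examples) that counts how many authors would survive a few candidate support values:

import codecs

word_counts = {}
with codecs.open('authors_encoded.txt', 'r', 'utf-8') as data:
    for line in data:
        for word in line.strip().split(',')[0:-1]:
            word_counts[word] = word_counts.get(word, 0) + 1

# How many authors reach each candidate support threshold?
for minSup in (40, 60, 100, 200):
    kept = sum(1 for c in word_counts.itervalues() if c >= minSup)
    print "minSup = %3d -> %d authors kept" % (minSup, kept)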
Building the FP-Tree and deriving frequent itemsets from it
The theory behind FP-Growth is not covered in detail here. Its core idea has two steps: first scan the database to build the FP-Tree, then recursively generate conditional pattern trees from it and walk back up the tree to obtain the frequent itemsets. The core code below is borrowed from Machine Learning in Action (really well written and worth studying in depth).
class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode      # needs to be updated
        self.children = {}

    def inc(self, numOccur):
        self.count += numOccur
def createTree(dataSet, minSup=1):  # create FP-tree from dataset but don't mine
    freqDic = {}
    # go over dataSet twice
    for trans in dataSet:  # first pass counts frequency of occurrence
        for item in trans:
            freqDic[item] = freqDic.get(item, 0) + dataSet[trans]
    headerTable = {k: v for (k, v) in freqDic.iteritems() if v >= minSup}
    if len(headerTable) == 0: return None, None  # if no items meet min support --> get out
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # reformat headerTable to use Node link
    #print 'headerTable: ',headerTable
    retTree = treeNode('Null Set', 1, None)  # create tree
    for tranSet, count in dataSet.items():  # go through dataset 2nd time
        localD = {}
        for item in tranSet:  # put transaction items in order
            if headerTable.get(item, 0):
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)  # populate tree with ordered freq itemset
    return retTree, headerTable  # return tree and header table
def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:  # check if orderedItems[0] in retTree.children
        inTree.children[items[0]].inc(count)  # increment count
    else:  # add items[0] to inTree.children
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] == None:  # update header table
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:  # call updateTree() with remaining ordered items
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)

def updateHeader(nodeToTest, targetNode):  # this version does not use recursion
    while (nodeToTest.nodeLink != None):   # Do not use recursion to traverse a linked list!
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode
def ascendTree(leafNode, prefixPath):  # ascends from leaf node to root
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)

def findPrefixPath(basePat, treeNode):  # treeNode comes from header table
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]  # (sort header table)
    for basePat in bigL:  # start from bottom of header table
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        #print 'finalFrequent Item: ',newFreqSet    # append to set
        if len(newFreqSet) > 1:
            freqItemList[frozenset(newFreqSet)] = headerTable[basePat][0]
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        myCondTree, myHead = createTree(condPattBases, minSup)
        #print 'head from conditional tree: ', myHead
        if myHead != None:  # 3. mine cond. FP-tree
            #print 'conditional tree for: ',newFreqSet
            #myCondTree.disp(1)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
def loadSimpDat(inFile):
    dataSet = {}
    for line in inFile:
        line = line.strip().split(',')
        dataLine = [word for word in line if word.isdigit()]
        dataSet[frozenset(dataLine)] = dataSet.get(frozenset(dataLine), 0) + 1
    return dataSet
if __name__ == "__main__":
    minSup = 100
    print "Reading Source File ... Wait..."
    with open('authors_encoded.txt', 'r') as f:
        dataSet = loadSimpDat(f)
    print "Constructing FP-tree ... Wait..."
    myFPtree, myHeaderTab = createTree(dataSet, minSup)
    print "Mining frequent items ... Wait..."
    myFreqList = {}
    mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    print "Totally %d frequent itemsets found ! " % len(myFreqList)
    print "Constructing authors_index... Wait..."
    maxCoauthors = 0
    for freqAuthors in myFreqList.keys():
        if len(freqAuthors) > maxCoauthors:
            maxCoauthors = len(freqAuthors)
    print "the max num of coauthors is %d " % (maxCoauthors)
    with open('authors_index.txt', 'r') as authorsIndex:
        i = 0
        authorsDic = {}
        for name in authorsIndex:
            name = name.strip()
            authorsDic[i] = name
            i = i + 1
    print "Writing result into result.txt... Wait..."
    with open('result4.txt', 'w') as result2:
        with open('result3.txt', 'w') as result:
            result.write("%25s\t%25s\t%15s\t%10s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\n" \
                         % ('authorA', 'authorB', 'authorC', 'Sup(A,B,C)', 'Sup(A)', 'Sup(B)', 'Sup(C)', \
                            'Con(A)', 'Con(B)', 'Con(C)', 'MinCon', 'MaxCon'))
            result2.write("%25s\t%25s\t%15s\t%10s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\n" \
                          % ('authorA', 'authorB', 'authorC', 'Sup(A,B,C)', 'Sup(A)', 'Sup(B)', 'Sup(C)', \
                             'Con(A)', 'Con(B)', 'Con(C)', 'MinCon', 'MaxCon'))
            resultList = sorted(myFreqList.items(), key=lambda p: p[1], reverse=True)
            for itemSet, support in resultList:
                itemList = list(itemSet)
                A = itemList[0]
                authorA = authorsDic.get(int(A), '0')
                B = itemList[1]
                authorB = authorsDic.get(int(B), '0')
                SupAB_C = int(support)
                SupA = int(myHeaderTab.get(A, [0])[0])
                SupB = int(myHeaderTab.get(B, [0])[0])
                ConA = float(SupAB_C) / float(SupA)
                ConB = float(SupAB_C) / float(SupB)
                (C, authorC, SupC, ConC) = ('', '', 0.0, 0.0)
                if len(itemList) == 3:
                    C = itemList[2]
                    authorC = authorsDic.get(int(C), '0')
                    SupC = int(myHeaderTab.get(C, [0])[0])
                    ConC = float(SupAB_C) / float(SupC)
                    MinCon = min([ConA, ConB, ConC])
                    MaxCon = max([ConA, ConB, ConC])
                elif len(itemList) == 2:
                    MinCon = min([ConA, ConB])
                    MaxCon = max([ConA, ConB])
                if MinCon < 0.4 or MaxCon < 0.5 or (MinCon + MaxCon) / 2 < 0.5:
                    continue
                result.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n" \
                             % (authorA, authorB, authorC, SupAB_C, \
                                SupA, SupB, SupC, ConA, ConB, ConC, MinCon, MaxCon))
                result2.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n" \
                              % (A, B, C, SupAB_C, SupA, SupB, SupC, \
                                 ConA, ConB, ConC, MinCon, MaxCon))
    print "Finished !"
Analysis of the frequent-itemset mining results
With a support threshold of 40 the result set turns out to be very large, more than 2000 itemsets in total. To make the analysis easier, the threshold was raised further to 100, which leaves 111 records. Sorted by the joint support of the coauthors, part of the output looks as follows (screenshot in the original post):
Explanation of the output
Counting the coauthor groups that satisfy the support threshold shows that the largest group of frequently collaborating authors has size 3, so the output file contains the columns authorA, authorB and authorC (when there are only two coauthors, authorC is empty and its support and confidence are 0). Sup(A,B,C) is the number of papers A, B and C wrote together; Sup(A), Sup(B) and Sup(C) are the total number of papers written by A, B and C respectively; Con(A), Con(B) and Con(C) are the confidences of A, B and C (the joint count divided by that author's total count); MinCon and MaxCon are the minimum and maximum of Con(A), Con(B) and Con(C) (note that when authorC is empty, its confidence is excluded from the min/max).
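As a purely hypothetical illustration of these columns: suppose A and B wrote 120 papers together, while A wrote 200 papers in total and B wrote 150. Then:

# Hypothetical counts, only to illustrate the columns above
SupAB = 120             # Sup(A,B): papers A and B wrote together
SupA, SupB = 200, 150   # Sup(A), Sup(B): each author's total paper count

ConA = float(SupAB) / SupA   # 0.6 -> 60% of A's papers are written with B
ConB = float(SupAB) / SupB   # 0.8 -> 80% of B's papers are written with A
MinCon, MaxCon = min(ConA, ConB), max(ConA, ConB)
print ConA, ConB, MinCon, MaxCon   # 0.6 0.8 0.6 0.8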
Analysis of the output
The results above are completely unprocessed; a first look reveals the following:
1. Among the coauthor groups that satisfy the support threshold, most consist of two people, but there are a few groups of three authors who frequently work together;
2. Since we only care about the degree of collaboration between authors, the effect of lift on the results can be ignored;
3. The coauthor relation is symmetric: the degree to which A collaborates with B is the same as the degree to which B collaborates with A, so confidence can be used directly;
4. After sorting by support, some authors have rather low confidence, so a confidence threshold is needed. To avoid strongly unbalanced cases (for example, A collaborates with B often, but those papers make up only a small fraction of B's output), we require MinCon >= 0.3, the larger confidence must satisfy MaxCon >= 0.5, and in addition a balance condition (MinCon + MaxCon)/2 >= 0.5 is imposed. The filtered output is shown below (screenshot in the original post).
A further look at the filtered file shows that the number of records drops to 82, and that very few records have MinCon in the range (0.3, 0.4); the MinCon threshold can therefore reasonably be raised to 0.4.
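A small sketch of that re-filtering step, assuming the tab-separated column layout written by the script above (MinCon and MaxCon are the last two columns of result3.txt):

# Re-filter result3.txt with the tightened thresholds (MinCon >= 0.4).
# Assumes the tab-separated layout written above, with MinCon/MaxCon last.
kept = []
with open('result3.txt') as src:
    header = src.next()
    for row in src:
        fields = row.rstrip('\n').split('\t')
        minCon, maxCon = float(fields[-2]), float(fields[-1])
        if minCon >= 0.4 and maxCon >= 0.5 and (minCon + maxCon) / 2 >= 0.5:
            kept.append(row)
print "%d records remain after re-filtering" % len(kept)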
Visualization:
Here the relationship between an author and his or her coauthors is drawn as a graph, using the top-ranked author Irith Pomeranz as an example.
# -*- coding: utf-8 -*-
import itertools
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import codecs
from matplotlib.font_manager import FontProperties
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc")

def createEdge(nodeX):
    # yield one (coauthor, nodeX) pair for every paper they share
    with codecs.open('authors.txt', 'r', 'utf-8') as f:
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            if nodeX in line and len(line) > 1:
                line.remove(nodeX)
                for author in line:
                    yield (author, nodeX)

def makeFreqDic():
    # count how many papers each author appears on
    print "Creating FreqDic..."
    with codecs.open('authors.txt', 'r', 'utf-8') as f:
        freqDic = {}
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            for author in line:
                freqDic[author] = freqDic.get(author, 0) + 1
        return freqDic
def main(freqDic, nodeX):
    G = nx.Graph()
    print "Adding edge..."
    for A, B in createEdge(nodeX):
        edgeDic = G.get_edge_data(A, B, default={'weight': 0})
        G.add_edge(A, B, weight=edgeDic['weight'] + 1)
    nodes = G.nodes()
    nodes.remove(nodeX)
    shells = [[nodeX], nodes]
    pos = nx.shell_layout(G, shells)
    print "Drawing nodes..."
    nodeSize = [10 * freqDic[n] for n, dic in G.nodes_iter(data=True)]
    nodeColors = np.random.rand(len(nodeSize))
    nx.draw_networkx_nodes(G, pos, node_size=nodeSize, node_color=nodeColors, alpha=0.7)
    print "Drawing edges..."
    edgeWidth = [edata['weight'] / 2 for u, v, edata in G.edges(data=True)]
    edgeColor = np.random.rand(G.number_of_edges())
    nx.draw_networkx_edges(G, pos, width=edgeWidth, edge_color=edgeColor, alpha=0.35)
    print "Adding label..."
    select_labels = {n: n for n, d in G.nodes_iter(data=True) if freqDic[n] >= 80}
    select_labels[nodeX] = nodeX
    nx.draw_networkx_labels(G, pos, labels=select_labels, font_size=8, alpha=0.3)
    title = str(nodeX) + u"与其合作者之间的关系网络"
    plt.title(title, size=15, fontproperties=font)
    plt.text(0.5, 0.94, u"# 节点大小对应该作者发表文章总次数",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.text(0.5, 0.97, u"# 节点之间连线粗细对应该两个作者一起发表文章总次数",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.axis('off')
    fileName = str(nodeX) + ".png"
    plt.savefig(fileName, transparent=True, dpi=500)
    plt.show()

if __name__ == '__main__':
    freqDic = makeFreqDic()
    nodeX = u'Irith Pomeranz'
    main(freqDic, nodeX)
Feasibility analysis of a parallel FP-Growth algorithm
The earliest work on parallelizing FP-Growth can be found in the paper:
Parallel_Frequent_Pattern_Mining.pdf
Its core idea can be illustrated by the figure above (not reproduced here).
The parallel FP-Growth algorithm can be decomposed into two MapReduce passes.
First MapReduce pass
The first MapReduce pass is nothing more than a WordCount: it scans the whole dataset and counts how often each item occurs. Concretely, the Map phase emits the key-value pair <word, 1> for every item, and the Reduce phase sums up the counts for each word (see also the link in the implementation section below); a minimal sketch follows.
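The original first-round code is not reproduced in this post, so here is a minimal Hadoop Streaming style sketch, under the assumption that the input is the comma-separated authors_encoded.txt from step 2; the script names wc_mapper.py and wc_reducer.py are purely illustrative. The mapper emits <authorId, 1> pairs:

#!/usr/bin/env python
# wc_mapper.py (illustrative): emit <authorId, 1> for every author on a paper
import sys

for line in sys.stdin:
    for word in line.strip().split(','):
        if word.isdigit():
            print "%s\t%s" % (word, 1)

and the reducer sums the counts per author id, producing the tab-separated "id<TAB>count" lines that the second-round mapper below reads back via creatDic() (the actual 'sortedList' file used there may additionally be sorted by count):

#!/usr/bin/env python
# wc_reducer.py (illustrative): sum the 1s emitted by the mapper for each id
from itertools import groupby
from operator import itemgetter
import sys

data = (line.strip().split('\t') for line in sys.stdin)
for word, group in groupby(data, itemgetter(0)):
    total = sum(int(count) for key, count in group)
    print "%s\t%d" % (word, total)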
Second MapReduce pass
The second MapReduce pass is the core of the parallel FP-Growth algorithm.
Referring again to the figure:
Map phase:
1. Read the word frequencies produced by the first MapReduce pass into a dictionary;
2. Read the records from the database one by one, sort each record according to that dictionary, and drop the items that do not meet the support threshold;
3. For every element of the sorted record, output its conditional pattern items (to see why, go back to how the FP-Growth algorithm works).
Reduce phase:
1. Collect the conditional pattern items for each element and count how often each item occurs in them;
2. Filter those counts with the support threshold;
3. Generate all subsets of each surviving conditional pattern as the final result (this step is not in the figure; I added it myself; see the sketch below).
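Step 3 matters: without it, a result such as <authorA, authorB, authorC> is emitted but its sub-patterns <authorA, authorB>, <authorA, authorC> and <authorB, authorC> are lost, which is exactly the discrepancy described in the debugging notes at the end of this post. The Reduce script shown below only prints 2-coauthor results, so here is a minimal standalone sketch of the subset expansion (the example ids are made up):

from itertools import combinations

def expandPattern(items):
    # yield every sub-pattern of size >= 2 contained in a frequent pattern
    for size in range(2, len(items) + 1):
        for subset in combinations(sorted(items), size):
            yield subset

# an illustrative frequent pattern of three coauthor ids
for pattern in expandPattern([12, 7, 305]):
    print pattern
# prints (7, 12), (7, 305), (12, 305) and (7, 12, 305)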
Implementing FP-Growth on the Hadoop platform
Once the algorithm above is understood, the implementation is fairly straightforward.
For the first MapReduce pass, see: www.tianjun.ml/essays/19
The Map script of the second pass is as follows:
#!/usr/bin/env python
import sys

def creatDic():
    freqDic = {}
    with open('sortedList', 'r') as sortedList:
        for line in sortedList:
            line = line.strip().split('\t')
            freqDic[int(line[0])] = int(line[1])
    return freqDic

def read_input(inFile):
    for line in inFile:
        yield line.split(',')

def main(freqDic, minSup):
    data = read_input(sys.stdin)
    for names in data:
        names = {name: freqDic[int(name)] for name in names \
                 if name.isdigit() \
                 and freqDic.get(int(name), 0) >= minSup}
        lenth = len(names)
        if lenth >= 2:
            conPatItems = [name for name, value in \
                           sorted(names.iteritems(), \
                                  key=lambda p: p[1])]
            for i in range(lenth - 1):
                print "%s\t%s" % (conPatItems[i], conPatItems[i + 1::])
        else:
            continue

if __name__ == '__main__':
    support = 100
    dic = creatDic()
    main(dic, support)
The Reduce script of the second pass is as follows:
#!/usr/bin/env python
from itertools import groupby
from operator import itemgetter
import sys

def readMapOutput(file):
    for line in file:
        yield line.strip().split('\t')

def main(minSup):
    data = readMapOutput(sys.stdin)
    for currentName, group in groupby(data, itemgetter(0)):
        localDic = {}
        try:
            for currentName, conPatItems in group:
                conPatItems = conPatItems.strip().strip('[').strip(']')
                #print "%s\t%s" % (currentName, conPatItems)
                itemList = conPatItems.split(',')
                for item in itemList:
                    item = item.strip().strip("'")
                    item = int(item)
                    localDic[item] = localDic.get(item, 0) + 1
            resultDic = {k: v for k, v in localDic.iteritems() \
                         if v >= minSup}
            # Here we just print out 2-coauthors
            if len(resultDic) >= 1:
                print "%s\t%s" % (currentName, resultDic.items())
        except:
            print "%s\t%s" % ("inner err", "sorry!")
            pass

if __name__ == "__main__":
    support = 100
    main(support)
Summary:
This experiment implemented coauthor mining on the DBLP dataset both locally and on a Hadoop cluster. Because of limited resources the Hadoop cluster ran on virtual machines, so it is hard to make a fair comparison between single-machine and distributed running times; however, the analysis in Parallel_Frequent_Pattern_Mining.pdf shows that on large datasets the distributed FP-Growth algorithm is far more efficient than the conventional one.
On debugging
MapReduce jobs are considerably harder to debug than ordinary programs. One suggestion is to first set the number of Reduce tasks to 0 and check whether the Map output is what you expect, or to print the grouped Map output received inside the Reduce step to make further analysis easier.
One rather painful problem came up while writing the code: the distributed output differed from the single-machine output! It took a long time to discover that the distributed run did not mine the complete set of frequent itemsets: for example, if an output <key, value> has value <authorA, authorB, authorC>, its subsets <authorA, authorB>, <authorA, authorC> and <authorB, authorC> were never emitted, so the distributed run produced fewer results than the single-machine run.
And finally, I cannot help complaining one more time about encoding and string handling...