分级聚类是通过连续不断地将最为相似的群组两两合并,来构造出一个群组的层级结构。其中的每个群组都是从单一元素开始的。如图所示:
元素的相似程度是通过它们的相对位置来体现的,距离越近越相似。两两合并,直到合并最后两个群组。
聚类是无监督学习的一个例子。与神经网络或决策树不同,无监督学习算法不是利用带有正确答案的样本数据进行“训练”。它们的目的是要在一组数据中找寻某种结构,而这些数据本身不是我们要找的答案。聚类算法的目标是采集数据,然后从中找出不同的群组。
下面我就利用分级聚类算法对iteye和csdn的博客进行分类。
1.准备数据。利用feedparser解析博客rss订阅源,利用jieba中文拆词获取关键词及其出现次数。
2.合并数据。利用皮尔逊相关度算法计算数据间的距离,两两合并距离最近(即最相似)的数组,直到只剩下一个群组为止。
3.绘制树状图。利用PIL将合并后的数据做成图片,以此清晰地展现数据结构。
下面开始第一步:
安装feedparser.
feedparser是一个python库,可以用以分析RSS和Atom的订阅源。下载地址https://code.google.com/p/feedparser/downloads/list。解压后在feedparser目录下命令行输入python setup.py install。
下面去寻找一些iteye和csdn博客的订阅源。下面是我随便找的10个,写入到feedlist.txt中。
- http://blog.csdn.net/xxg2810/rss/list
- http://lobert.iteye.com/rss
- http://blog.csdn.net/lfsf802/rss/list
- http://blog.csdn.net/david_lv/rss/list
- http://blog.csdn.net/m13666368773/rss/list
- http://blog.csdn.net/shulianghan/rss/list
- http://blog.csdn.net/withiter/rss/list
- http://blog.csdn.net/lfsf802/rss/list
- http://blog.csdn.net/pan_tian/rss/list
- http://blog.csdn.net/beijiguangyong/rss/list
因为我们在天朝,所以用的当然是中文,因此需要一个分词库,这里我使用jieba(开始使用的是mmseg-cpp,但在实际运行中会出现卡死的现象)。
下载地址:https://github.com/fxsjy/jieba,解压后将文件夹直接放到python默认库地址即可。
准备工作完毕,开始mark了。
- #encoding=utf-8 #注意编码
- import sys
- from imp import reload
- reload(sys)
- import feedparser
- import re
- import jieba
# Return the feed title and a word-frequency dictionary for an RSS feed.
def getwordcounts(url):
    """Parse the RSS feed at *url* and tally how often each word occurs.

    Returns a tuple ``(feed_title, {word: count})``.
    """
    parsed = feedparser.parse(url)
    counts = {}
    for entry in parsed.entries:
        # Prefer the full summary; fall back to the short description.
        text = entry.summary if 'summary' in entry else entry.description
        # Tokenize title + body and count every token.
        for token in getwords(entry.title + ' ' + text):
            counts[token] = counts.get(token, 0) + 1
    return parsed.feed.title, counts
def getwords(html):
    """Strip HTML tags from *html* and return lowercased word tokens."""
    # Remove anything that looks like an HTML tag before segmenting.
    plain = re.sub(r'<[^>]+>', '', html)
    # jieba's full mode emits every possible word segmentation.
    tokens = []
    for tok in jieba.cut(plain, cut_all=True):
        if tok != '':
            tokens.append(tok.lower())
    return tokens
# ---- Build the blog/word-count matrix and write it to blogdata1.txt ----
apcount = {}     # number of feeds in which each word appears (count > 1)
wordcounts = {}  # {blog title: {word: count}}

# Read the subscription URLs; strip whitespace so feedparser gets clean
# URLs (the original passed lines with trailing newlines).
with open('feedlist.txt') as feedfile:
    feedlist = [line.strip() for line in feedfile if line.strip()]

for feedurl in feedlist:
    try:
        title, wc = getwordcounts(feedurl)
        wordcounts[title] = wc
        for word, count in wc.items():
            apcount.setdefault(word, 0)
            if count > 1:
                apcount[word] += 1
    except Exception:
        # A feed that fails to download/parse must not abort the whole run,
        # but never use a bare except (it would swallow KeyboardInterrupt).
        print('Failed to parse feed %s' % feedurl)

# Words like "我" appear in nearly every blog, while words like "PYTHON"
# may appear in only one or two; keeping only words whose document
# frequency lies between 10% and 50% trims the vocabulary to the
# informative middle range.
wordlist = []
for w, bc in apcount.items():
    frac = float(bc) / len(feedlist)
    if 0.1 < frac < 0.5:
        wordlist.append(w)

# Emit a tab-separated matrix: one row per blog, one column per word.
# The with-statement guarantees the file is flushed and closed.
with open('blogdata1.txt', 'w') as out:
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word.strip())
    out.write('\n')
    for blog, wc in wordcounts.items():
        out.write(blog)
        for word in wordlist:
            out.write('\t%d' % wc.get(word, 0))
        out.write('\n')
程序运行后数据保存到文件中,我这里跑了40s,如无意外,出现的是一个表格。其中的每一列对应一个单词,每一行对应一个博客。表格的数字就是单词出现在博客中的次数。
由于宽度原因,因此只截取部分数据。
到此,数据准备完毕。
第二步:mark
def readfile(filename):
    """Load the tab-separated matrix written by the preparation step.

    The first line holds the column titles, the first cell of every other
    line holds the row name, and the remaining cells are numeric counts.

    Returns ``(rownames, colnames, data)`` where *data* is a list of
    float rows.
    """
    # file() exists only in Python 2 and the original never closed the
    # handle; open() inside a with-block fixes both problems.
    with open(filename) as f:
        lines = f.readlines()
    # First line is the column titles.
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        parts = line.strip().split('\t')
        # First column in each row is the row name.
        rownames.append(parts[0])
        # The rest of the row is numeric data.
        data.append([float(x) for x in parts[1:]])
    return rownames, colnames, data
from math import sqrt

def pearson(v1, v2):
    """Return 1.0 minus the Pearson correlation of two equal-length vectors.

    The result behaves as a distance: 0.0 for perfectly correlated
    vectors, 2.0 for perfectly anti-correlated ones.  Returns 0 when the
    denominator is zero (a constant vector has no defined correlation).
    """
    n = len(v1)
    # Simple sums.
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares.
    sum1_sq = sum(v * v for v in v1)
    sum2_sq = sum(v * v for v in v2)
    # Sum of the products.
    p_sum = sum(a * b for a, b in zip(v1, v2))
    # float(n) guards against integer truncation when the inputs happen
    # to be ints (Python 2 `/` is floor division on ints).
    num = p_sum - (sum1 * sum2 / float(n))
    den = sqrt((sum1_sq - sum1 ** 2 / float(n)) *
               (sum2_sq - sum2 ** 2 / float(n)))
    if den == 0:
        return 0
    return 1.0 - num / den
class bicluster:
    """A node in the hierarchical-clustering tree.

    Leaves carry a data row and a non-negative id; merged (internal)
    nodes carry the averaged vector, the two child clusters, the merge
    distance, and a negative id.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None):
        self.vec = vec
        self.left = left
        self.right = right
        self.distance = distance
        self.id = id
def hcluster(rows, distance=pearson):
    """Hierarchically cluster *rows* and return the root bicluster.

    Repeatedly merges the two closest clusters (per *distance*) until a
    single tree remains.  Leaves keep their row index as id; merged
    clusters, which are not in the original set, get negative ids.
    """
    distances = {}       # cache: (id_a, id_b) -> distance
    currentclustid = -1

    # Clusters are initially just the rows.
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = None
        # Loop through every pair looking for the smallest distance.
        # (The original seeded `closest` with an uncached call to
        # distance(); routing everything through the cache avoids
        # recomputing pair (0, 1) on every iteration.)
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if closest is None or d < closest:
                    closest = d
                    lowestpair = (i, j)

        # The merged cluster's vector is the element-wise average of the
        # two closest clusters.
        a = clust[lowestpair[0]]
        b = clust[lowestpair[1]]
        mergevec = [(a.vec[i] + b.vec[i]) / 2.0 for i in range(len(a.vec))]

        # Create the new cluster; merged clusters get negative ids.
        newcluster = bicluster(mergevec, left=a, right=b,
                               distance=closest, id=currentclustid)
        currentclustid -= 1

        # Delete the higher index first so the lower one stays valid.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]
第三步:借助树状图,我们可以更直观地理解聚类结果。
安装PIL,由于我安装了python(x,y),里面已经包含了PIL,因此无需安装。
- from PIL import Image,ImageDraw,ImageFont
# Overall height of a cluster subtree, used to size the dendrogram image.
def getheight(clust):
    """Return the number of leaves under *clust* (a leaf counts as 1)."""
    # An endpoint occupies exactly one row of the drawing.
    if clust.left is None and clust.right is None:
        return 1
    # An internal node stacks both branches vertically.
    return getheight(clust.left) + getheight(clust.right)
# Overall depth of a cluster subtree, used to scale merge distances.
def getdepth(clust):
    """Return the largest cumulative merge distance on any root-to-leaf
    path under *clust*.  An endpoint contributes 0."""
    # The distance of an endpoint is 0.0.
    if clust.left is None and clust.right is None:
        return 0
    # A branch node's depth is the deeper of its two children plus its
    # own merge distance.
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance
# Render the whole cluster tree to a JPEG dendrogram.
def drawdendrogram(clust, labels, jpeg='clusters.jpg'):
    """Draw *clust* as a dendrogram and save it to *jpeg*.

    The image height grows with the number of leaves (30px per leaf);
    the width is fixed and merge distances are scaled to fit it.
    """
    # Height and width of the canvas.
    height = getheight(clust) * 30
    width = 1200
    depth = getdepth(clust)
    # Width is fixed, so scale horizontal distances accordingly
    # (350px are reserved for the label column on the right).
    scaling = float(width - 350) / depth
    # White canvas with a short red stub leading to the root node.
    img = Image.new('RGB', (width, height), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, height / 2, 10, height / 2), fill=(255, 0, 0))
    # Draw the tree recursively starting from the root.
    drawnode(draw, clust, 10, height / 2, scaling, labels)
    img.save(jpeg, 'JPEG')
def drawnode(draw,clust,x,y,scaling,labels):
    # Recursively draw one cluster node at (x, y).
    # Internal nodes (negative id) become a bracket joining the two
    # children; leaves (non-negative id) become a text label.
    if clust.id<0:
        # Pixel heights of the two subtrees (20px per leaf here, vs. 30px
        # used for the canvas in drawdendrogram -- presumably extra
        # padding; confirm against the rendered output).
        h1=getheight(clust.left)*20
        h2=getheight(clust.right)*20
        top=y-(h1+h2)/2
        bottom=y+(h1+h2)/2
        # Line length: horizontal extent is proportional to merge distance.
        ll=clust.distance*scaling
        # Vertical line from this cluster to children
        draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0))
        # Horizontal line to left item
        draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0))
        # Horizontal line to right item
        draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0))
        # Call the function to draw the left and right nodes
        drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels)
        drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels)
    else:
        # SimSun font is needed for the CJK glyphs in the blog titles.
        font = ImageFont.truetype('simsun.ttc',24)
        # If this is an endpoint, draw the item label
        # NOTE(review): unicode() is Python 2 only; on Python 3 the str
        # label should be passed directly -- confirm target interpreter.
        draw.text((x+5,y-7),unicode(labels[clust.id],'utf-8'),(0,0,0),font=font)
最后我们写个test跑下
# encoding=utf-8
# Driver script: cluster the prepared blog data and render the dendrogram.
import clusters  # contains the step-2 and step-3 code

blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)
# printclust (a text dump of the tree) comes from the book's full listing
# and is NOT defined in the code shown above; calling it unconditionally
# raises AttributeError, so only invoke it when it exists.
if hasattr(clusters, 'printclust'):
    clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg')
生成图片: