Implementation of the PageRank algorithm in Python.
run: ./**.py -o <output file> -d <damping factor> -e <epsilon> <infile>
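data format:
first line: the max node number, given as the second space-separated field
the remaining lines: node id:link node count followed by the linked node ids, space-separated (node ids start at 1)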
from __future__ import division
#!usr/bin/python
import sys
import getopt
import scipy.sparse as sp
from scipy.spatial import distance
from numpy import *
import pdb
def receiveArguments(argv):
    """Parse the command-line arguments of the PageRank script.

    Recognised options are ``-d`` (damping factor), ``-e`` (epsilon) and
    ``-o`` (output file); the first positional argument is the input file.
    getopt stops scanning at the first non-option argument, so the options
    have to be given before the input file.

    Args:
        argv: argument list without the program name, e.g. ``sys.argv[1:]``.

    Returns:
        The tuple ``(dfactor, epsilon, output, infile)`` with every element
        still an unconverted string, or ``None`` (after printing a short
        message) when an option is not recognised or when any of the four
        values is missing or empty.
    """
    dfactor = ''
    epsilon = ''
    output = ''
    try:
        opts, args = getopt.getopt(argv, 'd:e:o:', [])
    except getopt.GetoptError:
        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3, unlike the bare print statement.
        print('Wrong arguments')
        return None
    for name, value in opts:
        if name == '-d':
            dfactor = value
        elif name == '-e':
            epsilon = value
        elif name == '-o':
            output = value
    # An option passed with an empty value counts as missing.
    if dfactor == '' or epsilon == '' or output == '' or len(args) == 0:
        print('Lack arguments')
        return None
    return dfactor, epsilon, output, args[0]
# Script body: parse the arguments, load the link graph, iterate the
# PageRank update until it converges, then write one "id:score" line per node.
parsedArgs = receiveArguments(sys.argv[1:])
if parsedArgs is None:
    # receiveArguments has already printed the reason; stop cleanly instead
    # of failing with a TypeError while unpacking None.
    sys.exit(1)
d, e, outfile, infile = parsedArgs
d = float(d)  # damping factor
e = float(e)  # convergence threshold on the Euclidean distance

# "with" guarantees the input file is closed even if reading fails.
with open(infile, 'r') as pf:
    fileList = pf.read().splitlines()

# Header line: the node count is its second whitespace-separated field.
maxNode = int(fileList[0].split()[1])

addP = ones(maxNode)  # 1.0 marks a node without out-links (dangling)
p2 = ones(maxNode)  # current rank estimate
p1 = ones(maxNode) * 100  # previous estimate, set far away from p2
smoothP = p2 * (1 - d)  # constant (1 - d) term added to every node

# Coordinate lists of the sparse transition matrix: entry (target, source)
# holds 1/outNum for every link source -> target.
rowIndex = []
colIndex = []
data = []
for line in fileList[1:]:
    if not line.strip():
        # Tolerate blank lines instead of crashing on int('').
        continue
    line = line.split(':')
    nodeId = int(line[0])
    # split() without an argument ignores repeated or trailing blanks,
    # which split(' ') turned into empty fields.
    listing = line[1].split()
    outNum = int(listing[0])
    targets = listing[1:]
    if not targets:
        # A listed node without link targets is still dangling; clearing
        # its flag would make its rank vanish on every iteration.
        continue
    addP[nodeId - 1] = 0
    for node in targets:
        rowIndex.append(int(node) - 1)
        colIndex.append(nodeId - 1)
        data.append(1.0 / outNum)
del fileList  # release the raw text before building the matrix

smatrixA = sp.csc_matrix(
    (array(data), (array(rowIndex), array(colIndex))),
    shape=(maxNode, maxNode))

# The rank of a dangling node is shared among the other maxNode - 1 nodes.
# With a single node there are none, so avoid a 0/0 that would yield NaN.
otherNodes = maxNode - 1 if maxNode > 1 else 1
while distance.euclidean(p1, p2) > e:
    p1 = p2
    add = dot(p1, addP)  # total rank currently held by dangling nodes
    # (add - addP * p1): every node receives the dangling total minus its
    # own contribution, i.e. a dangling node does not feed itself.
    p2 = smoothP + d * (smatrixA.dot(p1) + (add - addP * p1) / otherNodes)
del smatrixA

# Scores are rounded to six decimals; node ids are written 1-based.
# join() avoids the quadratic cost of repeated string concatenation.
result = ''.join(
    str(nodeId + 1) + ':' + str(float('%.6f' % score)) + '\n'
    for nodeId, score in enumerate(p2))
with open(outfile, 'w') as pf:
    pf.write(result)