更多代码请见:https://github.com/xubo245/SparkLearning
1解释
原理在参考【3】中讲的很详细,包括MapReduce情况下的。
源码:
/**
* Run a dynamic version of PageRank returning a graph with vertex attributes containing the
* PageRank and edge attributes containing the normalized edge weight.
*
* @see [[org.apache.spark.graphx.lib.PageRank$#runUntilConvergence]]
*/
def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double] = {
PageRank.runUntilConvergence(graph, tol, resetProb)
}
调用了:
/**
* Run a dynamic version of PageRank returning a graph with vertex attributes containing the
* PageRank and edge attributes containing the normalized edge weight.
*
* @tparam VD the original vertex attribute (not used)
* @tparam ED the original edge attribute (not used)
*
* @param graph the graph on which to compute PageRank
* @param tol the tolerance allowed at convergence (smaller => more accurate).
* @param resetProb the random reset probability (alpha)
*
* @return the graph containing with each vertex containing the PageRank and each edge
* containing the normalized weight.
*/
def runUntilConvergence[VD: ClassTag, ED: ClassTag](
graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15): Graph[Double, Double] =
{
runUntilConvergenceWithOptions(graph, tol, resetProb)
}
/**
* Run a dynamic version of PageRank returning a graph with vertex attributes containing the
* PageRank and edge attributes containing the normalized edge weight.
*
* @tparam VD the original vertex attribute (not used)
* @tparam ED the original edge attribute (not used)
*
* @param graph the graph on which to compute PageRank
* @param tol the tolerance allowed at convergence (smaller => more accurate).
* @param resetProb the random reset probability (alpha)
* @param srcId the source vertex for a Personalized Page Rank (optional)
*
* @return the graph containing with each vertex containing the PageRank and each edge
* containing the normalized weight.
*/
def runUntilConvergenceWithOptions[VD: ClassTag, ED: ClassTag](
graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15,
srcId: Option[VertexId] = None): Graph[Double, Double] =
{
// Initialize the pagerankGraph with each edge attribute
// having weight 1/outDegree and each vertex with attribute 1.0.
val pagerankGraph: Graph[(Double, Double), Double] = graph
// Associate the degree with each vertex
.outerJoinVertices(graph.outDegrees) {
(vid, vdata, deg) => deg.getOrElse(0)
}
// Set the weight on the edges based on the degree
.mapTriplets( e => 1.0 / e.srcAttr )
// Set the vertex attributes to (initalPR, delta = 0)
.mapVertices( (id, attr) => (0.0, 0.0) )
.cache()
val personalized = srcId.isDefined
val src: VertexId = srcId.getOrElse(-1L)
// Define the three functions needed to implement PageRank in the GraphX
// version of Pregel
def vertexProgram(id: VertexId, attr: (Double, Double), msgSum: Double): (Double, Double) = {
val (oldPR, lastDelta) = attr
val newPR = oldPR + (1.0 - resetProb) * msgSum
(newPR, newPR - oldPR)
}
def personalizedVertexProgram(id: VertexId, attr: (Double, Double),
msgSum: Double): (Double, Double) = {
val (oldPR, lastDelta) = attr
var teleport = oldPR
val delta = if (src==id) 1.0 else 0.0
teleport = oldPR*delta
val newPR = teleport + (1.0 - resetProb) * msgSum
(newPR, newPR - oldPR)
}
def sendMessage(edge: EdgeTriplet[(Double, Double), Double]) = {
if (edge.srcAttr._2 > tol) {
Iterator((edge.dstId, edge.srcAttr._2 * edge.attr))
} else {
Iterator.empty
}
}
def messageCombiner(a: Double, b: Double): Double = a + b
// The initial message received by all vertices in PageRank
val initialMessage = resetProb / (1.0 - resetProb)
// Execute a dynamic version of Pregel.
val vp = if (personalized) {
(id: VertexId, attr: (Double, Double), msgSum: Double) =>
personalizedVertexProgram(id, attr, msgSum)
} else {
(id: VertexId, attr: (Double, Double), msgSum: Double) =>
vertexProgram(id, attr, msgSum)
}
Pregel(pagerankGraph, initialMessage, activeDirection = EdgeDirection.Out)(
vp, sendMessage, messageCombiner)
.mapVertices((vid, attr) => attr._1)
} // end of deltaPageRank
2.代码:
/**
* @author xubo
* ref http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html
* time 20160503
*/
package org.apache.spark.graphx.learning
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.graphx.VertexId
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.GraphLoader
object PageRank {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("PageRank").setMaster("local[4]")
val sc = new SparkContext(conf)
// Load the edges as a graph
val graph = GraphLoader.edgeListFile(sc, "file/data/graphx/input/followers.txt")
// Run PageRank
val ranks = graph.pageRank(0.0001).vertices
// Join the ranks with the usernames
val users = sc.textFile("file/data/graphx/input/users.txt").map { line =>
val fields = line.split(",")
(fields(0).toLong, fields(1))
}
val ranksByUsername = users.join(ranks).map {
case (id, (username, rank)) => (username, rank)
}
// Print the result
println(ranksByUsername.collect().mkString("\n"))
}
}
直接调用
<pre code_snippet_id="1671863" snippet_file_name="blog_20160504_3_6713227" name="code" class="plain"> val ranks = graph.pageRank(0.0001).vertices
就行了,0.0001为前后两次收敛的误差阈值,小于这个阈值时则结束计算,越小精度越到
数据:
followers.txt:
2 1
4 1
1 2
6 3
7 3
7 6
6 7
3 7
users.txt<span style="font-family: Arial, Helvetica, sans-serif; font-size: 12px; background-color: rgb(255, 255, 255);">:</span>
1,BarackObama,Barack Obama
2,ladygaga,Goddess of Love
3,jeresig,John Resig
4,justinbieber,Justin Bieber
6,matei_zaharia,Matei Zaharia
7,odersky,Martin Odersky
8,anonsys
图:
<img src="https://img-blog.csdn.net/20160504185030083?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQv/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center" alt="" />
3.结果:
(justinbieber,0.15)
(matei_zaharia,0.7013599933629602)
(ladygaga,1.390049198216498)
(BarackObama,1.4588814096664682)
(jeresig,0.9993442038507723)
(odersky,1.2973176314422592)
第8个顶点由于没有与任何顶点连接,故join时就没了
参考
【1】 http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html
【2】https://github.com/xubo245/SparkLearning
【3】http://blog.jobbole.com/71431/