/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.giraph.examples;
import org.apache.giraph.graph.BasicComputation;
import org.apache.giraph.graph.DefaultVertex;
import org.apache.giraph.graph.GraphTaskManager;
import org.apache.giraph.edge.ArrayListEdges;
import org.apache.giraph.edge.Edge;
import org.apache.giraph.edge.EdgeFactory;
import org.apache.giraph.edge.EdgeNoValue;
import org.apache.giraph.graph.Vertex;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Giraph computation that rebuilds ("recodes") the edge list of every vertex
 * over four supersteps by exchanging vertex ids as messages:
 * <ol>
 * <li>superstep 0: every vertex sends its id along all of its out-edges;</li>
 * <li>superstep 1: every vertex clears its out-edges and stores the ids it
 * received (its in-neighbours) in {@code DefaultVertex.inEdges};</li>
 * <li>superstep 2: every vertex sends its current id to each stored
 * in-neighbour;</li>
 * <li>superstep 3: every vertex replaces its edge list with one edge per
 * received id and votes to halt.</li>
 * </ol>
 *
 * NOTE(review): {@code DefaultVertex.inEdges}, {@code ArrayListEdges.clear()},
 * {@code ArrayListEdges.printEdges()} and the accessible
 * {@code GraphTaskManager.LOG} appear to come from a locally patched Giraph
 * build; confirm before running against a stock release.
 *
 * NOTE(review): the {@code @Algorithm} metadata below still says
 * "Connected components" (it looks copied from the connected-components
 * example). It is left unchanged here because it is visible at runtime.
 */
@Algorithm(
name = "Connected components",
description = "Finds connected components of the graph"
)
public class RecodeComputation extends
    BasicComputation<IntWritable, IntWritable, NullWritable, IntWritable> {

  /**
   * Dispatches to the step that belongs to the current superstep. Supersteps
   * other than 0-3 do nothing.
   *
   * Original author's outline (translated): (1) superstep 0 writes the vertex
   * count of each partition to ZooKeeper; (2) in superstep 1 all partitions
   * update their ids based on ZooKeeper and send the updated vertex messages;
   * (3) the received vertex ids are updated. No ZooKeeper access is visible
   * in this class, so that part is presumably done elsewhere (TODO confirm).
   *
   * @param vertex   the vertex being computed
   * @param messages ids sent to this vertex in the previous superstep
   * @throws IOException declared by the overridden method
   */
  @Override
  public void compute(
      Vertex<IntWritable, IntWritable, NullWritable> vertex,
      Iterable<IntWritable> messages) throws IOException {
    long superstep = getSuperstep();
    if (superstep == 0) {
      GraphTaskManager.LOG.info("Vertex " + vertex.toString());
      // Send the source vertex id to every target vertex.
      sendMessageToAllEdges(vertex, vertex.getId());
    } else if (superstep == 1) {
      collectInNeighbors(vertex, messages);
    } else if (superstep == 2) {
      notifyInNeighbors(vertex);
    } else if (superstep == 3) {
      rebuildOutEdges(vertex, messages);
    }
  }

  /**
   * Superstep 1: clears the out-edge list (so it can be reused later) and
   * records every received sender id as an in-neighbour of this vertex.
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private void collectInNeighbors(
      Vertex<IntWritable, IntWritable, NullWritable> vertex,
      Iterable<IntWritable> messages) {
    ((ArrayListEdges) vertex.getEdges()).clear();

    ArrayList<IntWritable> inEdges = inNeighbors(vertex);
    for (IntWritable srcId : messages) {
      GraphTaskManager.LOG.info("Received Messags : " + srcId.toString());
      // Store a copy: Giraph only guarantees a message object until the
      // iterator advances, so keeping the reference would alias one object.
      inEdges.add(new IntWritable(srcId.get()));
    }

    // Local, single-threaded buffer, so StringBuilder is sufficient.
    StringBuilder sb = new StringBuilder();
    for (IntWritable inEdge : inEdges) {
      GraphTaskManager.LOG.info("inEdge " + inEdge.toString());
      sb.append(inEdge.toString());
    }
    GraphTaskManager.LOG.info("New Edges " + vertex.toString() + sb.toString()
        + " at step " + getSuperstep());
  }

  /**
   * Superstep 2: sends this vertex's current id to every in-neighbour that
   * was recorded in superstep 1.
   *
   * Original author's note (translated): each partition derives the starting
   * code for its vertices from the vertex counts and renumbers its vertices
   * in increasing order before the new id is sent. That renumbering is not
   * performed in this class (TODO confirm where it happens).
   */
  private void notifyInNeighbors(
      Vertex<IntWritable, IntWritable, NullWritable> vertex) {
    for (IntWritable srcId : inNeighbors(vertex)) {
      GraphTaskManager.LOG.info("Send New id : " + vertex.toString() + " to "
          + srcId + " at step " + getSuperstep());
      // Send our own (new) id back along the original in-edge.
      sendMessage(srcId, vertex.getId());
    }
  }

  /**
   * Superstep 3: replaces the edge list with one edge per received id and
   * votes to halt.
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private void rebuildOutEdges(
      Vertex<IntWritable, IntWritable, NullWritable> vertex,
      Iterable<IntWritable> messages) {
    ArrayListEdges edges = (ArrayListEdges) vertex.getEdges();
    // Original note (translated): at this point "edges" is said to actually
    // hold the in-edge set.
    edges.clear();
    for (IntWritable srcId : messages) {
      // Copy the id for the same message-reuse reason as in superstep 1.
      edges.add(EdgeFactory.createReusable(new IntWritable(srcId.get())));
    }
    GraphTaskManager.LOG.info("new Edges " + vertex.toString()
        + edges.printEdges() + " at step " + getSuperstep());
    vertex.voteToHalt();
  }

  /**
   * Returns the in-neighbour list kept on the vertex. The raw cast is
   * confined to this helper.
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private static ArrayList<IntWritable> inNeighbors(
      Vertex<IntWritable, IntWritable, NullWritable> vertex) {
    return ((DefaultVertex) vertex).inEdges;
  }
}
自定义Giraph1.1.0的重编码计算类,其中超步3中如果这样写:
for (IntWritable srcId : messages) {
((ArrayListEdges)vertex.getEdges()).add(EdgeFactory.createReusable(srcId)) ;
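在以上写法中,会造成迭代中值重复,当时原因还没想明白。实际原因是:Giraph 在遍历 messages 时会复用同一个 IntWritable 对象(每次 next() 只是把新值写入该对象),直接保存 srcId 的引用会使所有新建的边都指向同一个对象,最终全部变成最后一条消息的值;因此需要用 new IntWritable(srcId.get()) 复制一份再保存。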
修改后的重编码类:
public class RecodeComputation extends
    BasicComputation<IntWritable, IntWritable, NullWritable, IntWritable> {

  /**
   * Runs one step of the four-superstep edge rebuild:
   * superstep 0 sends this vertex's id along all out-edges; superstep 1
   * stores the received ids in {@code DefaultVertex.inEdges}; superstep 2
   * sends this vertex's current id to each stored id; superstep 3 clears the
   * edge list, adds one edge per received id and votes to halt. Any other
   * superstep does nothing.
   *
   * Original author's outline (translated): (1) superstep 0 writes the vertex
   * count of each partition to ZooKeeper; (2) in superstep 1 all partitions
   * update their ids based on ZooKeeper and send the updated vertex messages;
   * (3) the received vertex ids are updated. No ZooKeeper access or id
   * renumbering is visible in this method (TODO confirm where it happens).
   *
   * @param vertex   the vertex being computed
   * @param messages ids sent to this vertex in the previous superstep
   * @throws IOException declared by the overridden method
   */
  @Override
  public void compute(
      Vertex<IntWritable, IntWritable, NullWritable> vertex,
      Iterable<IntWritable> messages) throws IOException {
    final long step = getSuperstep();

    if (step == 0) {
      // Tell every target vertex who its source is.
      sendMessageToAllEdges(vertex, vertex.getId());
    } else if (step == 1) {
      // Remember each sender as an in-neighbour; the value is copied into a
      // fresh IntWritable rather than keeping the message object itself.
      for (IntWritable sender : messages) {
        IntWritable senderCopy = new IntWritable(sender.get());
        ((DefaultVertex) vertex).inEdges.add(senderCopy);
      }
    } else if (step == 2) {
      // Send our own id back to every in-neighbour gathered in superstep 1.
      ArrayList<IntWritable> recipients = ((DefaultVertex) vertex).inEdges;
      for (IntWritable recipient : recipients) {
        sendMessage(recipient, vertex.getId());
      }
    } else if (step == 3) {
      // The ids received now become the new edge list.
      ArrayListEdges staleEdges = (ArrayListEdges) vertex.getEdges();
      staleEdges.clear();
      for (IntWritable neighbor : messages) {
        IntWritable target = new IntWritable(neighbor.get());
        ((ArrayListEdges) vertex.getEdges())
            .add(EdgeFactory.createReusable(target));
      }
      vertex.voteToHalt();
    }
  }
}