Mining Maximal Cliques in a Graph with Spark GraphX: A Pseudo-Parallel Algorithm

Background:

  1. Spark GraphX does not ship a maximal clique mining algorithm.
  2. Existing maximal clique algorithms are serial, based on the Bron–Kerbosch algorithm.

Approach:

  1. Spark GraphX does provide a connected components algorithm. Connected components and maximal cliques are both concepts on undirected graphs, and every maximal clique is contained within a single connected component.
  2. Use Spark GraphX to find the connected components, then run a serial maximal clique algorithm inside each component (hence "pseudo-parallel").
  3. For densely connected graphs the resulting components can be very large, and the serial clique algorithm on them still takes a long time. Pruning is therefore used to shrink the sample first: a vertex of degree less than count - 1 can never appear in a clique of count vertices, so it and its incident edges can be dropped (see the sketch after this list). For truly large graphs, though, the room for optimization is limited.
  4. A genuinely parallel maximal clique algorithm is still wanted.
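
A minimal sketch of the degree-pruning rule, assuming edges is an RDD[(Long, Long)] of deduplicated undirected edges and count is the target clique size (the names mirror the full job below):

// one pruning pass: a vertex of degree < count - 1 cannot be part of a
// clique of count vertices, so drop it together with its incident edges
val keep = edges
  .flatMap { case (s, d) => Seq((s, 1), (d, 1)) }  // emit both endpoints
  .reduceByKey(_ + _)                              // vertex -> degree
  .filter { case (_, deg) => deg >= count - 1 }    // viable vertices only
  .keys
  .collect()
  .toSet
val pruned = edges.filter { case (s, d) => keep(s) && keep(d) }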

Configuration file:

graph_data_path=hdfs://localhost/graph_data
out_path=hdfs://localhost/clique
ck_path=hdfs://localhost/checkpoint
numIter=50		maximum number of pruning passes
count=3			minimum number of vertices per maximal clique
algorithm=2		clique algorithm; 1: hand-rolled implementation, 2: jgrapht
percent=90		stop pruning once a pass keeps at least this percentage of the previous vertex count; if 90% of the vertices survive a pass, pruning is no longer paying off
spark.master=local
spark.app.name=graph
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.yarn.executor.memoryOverhead=20480
spark.yarn.driver.memoryOverhead=20480
spark.driver.extraJavaOptions=-XX:+UseG1GC -XX:+UseCompressedOops -XX:+DisableExplicitGC
spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:+UseCompressedOops -XX:+DisableExplicitGC
spark.driver.maxResultSize=10g
spark.default.parallelism=60

Dependency: jgrapht
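
The serial search relies on jgrapht's BronKerboschCliqueFinder. A plausible sbt dependency; the exact version is an assumption, and it must be a release that still ships the class under org.jgrapht.alg as imported below:

// build.sbt (version is an assumption)
libraryDependencies += "org.jgrapht" % "jgrapht-core" % "0.9.2"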

Sample data:

{"src":"0","dst":"1"}
{"src":"0","dst":"2"}
{"src":"0","dst":"3"}
{"src":"1","dst":"0"}
{"src":"2","dst":"1"}
{"src":"3","dst":"5"}
{"src":"4","dst":"6"}
{"src":"5","dst":"4"}
{"src":"6","dst":"5"}
{"src":"3","dst":"2"}
{"src":"2","dst":"3"}
{"src":"6","dst":"4"}
{"src":"3","dst":"4"}
{"src":"4","dst":"3"}
{"src":"2","dst":"6"}
{"src":"6","dst":"2"}
{"src":"6","dst":"7"}
{"src":"7","dst":"6"}

Sample graph: (figure not reproduced; it depicts the undirected graph built from the edges above)

Output:

0,1,2
0,2,3
3,4,5
4,5,6
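
These are exactly the maximal cliques with at least count = 3 vertices in the sample graph: vertex 7 (degree 1) is removed by pruning, and none of the four cliques can be extended (e.g. {3,4,5} cannot absorb 6 because edge 3-6 is missing).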

Code implementation:

import java.util
import java.util.Properties

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
import org.jgrapht.alg.BronKerboschCliqueFinder
import org.jgrapht.graph.{DefaultEdge, SimpleGraph}

import scala.collection.JavaConverters._
import scala.collection.mutable

object ApplicationTitan {
	def main(args: Array[String]) {
	    val prop = new Properties()
	    prop.load(getClass.getResourceAsStream("/config.properties"))
	
	    val graph_data_path = prop.getProperty("graph_data_path")
	    val out_path = prop.getProperty("out_path")
	    val ck_path = prop.getProperty("ck_path")
	    val count = Integer.parseInt(prop.getProperty("count"))
	    val numIter = Integer.parseInt(prop.getProperty("numIter"))
	    val algorithm = Integer.parseInt(prop.getProperty("algorithm"))
	    val percent = Integer.parseInt(prop.getProperty("percent"))
	    val conf = new SparkConf()
	    try {
	      // best-effort removal of the previous run's output directory
	      Runtime.getRuntime.exec("hdfs dfs -rm -r " + out_path)
	      // on Windows: Runtime.getRuntime.exec("cmd.exe /C rd /s /q " + out_path)
	    } catch {
	      case ex: Exception =>
	        ex.printStackTrace(System.out)
	    }
	
	    // copy every spark.* property from config.properties into the SparkConf
	    prop.stringPropertyNames().asScala.foreach(s => {
	      if (s.startsWith("spark")) {
	        conf.set(s, prop.getProperty(s))
	      }
	    })
	    conf.registerKryoClasses(Array(getClass))
	    val sc = new SparkContext(conf)
	    sc.setLogLevel("ERROR")
	    sc.setCheckpointDir(ck_path)
	    val sqlc = new SQLContext(sc)
	    try {
	      val e_df = sqlc.read
	//                .json(graph_data_path)
	        .parquet(graph_data_path)

	      // normalize each edge so the smaller vertex id comes first, then
	      // deduplicate: the directed input becomes an undirected edge list
	      var e_rdd = e_df
	        .mapPartitions(it => {
	          it.map({
	            // the Row pattern assumes column order (dst, src), as produced
	            // when the JSON sample is loaded (columns sorted alphabetically)
	            case Row(dst: String, src: String) =>
	              val src_long = src.toLong
	              val dst_long = dst.toLong
	              if (src_long < dst_long) (src_long, dst_long) else (dst_long, src_long)
	          })
	        }).distinct()
	      e_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)
	
	      var bc: Broadcast[Set[Long]] = null
	      var iter = 0
	      var bc_size = 0
	      // pruning: repeatedly drop vertices of degree < count - 1, which can
	      // never appear in a clique of count vertices
	      while (iter <= numIter) {
	        val temp = e_rdd
	          .flatMap(x => List((x._1, 1), (x._2, 1)))  // emit both endpoints
	          .reduceByKey((x, y) => x + y)              // vertex -> degree
	          .filter(x => x._2 >= count - 1)            // keep viable vertices
	          .mapPartitions(it => it.map(x => x._1))
	        val bc_value = temp.collect().toSet
	        bc = sc.broadcast(bc_value)
	        e_rdd = e_rdd.filter(x => bc.value.contains(x._1) && bc.value.contains(x._2))
	        e_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)
	        iter += 1
	        // stop early once a pass keeps >= percent% of the previous vertex
	        // count: pruning is no longer effective
	        if (bc_size != 0 && bc_value.size >= bc_size * percent / 100) {
	          println("total iter : " + iter)
	          iter = Int.MaxValue
	        }
	        bc_size = bc_value.size
	      }
	
	      // build the GraphX graph from the pruned edge list
	      val edge: RDD[Edge[Long]] = e_rdd.mapPartitions(it => it.map(x => Edge(x._1, x._2)))
	      val graph = Graph.fromEdges(edge, 0, StorageLevel.MEMORY_AND_DISK_SER, StorageLevel.MEMORY_AND_DISK_SER)

	      // connected components: each vertex is labeled with its component id
	      val cc = graph.connectedComponents().vertices
	      cc.persist(StorageLevel.MEMORY_AND_DISK_SER)

	      // collect the edges of each component into one list; the random digit
	      // prepended to the component id salts the key, so a large component is
	      // first aggregated as up to 10 partial lists (mitigating skew), then
	      // the salt is stripped and the partial lists are merged
	      cc.join(e_rdd)
	        .mapPartitions(it => it.map(x => ((math.random * 10).toInt.toString.concat(x._2._1.toString), (x._1, x._2._2))))
	        .aggregateByKey(List[(Long, Long)]())((list, v) => list :+ v, (list1, list2) => list1 ::: list2)
	        .mapPartitions(it => it.map(x => (x._1.substring(1), x._2)))
	        .aggregateByKey(List[(Long, Long)]())((list1, list2) => list1 ::: list2, (list3, list4) => list3 ::: list4)
	        .filter(x => x._2.size >= count - 1)  // a count-clique needs at least count - 1 edges (cheap lower bound)
	        .flatMap(x => {
	          if (algorithm == 1)
	            find(x, count)   // hand-rolled Bron–Kerbosch
	          else
	            find2(x, count)  // jgrapht's BronKerboschCliqueFinder
	        })
	        // render each clique as a comma-separated vertex list
	        .mapPartitions(it => it.map(set => set.asScala.mkString(",")))
	//        .coalesce(1)
	        .saveAsTextFile(out_path)
	    } catch {
	      case ex: Exception =>
	        ex.printStackTrace(System.out)
	    }
	    sc.stop()
	  }
	// hand-rolled maximal clique algorithm (CliqueFinder, listed below)
	def find(x: (String, List[(Long, Long)]), count: Int): mutable.Set[util.Set[String]] = {
	  println(x._1 + "|s|" + x._2.size)
	  println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())  // start timestamp
	  // build the adjacency map in both directions
	  val neighbors = new util.HashMap[String, util.Set[String]]
	  x._2.foreach(r => {
	    val v1 = r._1.toString
	    val v2 = r._2.toString
	    if (neighbors.containsKey(v1)) {
	      neighbors.get(v1).add(v2)
	    } else {
	      val temp = new util.HashSet[String]()
	      temp.add(v2)
	      neighbors.put(v1, temp)
	    }
	    if (neighbors.containsKey(v2)) {
	      neighbors.get(v2).add(v1)
	    } else {
	      val temp = new util.HashSet[String]()
	      temp.add(v1)
	      neighbors.put(v2, temp)
	    }
	  })
	  // construct the finder only after the adjacency map is fully populated
	  val finder = new CliqueFinder(neighbors, count)
	  println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())  // end timestamp
	  finder.findMaxCliques().asScala
	}
	// maximal clique algorithm from jgrapht
	def find2(x: (String, List[(Long, Long)]), count: Int): Set[util.Set[String]] = {
	  println(x._1 + "|s|" + x._2.size)
	  println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())  // start timestamp
	  val to_clique = new SimpleGraph[String, DefaultEdge](classOf[DefaultEdge])
	  x._2.foreach(r => {
	    val v1 = r._1.toString
	    val v2 = r._2.toString
	    to_clique.addVertex(v1)
	    to_clique.addVertex(v2)
	    to_clique.addEdge(v1, v2)
	  })
	  val finder = new BronKerboschCliqueFinder(to_clique)
	  val list = finder.getAllMaximalCliques.asScala
	  // keep only cliques with at least count vertices
	  var result = Set[util.Set[String]]()
	  list.foreach(c => {
	    if (c.size() >= count)
	      result = result + c
	  })
	  println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())  // end timestamp
	  result
	}
}
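
Note: the imports above target the older jgrapht 0.9.x API. In later jgrapht releases the Bron–Kerbosch finders moved to the org.jgrapht.alg.clique package and the result API changed, so the import and the getAllMaximalCliques call would need adjusting there.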

The hand-rolled maximal clique algorithm (CliqueFinder.java):

import java.util.*;

/**
 * @author mopspecial@gmail.com
 * @date 2017/7/31
 */
public class CliqueFinder {
    private Map<String, Set<String>> neighbors;
    private Set<String> nodes;
    private Set<Set<String>> maxCliques = new HashSet<>();
    private Integer minSize;

    public CliqueFinder(Map<String, Set<String>> neighbors, Integer minSize) {
        this.neighbors = neighbors;
        this.nodes = neighbors.keySet();
        this.minSize = minSize;
    }

    // outer level of Bron–Kerbosch: visit candidates in degeneracy order
    private void bk3(Set<String> clique, List<String> candidates, List<String> excluded) {
        if (candidates.isEmpty() && excluded.isEmpty()) {
            if (!clique.isEmpty() && clique.size() >= minSize) {
                maxCliques.add(clique);
            }
            return;
        }

        for (String s : degeneracy_order(candidates)) {
            List<String> new_candidates = new ArrayList<>(candidates);
            new_candidates.retainAll(neighbors.get(s));

            List<String> new_excluded = new ArrayList<>(excluded);
            new_excluded.retainAll(neighbors.get(s));
            Set<String> nextClique = new HashSet<>(clique);
            nextClique.add(s);
            bk2(nextClique, new_candidates, new_excluded);
            candidates.remove(s);
            excluded.add(s);
        }
    }

    // recursive Bron–Kerbosch with pivoting: only candidates outside the
    // pivot's neighborhood need to be expanded
    private void bk2(Set<String> clique, List<String> candidates, List<String> excluded) {
        if (candidates.isEmpty() && excluded.isEmpty()) {
            if (!clique.isEmpty() && clique.size() >= minSize) {
                maxCliques.add(clique);
            }
            return;
        }
        String pivot = pick_random(candidates);
        if (pivot == null) {
            pivot = pick_random(excluded);
        }
        List<String> tempc = new ArrayList<>(candidates);
        tempc.removeAll(neighbors.get(pivot));

        for (String s : tempc) {
            List<String> new_candidates = new ArrayList<>(candidates);
            new_candidates.retainAll(neighbors.get(s));

            List<String> new_excluded = new ArrayList<>(excluded);
            new_excluded.retainAll(neighbors.get(s));
            Set<String> nextClique = new HashSet<>(clique);
            nextClique.add(s);
            bk2(nextClique, new_candidates, new_excluded);
            candidates.remove(s);
            excluded.add(s);
        }
    }

    // repeatedly remove a minimum-degree vertex; the removal order is a
    // degeneracy ordering of the given vertices
    private List<String> degeneracy_order(List<String> innerNodes) {
        List<String> result = new ArrayList<>();
        Map<String, Integer> deg = new HashMap<>();
        for (String node : innerNodes) {
            deg.put(node, neighbors.get(node).size());
        }
        while (!deg.isEmpty()) {
            Integer min = Collections.min(deg.values());
            String minKey = null;
            for (String key : deg.keySet()) {
                if (deg.get(key).equals(min)) {
                    minKey = key;
                    break;
                }
            }
            result.add(minKey);
            deg.remove(minKey);
            for (String k : neighbors.get(minKey)) {
                if (deg.containsKey(k)) {
                    deg.put(k, deg.get(k) - 1);
                }
            }

        }
        return result;
    }


    // despite the name, deterministically returns the first element (used as the pivot)
    private String pick_random(List<String> random) {
        if (random != null && !random.isEmpty()) {
            return random.get(0);
        } else {
            return null;
        }
    }

    public Set<Set<String>> findMaxCliques() {
        this.bk3(new HashSet<>(), new ArrayList<>(nodes), new ArrayList<>());
        return maxCliques;
    }

    public static void main(String[] args) {
        Map<String, Set<String>> neighbors = new HashMap<>();
        neighbors.put("0", new HashSet<>(Arrays.asList("1", "2", "3")));
        neighbors.put("1", new HashSet<>(Arrays.asList("0", "2")));
        neighbors.put("2", new HashSet<>(Arrays.asList("0", "1", "3", "6")));
        neighbors.put("3", new HashSet<>(Arrays.asList("0", "2", "4", "5")));
        neighbors.put("4", new HashSet<>(Arrays.asList("3", "5", "6")));
        neighbors.put("5", new HashSet<>(Arrays.asList("3", "4", "6")));
        neighbors.put("6", new HashSet<>(Arrays.asList("2", "4", "5")));
        neighbors.put("7", new HashSet<>(Arrays.asList("6")));
        CliqueFinder finder = new CliqueFinder(neighbors, 3);
        finder.bk3(new HashSet<>(), new ArrayList<>(neighbors.keySet()), new ArrayList<>());
        System.out.println(finder.maxCliques);
    }
}

Reposted from: https://my.oschina.net/ktlb/blog/1502854
