// --- Day-01 input paths for the three raw log sources. ---
val cmccLogPath = "user_profile/data/cmcclog/day01"
val dspLogPath = "user_profile/data/dsplog/day01"
val eventLogPath = "user_profile/data/eventlog/day01"

// One RDD per source; each element is the array of identifiers observed
// together in a single log record.
val cmccIdsRdd = IdsExtractor.extractCmccLogIds(spark, cmccLogPath)
val dspIdsRdd = IdsExtractor.extractDspLogIds(spark, dspLogPath)
val eventIdsRdd = IdsExtractor.extractEventLogIds(spark, eventLogPath)
val idArrays = cmccIdsRdd.union(dspIdsRdd).union(eventIdsRdd)

// Vertices: every identifier becomes a vertex keyed by its hashCode widened
// to Long. NOTE(review): Int hashCode collisions would merge unrelated ids
// into one vertex — confirm this is acceptable for the id space in use.
val vertices = idArrays.flatMap { idArr =>
  idArr.map(id => (id.hashCode.toLong, id))
}

// Edges: fully connect all ids that co-occur in one record. Sorting first
// makes each unordered pair yield exactly one canonical Edge, so identical
// pairs from different records compare equal in the count below.
val edges = idArrays.flatMap { idArr =>
  val sortedIds = idArr.sorted
  sortedIds.indices.flatMap { i =>
    ((i + 1) until sortedIds.length).map { j =>
      Edge(sortedIds(i).hashCode.toLong, sortedIds(j).hashCode.toLong, "")
    }
  }
}

// Keep only pairs seen more than twice, filtering out noisy / accidental
// co-occurrences before building the identity graph.
val edgesres = edges
  .map(edge => (edge, 1))
  .reduceByKey(_ + _)
  .filter { case (_, occurrences) => occurrences > 2 }
  .map { case (edge, _) => edge }
// Previous day's (id -> gid) mapping produced by the last idmp run.
val idmpTdf = spark.read.parquet("user_profile/demodata/idmp/output/day01")

// Fold yesterday's mapping into today's graph: every historical id and gid
// becomes a vertex, and every (id, gid) pair becomes an edge, so ids that
// were grouped yesterday remain connected today.
// Fix: `caseRow(...)` was a mangled `case Row(...)` pattern and did not compile.
val tvertices = idmpTdf.rdd.flatMap { case Row(id: Long, gid: Long) => Array((id, ""), (gid, "")) }
val tedges = idmpTdf.rdd.map { case Row(id: Long, gid: Long) => Edge(id, gid, "") }

// Build the combined (today + yesterday) graph and collapse it into
// connected components; each vertex is tagged with the minimum VertexId
// of its component (the candidate group id).
val graph = Graph(vertices.union(tvertices), edgesres.union(tedges))
val childrenGraph = graph.connectedComponents()
val t1Vertices: VertexRDD[VertexId] = childrenGraph.vertices

// Broadcast yesterday's id -> gid map so today's results can be aligned
// with yesterday's group ids (keeps gids stable across days).
val idmp = idmpTdf.rdd.map { case Row(id: Long, gid: Long) => (id, gid) }.collectAsMap()
val idmpbc = spark.sparkContext.broadcast(idmp)
// Re-key today's components with yesterday's gids where possible: if a
// component contains any id already known to yesterday's mapping, reuse that
// id's historical gid for the whole component; otherwise keep the
// component's own minimum VertexId as the gid.
// Fixes: dropped the dead `intersectSet != null` check (Set.intersect never
// returns null), replaced unsafe `.get(...).get` with Option combinators,
// removed the mutable `var`, and renamed the inner lambda parameter that
// shadowed the outer `t`.
val result = t1Vertices.groupBy { case (_, componentId) => componentId }.flatMap { case (componentId, members) =>
  val todaysIds = members.map { case (id, _) => id }.toSet
  val previousMap = idmpbc.value
  // First historical gid found among this component's ids, if any.
  // (Which overlapping id "wins" is arbitrary, matching the original
  // `intersectSet.head` behavior.)
  val gid = todaysIds.flatMap(previousMap.get).headOption.getOrElse(componentId)
  todaysIds.map(id => (id, gid))
}