Spark下根据基站信息,计算用户停留时间最长的两个地方--记录

数据为人进出基站信息与基站信息。
少部分人进出基站信息数据:为手机号,时间,基站ID,进入基站标志1,出去标志0
18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1
18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0
18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0
少部分基站信息数据:基站ID,经度,纬度
9F36407EAD0629FC166F14DDE7970F68,116.304864,40.050645,6
CC0710CC94ECC657A8561DE549D940E0,116.303955,40.041935,6

实际意义:
根据进入、离开相同基站ID的时间差,可以算出这个人在该基站的停留时间;停留时间最长的地方很可能是这个人的工作地点或家。再结合基站的经纬度信息,就能判断出这个人的大体位置,从而可以把附近的广告推送到这个人的手机号。
第一个代码在rdd设计元组的时候不太好,后面与lac表合成元组效果不好

	import org.apache.spark.{SparkConf, SparkContext}

	/**
	 * For each mobile number, finds the two base stations where the user stayed
	 * the longest. Input log lines: mobile,timestamp,stationId,flag where
	 * flag 1 = entered the station, 0 = left the station.
	 *
	 * Fixes vs. the original: keys by a (mobile, stationId) tuple instead of a
	 * "mobile_station" string that later had to be split apart, sums stay times
	 * with reduceByKey (map-side combine) instead of groupBy + foldLeft (full
	 * shuffle of every record), and sorts descending directly instead of
	 * sort-then-reverse.
	 */
	object CountTime {

	  def main(args: Array[String]): Unit = {
	    val conf = new SparkConf().setAppName("Demo").setMaster("local[2]") // local run, 2 threads

	    val sc = new SparkContext(conf)

	    // Parse each log line into ((mobile, stationId), signedTime).
	    // Entry timestamps are negated so that summing enter/leave pairs per key
	    // yields the total stay duration (leave - enter).
	    val stays = sc.textFile("/home/hadoop/Data/*.log", 2).map { line =>
	      val fields = line.split(",")
	      val time = fields(1).toLong
	      val signedTime = if (fields(3) == "1") -time else time
	      ((fields(0), fields(2)), signedTime)
	    }

	    // reduceByKey combines partial sums on each partition before shuffling,
	    // unlike the original groupBy + foldLeft which shipped every record.
	    val stayPerStation = stays.reduceByKey(_ + _)

	    // Reshape to (mobile, stationId, totalStay) and group by mobile —
	    // no string splitting needed because the key was already a tuple.
	    val byMobile = stayPerStation
	      .map { case ((mobile, lac), time) => (mobile, lac, time) }
	      .groupBy(_._1)

	    // Keep the two stations with the longest total stay per mobile.
	    val topTwo = byMobile.mapValues(_.toList.sortBy(-_._3).take(2))

	    println(topTwo.collect.toBuffer)
	    sc.stop() // side-effecting call: keep the parentheses
	  }
	}

第二种:与lac(基站)数据做连接(join)
方便解释rdd分开了

   import org.apache.spark.{SparkConf, SparkContext}

   /**
    * Variant of CountTime that joins the per-(mobile, station) stay times with
    * the base-station coordinate table, producing for each mobile the two
    * stations with the longest stay together with their longitude/latitude.
    *
    * Fixes vs. the original: a single-slash "/把rdd0..." pseudo-comment and a
    * block comment closed by "*/" followed by bare prose were both compile
    * errors; the unused rdd3 groupBy was dead code; descending sort replaces
    * sort-then-reverse.
    */
   object CountTime02 {
     def main(args: Array[String]): Unit = {
       val conf = new SparkConf().setAppName("Demo").setMaster("local[2]")
       val sc = new SparkContext(conf)

       // Entry/exit log: mobile,timestamp,stationId,flag (1 = enter, 0 = leave).
       // Enter timestamps are negated so summing per key yields the stay length.
       val rdd0 = sc.textFile("/home/hadoop/Data/*.log", 2).map { line =>
         val fields = line.split(",")
         val time = fields(1).toLong
         val signedTime = if (fields(3) == "1") -time else time
         ((fields(0), fields(2)), signedTime)
       }

       // Total stay per (mobile, stationId). (Original line here began with a
       // lone "/" instead of "//" — a compile error.)
       val rdd1 = rdd0.reduceByKey(_ + _)

       // Re-key by station id so we can join against the coordinate table.
       val rdd2 = rdd1.map { case ((mobile, lac), time) => (lac, (mobile, time)) }

       // Station table lines: stationId,longitude,latitude[,type].
       val stations = sc.textFile("/home/hadoop/Data1/*.txt").map { line =>
         val f = line.split(",")
         (f(0), (f(1), f(2)))
       }

       // Join stay times with coordinates and flatten to
       // (mobile, stationId, stayTime, longitude, latitude).
       val joined = rdd2.join(stations).map { case (lac, ((mobile, time), (x, y))) =>
         (mobile, lac, time, x, y)
       }

       // Group per mobile and keep the two stations with the longest stay.
       val topTwo = joined.groupBy(_._1).mapValues(_.toList.sortBy(-_._3).take(2))

       println(topTwo.collect.toBuffer)
       // NOTE(review): saveAsTextFile throws if the output directory already
       // exists — delete /home/hadoop/output1 before re-running.
       topTwo.saveAsTextFile("/home/hadoop/output1")
       sc.stop()
     }
   }
  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值