数据为人进出基站信息与基站信息。
少部分人进出基站信息数据:为手机号,时间,基站ID,进入基站标志1,出去标志0
18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1
18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0
18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0
少部分基站信息数据:基站ID,经度,纬度(注意:示例行末尾还带有一个额外字段)
9F36407EAD0629FC166F14DDE7970F68,116.304864,40.050645,6
CC0710CC94ECC657A8561DE549D940E0,116.303955,40.041935,6
实际意义:
根据进入与离开同一基站(相同基站ID)的时间差,可以算出这个人在该基站的停留时间;停留时间最长的基站很可能就是这个人的工作地点或者家。然后再结合基站的经纬度信息做后期判断,就能判断出这个人的大体位置,从而将附近的广告推送到这个人的手机上。
第一个代码在设计rdd元组(用字符串拼接作key)的时候不太好,后面与lac表合成元组时效果不好。
import org.apache.spark.{SparkConf, SparkContext}

/**
 * For every mobile number, finds the two base stations where the user
 * accumulated the longest dwell time.
 *
 * Input lines have the shape: mobile,timestamp,stationId,flag
 * where flag "1" means entering the station and "0" means leaving.
 * The dwell time per (mobile, station) is obtained by negating the
 * timestamp on entry, so that summing all signed timestamps for a key
 * yields (sum of exit times) - (sum of entry times).
 */
object CountTime {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Demo").setMaster("local[2]") // run locally with 2 threads
    val sc = new SparkContext(conf)

    // Parse each record into ("mobile_stationId", signedTime).
    val mbt = sc.textFile("/home/hadoop/Data/*.log", 2).map(line => {
      val fields = line.split(",")
      val flag = fields(3)  // "1" = entering the station, "0" = leaving
      val time = fields(1)  // timestamp such as 20160327082400
      // Negate entry times so that entry/exit pairs sum to the dwell time.
      val timelong = if (flag == "1") -time.toLong else time.toLong
      (fields(0) + "_" + fields(2), timelong)
    })

    // reduceByKey combines partial sums map-side before the shuffle;
    // the original groupBy + foldLeft shipped every raw record across
    // the network first (the inefficiency the author's comment noted).
    val summed = mbt.reduceByKey(_ + _)

    // Split the composite key once into (mobile, stationId, totalTime).
    val rdd2 = summed.map(t => {
      val parts = t._1.split("_")
      (parts(0), parts(1), t._2)
    })

    // Group by mobile, then keep the two stations with the largest dwell time.
    val rdd3 = rdd2.groupBy(_._1)
    val rdd4 = rdd3.mapValues(it => {
      it.toList.sortBy(_._3).reverse.take(2) // sort descending by time, take top 2
    })

    println(rdd4.collect.toBuffer) // collect returns an Array; toBuffer makes it printable
    sc.stop
  }
}
第二种写法:与lac(基站经纬度)数据做连接。为了方便解释,这里把各个rdd步骤分开书写。
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Same analysis as CountTime, but keys records by the tuple
 * (mobile, stationId) instead of a concatenated string, and joins with
 * the station coordinate table so the final result also carries the
 * longitude/latitude of each top station.
 *
 * Fixes over the original listing:
 *  - the comment after reduceByKey started with a single '/' (compile error);
 *  - a block comment was closed with bare text left after '*'+'/' (compile error);
 *  - the unused rdd3 (groupBy on rdd1) was removed.
 */
object CountTime02 {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Demo").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Parse each record into ((mobile, stationId), signedTime).
    // Entry times (flag "1") are negated so summing per key yields dwell time.
    val rdd0 = sc.textFile("/home/hadoop/Data/*.log", 2).map(line => {
      val fields = line.split(",")
      val flag = fields(3)  // "1" = entering the station, "0" = leaving
      val time = fields(1)  // timestamp such as 20160327082400
      val timelong = if (flag == "1") -time.toLong else time.toLong
      ((fields(0), fields(2)), timelong)
    })

    // Total dwell time per (mobile, stationId).
    val rdd1 = rdd0.reduceByKey(_ + _)

    // Re-key by stationId so we can join against the coordinate table.
    val rdd2 = rdd1.map(x => {
      val mobile = x._1._1
      val lac = x._1._2
      val time = x._2
      (lac, (mobile, time))
    })

    // Station coordinate table: (stationId, (longitude, latitude)).
    val rdd4 = sc.textFile("/home/hadoop/Data1/*.txt").map(line => {
      val f = line.split(",")
      (f(0), (f(1), f(2)))
    })

    // Join on stationId and flatten to (mobile, stationId, time, lon, lat).
    val rdd5 = rdd2.join(rdd4).map(t => {
      val lac = t._1          // station ID
      val mobile = t._2._1._1 // mobile number
      val time = t._2._1._2   // total dwell time
      val x = t._2._2._1      // longitude
      val y = t._2._2._2      // latitude
      (mobile, lac, time, x, y)
    })

    // Group by mobile and keep the two stations with the largest dwell time.
    val rdd6 = rdd5.groupBy(_._1)
    val rdd7 = rdd6.mapValues(it => {
      it.toList.sortBy(_._3).reverse.take(2) // sort descending by time, take top 2
    })

    println(rdd7.collect.toBuffer)
    rdd7.saveAsTextFile("/home/hadoop/output1") // persist result to local path
    sc.stop
  }
}