Approach 1: Plain RDD operations
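All three versions below read two comma-separated text files. Judging from the field indices in the code, user.txt holds (phone, timestamp, station id, event flag) records, where flag 1 marks a connect and 0 a disconnect, and info.txt holds (station id, x, y) coordinate records; the two rows below are hypothetical reconstructions of that assumed shape, not the original data.

/* user.txt (hypothetical row): 18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1
   info.txt (hypothetical row): 16030401EAFB68F1E3CDF819735E1C66,116.296302,40.032296 */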
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// (1) The plain-RDD approach
object user1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("user1").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Read the user connection records
    val file = sc.textFile("D:\\数据\\IPSearch\\user.txt")
    val phon_time: RDD[((String, String), Long)] = file.map(line => {
      val field = line.split(",")
      val phone = field(0)
      val time = field(1).toLong
      val id = field(2)
      val even = field(3).toInt
      // A connect event (even == 1) contributes a negative timestamp and a
      // disconnect a positive one, so the per-key sum is the dwell time
      val stay = if (even == 1) -time else time
      ((phone, id), stay)
    })
    // Sum per (phone, station) key to get the total dwell time
    val sum: RDD[((String, String), Long)] = phon_time.reduceByKey(_ + _)
    // Re-key by station id for the join with the station table
    val sum1 = sum.map(line => {
      val id = line._1._2
      val time = line._2
      val phone = line._1._1
      (id, (phone, time))
    })
    // Read the base-station information
    val file1 = sc.textFile("D:\\数据\\IPSearch\\info.txt")
    val info = file1.map(line => {
      val field = line.split(",")
      val id = field(0)
      val x = field(1)
      val y = field(2)
      (id, (x, y))
    })
    // Join the two data sets on the station id
    val joined: RDD[(String, ((String, Long), (String, String)))] = sum1.join(info)
    val joins: RDD[(String, Long, (String, String))] = joined.map(line => {
      val phone = line._2._1._1
      val time = line._2._1._2
      val xy = line._2._2
      (phone, time, xy)
    })
    // Per phone, keep the two stations with the longest dwell time
    val sort_time: RDD[(String, List[(String, Long, (String, String))])] =
      joins.groupBy(_._1).mapValues(_.toList.sortBy(_._2).reverse.take(2))
    val res = sort_time.map(x => x._2)
    res.foreach(println)
    sc.stop()
  }
}
/* Sample output:
List((18688888888,40320654342600,(116.303955,40.041935)), (18688888888,40320654252400,(116.296302,40.032296)))
List((18101056806,80641308568000,(116.304864,40.050645)), (18101056806,80641308525100,(116.303955,40.041935)))
*/
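The sign trick behind phon_time is worth spelling out: if a phone connects to a station at t1 (recorded as -t1) and disconnects at t2 (recorded as +t2), the per-key sum is t2 - t1, the dwell time. A tiny local check of the same encoding, with made-up numbers:

// Connect at t1 = 100 (stored as -100), disconnect at t2 = 250 (stored as +250)
val events = Seq((("18688888888", "station-A"), -100L), (("18688888888", "station-A"), 250L))
// The same fold that reduceByKey(_ + _) applies per key:
val dwell = events.map(_._2).sum  // 250 - 100 = 150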
Approach 2: Using DataFrames and Spark SQL
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object user2 {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("user2").master("local[*]").getOrCreate()
    val user_file = sparkSession.read.textFile("D:\\数据\\IPSearch\\user.txt")
    import sparkSession.implicits._
    val user = user_file.map(line => {
      val field = line.split(",")
      val phone = field(0)
      val time = field(1).toLong
      val id = field(2)
      val even = field(3).toInt
      // Same sign trick as in approach 1: SUM(time) per group is the dwell time
      val stay = if (even == 1) -time else time
      (phone, id, stay)
    }).toDF("phone", "uid", "time")
    // Register the user data as a temporary view
    user.createTempView("u_user")
    val info_file = sparkSession.read.textFile("D:\\数据\\IPSearch\\info.txt")
    val info = info_file.map(line => {
      val field = line.split(",")
      val id = field(0)
      val x = field(1)
      val y = field(2)
      (id, x, y)
    }).toDF("id", "x", "y")
    // Register the station data as a temporary view
    info.createTempView("u_info")
    // Join, aggregate, and sort by total dwell time in SQL
    val rse: RDD[Row] = sparkSession.sql("select phone ,x,y ,sum(time) sum1 from u_user join u_info " +
      "on uid=id group by phone,x,y order by sum1 desc").rdd
    val a = rse.map(x => {
      val phone = x.getAs[String](0)
      val x1 = x.getAs[String](1)
      val y1 = x.getAs[String](2)
      (phone, (x1, y1))
    }).groupBy(_._1).mapValues(_.toList).map(x => x._2.take(2))
    a.foreach(println)
    sparkSession.stop()
  }
}
/*
Fields inside an RDD[Row] are extracted with getAs[String](index).
Sample output:
List((18688888888,(116.303955,40.041935)), (18688888888,(116.296302,40.032296)))
List((18101056806,(116.304864,40.050645)), (18101056806,(116.303955,40.041935)))
*/
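As the note above says, Row fields are read positionally with getAs. A minimal sketch; the Row here is constructed by hand purely for illustration:

import org.apache.spark.sql.Row

val row = Row("18688888888", "116.303955", "40.041935")
val phone = row.getAs[String](0)  // read by position
// When the Row carries a schema (as rows from a DataFrame do),
// fields can also be read by name: row.getAs[String]("phone")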
Approach 3: Using a UDF with a broadcast variable
import org.apache.spark.sql.SparkSession

// Broadcast the small table
object user4 {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("user4").master("local[*]").getOrCreate()
    val user_file = sparkSession.read.textFile("D:\\数据\\IPSearch\\user.txt")
    import sparkSession.implicits._
    val user = user_file.map(line => {
      val field = line.split(",")
      val phone = field(0)
      val time = field(1).toLong
      val id = field(2)
      val even = field(3).toInt
      // Same sign trick as above: summing per key gives the dwell time
      val stay = if (even == 1) -time else time
      ((phone, id), stay)
    }).rdd
    val user1 = user.reduceByKey(_ + _).map(x => {
      (x._1._1, x._1._2, x._2)
    }).groupBy(_._1)
      // Per phone, keep the two stations with the longest dwell time,
      // then flatten the per-phone lists with flatMap
      .mapValues(_.toList.sortBy(-_._3).take(2)).flatMap(_._2)
      .map(x => (x._2, x._1)).toDF("id", "phone")
    /* Debugging aid: inspect the grouped top-2 lists before flattening.
    val user2: RDD[(String, List[(String, String, Long)])] = user.reduceByKey(_+_).map(x=>{
      (x._1._1,x._1._2,x._2)
    }).groupBy(_._1).mapValues(_.toList.sortBy(-_._3).take(2))
    println(user2.collect().toBuffer) */
    // ((181,List((181,CC07,8), (181,1603,4))), (186,List((186,1603,4), (186,CC07,4))))
    // Register the flattened top-2 records as a temporary view
    user1.createTempView("u_info")
    // Read the base-station information
    val info_file = sparkSession.read.textFile("D:\\数据\\IPSearch\\info.txt")
    val info = info_file.map(line => {
      val field = line.split(",")
      val id = field(0)
      val x = field(1)
      val y = field(2)
      (id, x, y)
    }).rdd
    // Collect the small station table to the driver...
    val info_collect = info.collect()
    // ...and broadcast it to the executors
    val broadcasts = sparkSession.sparkContext.broadcast(info_collect)
    // The UDF: look the station id up in the broadcast array and attach its coordinates
    val fun = (id: String, phone: String) => {
      // Read the broadcast dictionary
      val idinfo: Array[(String, String, String)] = broadcasts.value
      val index = linearSearch(idinfo, id)
      // println(index)
      // Placeholder coordinates in case the id is not found
      var phone_x = "a"
      var phone_y = "b"
      if (index != -1) {
        phone_x = idinfo(index)._2
        phone_y = idinfo(index)._3
        // println(phone + "====" + phone_x + "====" + phone_y)
      }
      (phone, phone_x, phone_y)
    }
    /* Debug output from the commented println above:
    18101056806====116.296302====40.032296
    18101056806====116.303955====40.041935
    18688888888====116.296302====40.032296
    18688888888====116.303955====40.041935
    */
    // Register the function so it can be called from SQL
    sparkSession.udf.register("fun", fun)
    val res = sparkSession.sql("select fun(id,phone) from u_info")
    res.show()
    sparkSession.stop()
  }
  // Custom lookup method: despite operating on an array, this is a simple linear scan
  def linearSearch(arr: Array[(String, String, String)], id: String): Int = {
    for (i <- arr.indices) {
      if (arr(i)._1.equals(id)) {
        return i
      }
    }
    -1
  }
}
/*
Note that creating the broadcast variable still requires the SparkContext.
Sample output:
+--------------------+
| UDF:fun(id, phone)|
+--------------------+
|[18101056806,116....|
|[18101056806,116....|
|[18688888888,116....|
|[18688888888,116....|
+--------------------+
*/
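Because the UDF returns a Scala tuple, the result is a single struct column, which is why each row prints as [18101056806,116....]. If separate columns are preferred, the generated struct fields (_1, _2, _3) can be selected out; a small sketch, assuming the same fun and u_info view as above:

    val withCols = sparkSession.sql("select fun(id, phone) as r from u_info")
    // Pull the struct fields out into ordinary named columns
    withCols.select("r._1", "r._2", "r._3").toDF("phone", "x", "y").show()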
One last note: when broadcasting for a join, it should always be the small table that gets broadcast.
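To make that concrete, the same broadcast pattern also works as a pure map-side join with no SQL or UDF at all: collect the small station table into a Map on the driver, broadcast it, and look each record up inside a map over the large RDD. A minimal sketch under the same assumed file layout (BroadcastJoinSketch and its variable names are illustrative, not from the original code):

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastJoinSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("broadcast-join").setMaster("local[*]"))
    // Small side: station id -> (x, y), collected to the driver as a Map
    val stations = sc.textFile("D:\\数据\\IPSearch\\info.txt")
      .map(_.split(","))
      .map(f => (f(0), (f(1), f(2))))
      .collectAsMap()
    val bc = sc.broadcast(stations)
    // Large side: look each record's station up in the broadcast map,
    // which avoids shuffling either data set for the join
    val joined = sc.textFile("D:\\数据\\IPSearch\\user.txt")
      .map(_.split(","))
      .map(f => (f(0), bc.value.getOrElse(f(2), ("?", "?"))))
    joined.take(5).foreach(println)
    sc.stop()
  }
}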