The code is as follows:
package com.yh.musicproject.eds.machine
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import com.yh.musicproject.common.ConfigUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scalaj.http.{Http, HttpResponse}
import scala.collection.mutable.ListBuffer
object GenerateTwMacLocD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please pass in a date argument")
System.exit(1)
}
val inputDate: String = args(0)
// ConfigUtils.LOCAL_RUN comes from the config file: when LOCAL_RUN=true the job runs locally, when false it runs on the yarn cluster
val master = if (ConfigUtils.LOCAL_RUN) "local[*]" else "yarn"
val spark: SparkSession = SparkSession.builder()
.master(master)
.appName(this.getClass.getName)
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions", 2)
.enableHiveSupport()
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
spark.sql("use songdb")
// A single machine may have many users reporting coordinates on it, and the reported coordinates may differ.
// Group by machine id and coordinates and keep the most frequently reported pair, which becomes the lookup key for the geocoding query below.
val df: DataFrame = spark.sql(
s"""
|select
| mid, x, y, cnt
|from (
| select
| mid, x, y, cnt,
| row_number() over (partition by mid order by cnt desc) pm
| from (
| select mid, lat x, lng y, count(distinct uid) cnt
| from TO_YCAK_USR_LOC_D
| where data_dt = '${inputDate}' and lat != '' and lng != ''
| group by mid, lat, lng
| ) t1
| ) t2
|where pm = 1
|""".stripMargin)
// df.show()
// The query result above is held in a DataFrame. Two ways to call the geocoding API for each row:
// 1. a user-defined function
// 2. convert the DataFrame to an RDD (used below); mapPartitions lets each partition batch its rows into a few HTTP requests instead of issuing one call per row
val rdd1: RDD[Row] = df.rdd.mapPartitions(iter => {
val list3 = new ListBuffer[Row] // holds the parsed result rows
// Materialize the iterator into a List [an iterator can only be traversed once, so it must be collected before being reused]
val list: List[Row] = iter.toList
var times = 0 // number of HTTP requests this partition will send
if (list.size % 20 == 0) {
times = list.size / 20
} else {
times = list.size / 20 + 1
}
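// AMap's batch reverse-geocoding endpoint accepts at most 20 coordinate pairs per request (per the AMap docs), hence the ceiling division into batches of 20 above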
for (i <- 0 until times) { // one batch of up to 20 rows per iteration
val list2: List[Row] = list.slice(i * 20, (i + 1) * 20)
var yx = "" // accumulates the concatenated coordinates
for (elem <- list2) {
val x: String = elem.getAs[String]("x")
val y: String = elem.getAs[String]("y")
yx += y + "," + x + "|" // must be +=, otherwise each row would overwrite the previous one
}
// Format: lng,lat|lng,lat|... ; substring(0, yx.length - 1) drops the trailing pipe
val res: HttpResponse[String] = Http("https://restapi.amap.com/v3/geocode/regeo")
.param("key", "72740e3cea1bc1b29502af781d1f0b39")
.param("batch", "true")
.param("location", s"${yx.substring(0, yx.length - 1)}").asString
//println(res.body)
// Parse only when the body is non-blank; otherwise JSON.parseObject would return null and the getString call below would NPE
val jSONObject: JSONObject = if (StringUtils.isBlank(res.body)) null else JSON.parseObject(res.body)
if (jSONObject != null && "10000".equals(jSONObject.getString("infocode"))) { // infocode 10000 means success
val jsonarray: JSONArray = jSONObject.getJSONArray("regeocodes")
for (j <- 0 until jsonarray.size) { // j indexes both the response array and list2, which are in the same order
val currentJsonObject = jsonarray.getJSONObject(j)
val MID: Int = list2(j).getAs[String]("mid").toInt
val X: String = list2(j).getAs[String]("x")
val Y: String = list2(j).getAs[String]("y")
val CNT: Int = list2(j).getAs[Long]("cnt").toInt
val ADDER: String = currentJsonObject.getString("formatted_address")
val addressComponent: JSONObject = currentJsonObject.getJSONObject("addressComponent")
val PRVC: String = addressComponent.getString("province")
val CTY: String = addressComponent.getString("city")
val CTY_CD: String = addressComponent.getString("citycode")
val DISTRICT: String = addressComponent.getString("district")
val AD_CD: String = addressComponent.getString("adcode")
val TOWN_SHIP: String = addressComponent.getString("township")
val TOWN_CD: String = addressComponent.getString("towncode")
val NB_NM: String = addressComponent.getJSONObject("neighborhood").getString("name")
val NB_TP: String = addressComponent.getJSONObject("neighborhood").getString("type")
val BD_NM: String = addressComponent.getJSONObject("building").getString("name")
val BD_TP: String = addressComponent.getJSONObject("building").getString("type")
val streetNumber: JSONObject = addressComponent.getJSONObject("streetNumber")
val STREET: String = streetNumber.getString("street")
val STREET_NB: String = streetNumber.getString("number")
val STREET_LOC: String = streetNumber.getString("location")
val STREET_DRCTION: String = streetNumber.getString("direction")
val STREET_DSTANCE: String = streetNumber.getString("distance")
val BUS_INFO: String = addressComponent.getString("businessAreas")
list3.append(Row(MID, X, Y, CNT, ADDER, PRVC, CTY, CTY_CD, DISTRICT, AD_CD, TOWN_SHIP, TOWN_CD, NB_NM, NB_TP,
BD_NM, BD_TP, STREET, STREET_NB, STREET_LOC, STREET_DRCTION, STREET_DSTANCE, BUS_INFO))
}
}
}
list3.iterator
})
//rdd1.foreach(println)
// Convert the RDD back into a DataFrame. Two options:
// Option 1: rdd.toDF
// Option 2: spark.createDataFrame with an explicit schema (used below)
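// Option 1 sketch (illustration only, not used here): toDF needs spark.implicits._ in scope
// and an RDD of tuples or case classes rather than RDD[Row], e.g.
// import spark.implicits._
// rdd1.map(r => (r.getInt(0), r.getString(1))).toDF("MID", "X")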
val schema = StructType(Array[StructField](
StructField("MID",IntegerType),
StructField("X",StringType),
StructField("Y",StringType),
StructField("CNT",IntegerType),
StructField("ADDER",StringType),
StructField("PRVC",StringType),
StructField("CTY",StringType),
StructField("CTY_CD",StringType),
StructField("DISTRICT",StringType),
StructField("AD_CD",StringType),
StructField("TOWN_SHIP",StringType),
StructField("TOWN_CD",StringType),
StructField("NB_NM",StringType),
StructField("NB_TP",StringType),
StructField("BD_NM",StringType),
StructField("BD_TP",StringType),
StructField("STREET",StringType),
StructField("STREET_NB",StringType),
StructField("STREET_LOC",StringType),
StructField("STREET_DRCTION",StringType),
StructField("STREET_DSTANCE",StringType),
StructField("BUS_INFO",StringType)
))
spark.createDataFrame(rdd1,schema).createTempView("mac_location")
spark.sql(
s"""
|insert overwrite table tw_mac_loc_d partition(data_dt='${inputDate}')
|select * from mac_location
|""".stripMargin)
spark.stop()
}
}
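The ConfigUtils helper referenced above is not shown in this listing. Here is a minimal sketch of what it might look like, assuming the settings live in a config.properties file on the classpath; the property names and defaults are assumptions, not the project's actual implementation:

package com.yh.musicproject.common

import java.util.Properties

object ConfigUtils {
// Load settings from config.properties on the classpath (assumed file name)
private val props = new Properties()
props.load(this.getClass.getClassLoader.getResourceAsStream("config.properties"))

// true = run locally with local[*]; false = submit to the yarn cluster
val LOCAL_RUN: Boolean = props.getProperty("local.run", "false").toBoolean // assumed key
val HIVE_METASTORE_URIS: String = props.getProperty("hive.metastore.uris", "") // assumed key
}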