// import
import Utils.SparkUtil
import breeze.linalg.DenseMatrix
import org.locationtech.jts.simplify.DouglasPeuckerSimplifier
import com.sankuai.roadfusion.algorithms.MaximalNearestSubline
import com.sankuai.roadfusion.utils.CoordinateTransformUtil
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.jts.registerTypes
import org.apache.spark.sql.{Column, DataFrame, Row,DatasetHolder}
import org.apache.spark.storage.StorageLevel
import org.locationtech.jts.geom.{Geometry, MultiLineString,LineString,Point,Coordinate,Polygon, GeometryFactory}
import org.locationtech.jts.index.strtree.STRtree
import com.google.gson.{JsonObject, JsonParser, JsonArray}
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import scala.collection.JavaConversions._
import scala.collection.mutable.WrappedArray
import scala.collection.mutable.ListBuffer
import org.apache.spark.HashPartitioner
import com.mtmap.model.{DbscanModel, GisTools, TrailTools}
import java.text.SimpleDateFormat
import scala.collection.JavaConversions._
// Log helper: prints an 80-char "=" separator suffixed with the current
// timestamp, then the message followed by a blank line.
def printlog(info: String): Unit = {
  val stamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new java.util.Date)
  println("=" * 80 + stamp)
  println(s"$info\n")
}
// --- Spark session bootstrap (local mode) and UDF registration ---
// NOTE(review): SparkUtil is a project helper presumably wrapping SparkSession — confirm its API.
val spark_sc = new SparkUtil("local[4]", "huangrui-jupyter")
// Register the JTS geometry user-defined types with Spark SQL (org.apache.spark.sql.jts).
registerTypes
import spark_sc.spark.implicits._
// Coordinate-transform helpers from GisTools; "Mars" presumably means GCJ-02 — confirm.
spark_sc.spark.udf.register("transToPlanar", GisTools.projectToPlanar _)
spark_sc.spark.udf.register("transToMars", GisTools.projectToMars _)
// Length of a JTS LineString, in the line's own coordinate units.
spark_sc.spark.udf.register("getLength", (line:LineString) => line.getLength)
// HDFS read/write examples.
// NOTE(review): `path`, `out_path` and `train_data` are placeholders that must be
// bound before these calls; each line here is an independent usage example.
//.parquet — read first so `dataframe` exists before it is written back out
// (the original wrote `dataframe` one line before defining it).
val dataframe = spark_sc.getParquet("/user/hadoop-mesa/huangrui13/trails_factor/jx/20200922/nanjing_3m/feature_fortrain/")
spark_sc.writeParquet(dataframe.repartition(1),path)
//.csv — project helper write, plus a raw spark-csv write (tab-separated, with header)
spark_sc.writeCsv(dataframe.repartition(1),path)
// NOTE(review): the variable name here was garbled ("df_road_gdataframeroup");
// assuming the intended DataFrame is `df_road_group` — confirm against the caller.
df_road_group.repartition(200).write.format("com.databricks.spark.csv").option("header", "true").option("delimiter", "\t").save(out_path)
val data = spark_sc.getCsv_t(path)
//.geojson
spark_sc.writeGeoJsonMany(dataframe,path)
spark_sc.writeGeoJsonOnly(dataframe,path) // writes to the local filesystem
// Dataset -> csv with header, UTF-8 encoded
train_data.repartition(1).write.option("header",true).option("encoding", "UTF-8").csv("train_0922.csv")
// Read rows from the Hive table mart_qcs_lbs.dwd_trk_roadnet_s_d into a DataFrame.
// Filters: city_id 320100 (presumably Nanjing — cf. the nanjing_3m path above),
// a single dt partition, lat/lng sanity bounds, and one vehicle id.
val query = """
| select t.dt,t.city_id,t.lat,t.lng,t.vehicle_id,t.gps_time
| from mart_qcs_lbs.dwd_trk_roadnet_s_d t where t.city_id in (320100) and t.dt=20200422
| and t.lat > -90 and t.lat < 90
| and t.lng > -180 and t.lng < 180
| and t.vehicle_id in ("9510a5f8a97044cdbc4162c9aebec9c1")
""".stripMargin
val dfpoints = spark_sc.spark.sql(query)
// Drop exact duplicate GPS fixes; count() forces evaluation of the plan.
val dfpoints_distinct = dfpoints.distinct()
dfpoints_distinct.count()
// Functional-style (rdd.mapPartitions) feature-extraction example.
// Builds, per partition, a list of ";"-joined point/roadid strings and returns
// them as a one-column DataFrame.
// NOTE(review): this snippet appears truncated — `arr_lengthdiff` and `point_roadid`
// are not defined anywhere in this excerpt, r_list is declared as a list of
// 4-tuples but a single String is appended, and toDF names only one column.
def get_features(spark_sc: SparkUtil, dfraw: DataFrame): DataFrame = {
import spark_sc.spark.implicits._
val dftrails: DataFrame = dfraw.rdd.mapPartitions(iter => {
var r_list = List[(String, String, String, String)]()
while (iter.hasNext) {
val cur = iter.next
val allResult = cur.getAs[String]("allResult")
if (arr_lengthdiff.length > 1){
// join roadid tokens of this trail with ";"
val true_point_roadid_string = point_roadid.map(_.toString).reduce(_ + ";" + _)
r_list = r_list.:+(true_point_roadid_string)
}
}
r_list.iterator
}).toDF("point_vs_roadid")
dftrails
}
val df_trails = get_features(spark_sc, duandian_produce)
// Implicit-conversion (rddToDatasetHolder) example: project each row of
// kld_door_group down to its nodeid and rebuild a one-column DataFrame.
val kld_group_road_per_node = kld_door_group.rdd
  .map(_.getAs[String]("nodeid"))
  .toDF("nodeid")
// Per-partition slicing: for each matched road id, collect the trail ids and
// sub-line lengths of the trails that touched it, emitting one 4-tuple row per road.
// NOTE(review): snippet appears truncated — `matchRoadids`, `allResult_addidx`,
// `trail_id`, `lenth_subline`, `arr_single_trail_lenth_string`,
// `arr_single_trail_trail_id_string` and `trail_points_idx` are not defined
// in this excerpt; treat as a shape/pattern example only.
val df_data_get_slice = df_trails.rdd.mapPartitions(iter => {
var r_list = List[(String, String, String, String)]()
while (iter.hasNext) {
val cur = iter.next
val true_point_roadid = cur.getAs[String]("point_vs_roadid").split(";")
for (i <- 0 until matchRoadids.size) {
// a road is matched in either direction: "id" or "-id"
val per_road_pointsidx = allResult_addidx.filter(x => x._1 == matchRoadids(i) || x._1 == ("-" + matchRoadids(i))).map(x => x._2)
val arr_single_trail_id = new ArrayBuffer[String]()
for (j <- 0 until per_road_pointsidx.size) {
arr_single_trail_id.append(trail_id)
}
// keep only roads touched by more than one trail point set
if (arr_single_trail_id.size > 1) {
arr_single_trail_lenth_string = lenth_subline.reduce(_ + ";" + _)
r_list = r_list.:+(matchRoadids(i), arr_single_trail_lenth_string, arr_single_trail_trail_id_string, trail_points_idx)
}
}
}
r_list.iterator
}).toDF("road_id", "lenth", "trail_id", "trail_points_idx")
// Join example: right join df1->df2 on df1_key==df2_key, fill nulls in jx_detect
// with -2, then keep only those rows — i.e. rows with no match on the left side
// (or whose jx_detect already was -2).
val tmp_join = df1.join(df2,$"df1_key"===$"df2_key","right").na.fill(Map("jx_detect" -> -2)).filter("jx_detect==-2")
// groupBy merge example: collapse all rows of one nodeid into single delimited
// strings (";" joins road ids, ":" joins in/out labels).
// Fix: removed the trailing comma after the last agg argument — a syntax error
// on Scala versions before 2.12.2.
val df_group = df.groupBy("nodeid").agg(
concat_ws(";",collect_list("fid")).as("road_id"),
concat_ws(":",collect_list("inoutlabel")).as("inoutlabel")
).toDF("nodeid","road_id","inoutlabel")
// Mitigate data skew by salting the groupBy key:
// "random_prefix" prepends a random 0-19 bucket as "<k>_<key>";
// "remove_prefix" strips that bucket back off.
// UDF: add a random prefix
spark_sc.spark.udf.register("random_prefix", (key:String)=>{
// Fully qualified: `Random` is not imported anywhere in this file (original
// `new Random()` did not compile).
val randNum = scala.util.Random.nextInt(20)
randNum + "_" + key
})
// UDF: remove the prefix
spark_sc.spark.udf.register("remove_prefix",(key:String)=>{
// Split at the FIRST underscore only (limit 2): the original split("_")(1)
// silently truncated any key that itself contains an underscore.
key.split("_", 2)(1)
})
// Replace nulls left by the earlier join with the sentinel "notrail".
val road_join_fill = road_join.na.fill("notrail")
// Delimiter legend: "+" separates the roadids of one nodeid intersection, ":" separates
// the trail ids matched to one roadid, ";" separates the time points of one trail id.
// Skew-mitigation stage 1: salt the grouping key with the random_prefix UDF.
val kld_door_group_tmp = road_join_fill.selectExpr("random_prefix(nodeid) as nodeid_random","*").drop("nodeid")
// Skew-mitigated first-stage aggregation: group by the salted key and merge
// every payload column of a group into one "+"-delimited string.
val kld_group_cols = Seq("fid", "inoutlabel", "passcode", "geom", "trail_id", "lenth", "trail_points_idx")
val kld_group_aggs = kld_group_cols.map(c => concat_ws("+", collect_list(c)).as(c))
val kld_door_group1 = kld_door_group_tmp
  .groupBy("nodeid_random")
  .agg(kld_group_aggs.head, kld_group_aggs.tail: _*)
  .toDF(("nodeid_random" +: kld_group_cols): _*)
// Skew-mitigation stage 2: strip the salt off the key, then aggregate once more
// so every nodeid collapses to exactly one row.
val kld_door_group_tmp2 = kld_door_group1.selectExpr("remove_prefix(nodeid_random) as nodeid","*").drop("nodeid_random")
val kld_door_group = kld_door_group_tmp2.groupBy("nodeid").agg(
concat_ws("+",collect_list("fid")).as("road_id"),
concat_ws("+",collect_list("inoutlabel")).as("inoutlabel"),
concat_ws("+",collect_list("passcode")).as("passcode"),
concat_ws("+",collect_list("geom")).as("geom"),
concat_ws("+",collect_list("trail_id")).as("trail_id"),
concat_ws("+",collect_list("lenth")).as("lenth"),
concat_ws("+",collect_list("trail_points_idx")).as("trail_points_idx")
).toDF("nodeid","road_id","inoutlabel","passcode","geom","trail_id","lenth","trail_points_idx")
// truncate=false so the full (long) delimited strings are visible.
kld_door_group.show(20,false)
// Truncation against data skew: rank trails within each road_id by lenth
// (ascending) and keep at most 8000 per road. ("lenth" is the established
// misspelling used throughout this file — keep it consistent.)
// NOTE(review): "order" is an SQL keyword used as a column alias — confirm Spark's
// parser accepts it here without backticks.
val dforder = df.selectExpr("road_id", "lenth", "trail_id", "trail_points_idx",
"row_number() over (partition by road_id order by lenth) as order").filter("order<=8000")
// Road-segment feature: one-hot direction histogram over 8 compass buckets
// centered at 0,45,...,315 degrees; each bucket spans [c-22.5, c+22.5) with an
// inclusive lower edge. Assumes dir is in [0, 360] — TODO confirm upstream.
// Fix: the original wrapped with d = 360.0 - dir, which maps the boundary value
// dir == 337.5 to d == +22.5 and matches NO bucket (all-zero histogram).
// Using d = dir - 360.0 keeps the wrap-around in [-22.5, 0], so the 0-degree
// bucket's inclusive lower edge catches 337.5; every other input is unchanged.
val hist_dir = (dir:Double) => {
val d = if(dir<337.5) dir else dir-360.0
val features = Array(0,45,90,135,180,225,270,315)
.map(t => if((-22.5<=d-t)&&(d-t<22.5)) 1.0 else 0.0)
features
}
// lead() window-function example.
import org.apache.spark.sql.expressions.Window
// Per vehicle, ordered by gps_time ascending: attach the next fix's gps_time
// (default -1 when the row is the vehicle's last point), then sort globally.
val window = Window.partitionBy("vehicle_id").orderBy($"gps_time".asc)
val dftmp = dfpoints_distinct.withColumn("next_gps_time", lead($"gps_time", 1, -1).over(window)).orderBy($"gps_time".asc)
// Parse DataFrame rows into a typed Dataset of per-node feature records.
import org.locationtech.jts.geom.{Geometry, MultiLineString,LineString,Point,Coordinate,Polygon, GeometryFactory}
import org.apache.spark.sql.{Column, DataFrame, Row,DatasetHolder}
import scala.collection.JavaConversions._
val test_dataset_dataset = test_dataset.rdd.map(row=>{
val nodeid = row.getAs[String]("nodeid")
val group_road = row.getAs[String]("group_road")
// Geometry column is a JTS MultiLineString; keep its WKT string form.
val geom = row.getAs[MultiLineString]("geom").toString
// NOTE(review): the getAs[Row](...).toSeq reads below assume struct-typed columns —
// confirm against the schema; the "+"-joined names look like concatenated
// feature-group labels rather than literal column names.
val group_road_nums = row.getAs[Row]("group_road_nums").toSeq.map(x=>x.asInstanceOf[Int])
val group_road_lenth = row.getAs[Row]("group_road_lenth").toSeq.map(x=>x.asInstanceOf[Int])
val all_match_nums = row.getAs[Int]("all_match_nums")
val same_dir_match_nums = row.getAs[Int]("same_dir_match_nums")
// Percentages are stored as strings upstream; parsed to Double here.
val instream_percent = row.getAs[String]("instream_percent").toDouble
val outstream_percent = row.getAs[String]("outstream_percent").toDouble
val gap_idx = row.getAs[Row]("gap_idx_5+gap_idx_10+gap_idx_20+gap_idx_40").toSeq.map(x=>x.asInstanceOf[Double])
val gap_lenth = row.getAs[Row]("gap_lenth_20+gap_lenth_50+gap_lenth_100+gap_lenth_200+gap_lenth_500").toSeq.map(x=>x.asInstanceOf[Double])
val gap_kd = row.getAs[Row]("gap_kd_1_0+gap_kd_1_5+gap_kd_2_0+gap_kd_3_0+gap_kd_5_0").toSeq.map(x=>x.asInstanceOf[Double])
val gap2_kd = row.getAs[Row]("gap2_kd_1_5+gap2_kd_2_0+gap2_kd_3_0+gap2_kd_5_0").toSeq.map(x=>x.asInstanceOf[Double])
val turndir = row.getAs[Double]("turndir")
val zero_trail = row.getAs[Double]("zero_trail")
val gap_road_lenth = row.getAs[Double]("gap_road_lenth")
val target = row.getAs[Int]("target")
// NOTE(review): SOMEAMAZING is a placeholder case-class name — not defined in this
// excerpt; replace with the real record type before running.
SOMEAMAZING(nodeid,group_road,geom,group_road_nums(0),group_road_nums(1),group_road_lenth(0),group_road_lenth(1),
all_match_nums,instream_percent,outstream_percent,same_dir_match_nums,turndir,
gap_kd(0),gap_kd(1),gap_kd(2),gap_kd(3),gap_kd(4),
gap_idx(0),gap_idx(1),gap_idx(2),gap_idx(3),
gap_lenth(0),gap_lenth(1),gap_lenth(2),gap_lenth(3),gap_lenth(4),
gap2_kd(0),gap2_kd(1),gap2_kd(2),gap2_kd(3),
gap_road_lenth,zero_trail,target)
}).toDS()
test_dataset_dataset.show(2)
// Dataset split & union: 80/20 split of the positive (target==1) samples,
// then union positives with negatives to form the training set.
// NOTE(review): randomSplit is unseeded here, so the split is not reproducible
// run-to-run; trainData_false_sh is defined elsewhere — confirm it exists.
val Array(trainData_true_sh, testData_true_sh) = all_sample_sh.filter("target==1").randomSplit(Array(0.8, 0.2))
val train_dataset_sh = trainData_true_sh.union(trainData_false_sh)
// Bearing of the segment p1 -> p2, normalized into [0, 360) degrees.
var ink = math.toDegrees(scala.math.atan2(p2.getY()-p1.getY(), p2.getX()-p1.getX()))
if (ink < 0) {
  ink += 360
}
ink = ink % 360
// Squared Euclidean distance between two JTS coordinates (no sqrt — cheap
// when only comparing magnitudes; take math.sqrt for the real distance).
def get_distance2(p1:Coordinate,p2:Coordinate): Double = {
math.pow(p1.getX-p2.getX,2)+math.pow(p1.getY-p2.getY,2)
}
// Distance between p2 and p3, rounded to 2 decimals.
// Fix: the original `.formatted("%.2f").toDouble` round-trips through a
// locale-sensitive string — in locales with a comma decimal separator the
// formatted value ("1,23") makes .toDouble throw. Round numerically instead
// (math.round is half-up, matching "%.2f" for these non-negative distances).
var gap_road_lenth = math.round(math.sqrt(get_distance2(p2,p3)) * 100.0) / 100.0