Spark reads a source table from Hive, applies some simple processing, and writes the result back into Hive:
package com.dianyou.offline.action

import com.alibaba.fastjson.{JSONArray, JSONObject}
import com.dianyou.util.PropertiesUtil
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

/**
 * Created: 2019/12/26
 * Purpose: user action events together with their detail pages
 */
object UserAction {

  var DIR: String = _
  var NUMBER: Int = _
  val LOGGER: Logger = Logger.getLogger(UserAction.getClass)
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  // final val APP_START = "APP启动分析"

  def main(args: Array[String]): Unit = {
    // Logger.getLogger("org.apache.hadoop").setLevel(Level.ERROR)
    // Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Logger.getLogger("org.apache.hive").setLevel(Level.ERROR)
    // Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR)

    if (args.length < 1) {
      LOGGER.warn("please pass in the path of the configuration file")
      System.exit(0)
    } else {
      DIR = args(0)
    }
    NUMBER = 100
    // time passed in to the job
    // val time_0 = args(1)

    // load the metastore (MySQL) connection properties from HDFS
    val mysqlPro = PropertiesUtil.loadHDFSPro(args(0), "mysql")

    // create the SparkConf
    val sparkConf = new SparkConf().setAppName("adver").setMaster("local[*]")
    sparkConf.set("javax.jdo.option.ConnectionURL", mysqlPro.getProperty("javax.jdo.option.connectionurl"))
    sparkConf.set("javax.jdo.option.ConnectionDriverName", mysqlPro.getProperty("javax.jdo.option.connectiondrivername"))
    sparkConf.set("javax.jdo.option.ConnectionUserName", mysqlPro.getProperty("javax.jdo.option.connectionusername"))
    sparkConf.set("javax.jdo.option.ConnectionPassword", mysqlPro.getProperty("javax.jdo.option.connectionpassword"))

    val spark = SparkSession
      .builder
      .config(sparkConf)
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "spark-warehouse")
      .getOrCreate()

    // select the Hive database and enable dynamic partitioning
    spark.sql("use xxx")
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")

    LOGGER.error("1. read the Hive table")
    val day = "2017-12-19"
    // val sql = "select concat(\"{\"infos\":infos,\",\",\"deviceid\":deviceid}\") as value, deviceid from test_impala2"
    // val sql = "select concat(infos,'|',deviceid_id) as value, deviceid_id from test_impala2"
    val sql =
      s"""
         |select time,id,infos,day,deviceid from test_impala2 limit 1
         |""".stripMargin
    /*
    val sql =
      s"""
         |select time,id,infos,day,deviceid_id from test_impala2 where day = '${day}' limit 1
         |""".stripMargin
    */
    val df2: DataFrame = spark.sql(sql)
    df2.show()
    val rdd22 = df2.rdd

    // todo 1. global sort by day and device id first
    val sortRdd = rdd22.sortBy(x => {
      val urlTime = x.getAs[String]("time")
      // todo convert to a timestamp and sort ascending
      urlTime.replace(".", "").toLong
    }, ascending = true)

    // wrap every row in a JSON object
    val jsonRdd: RDD[JSONObject] = sortRdd.map((log: Row) => {
      val time = log.getAs[String]("time").trim
      val infos = log.getAs[String]("infos").trim
      val day = log.getAs[String]("day").trim
      val deviceid = log.getAs[String]("deviceid").trim
      // val lableName = log.getAs[String]("lableName").trim
      val logJson = new JSONObject()
      logJson.put("time", time)
      logJson.put("infos", infos)
      logJson.put("day", day)
      logJson.put("deviceid", deviceid)
      logJson
    })

    // todo group by device id and sort within each group
    val rdd2 = jsonRdd.map(log => {
      val deviceid = log.getString("deviceid")
      (deviceid, log)
    })

    val rdd: RDD[(String, String)] = rdd2.groupByKey().map(x => {
      val key = x._1
      val values: List[JSONObject] = x._2.toList
      val rds: List[JSONObject] = values.sortBy(_.getString("time"))
      val jsonArray = new JSONArray()
      var index = 0
      rds.foreach(logJson => {
        // val lableName = logJson.getString("lableName")
        val deviceid = logJson.getString("deviceid")
        // todo bump the session index on every app-start record
        if (deviceid.contains("APP启动分析")) {
          index += 1
        }
        logJson.put("type", index)
        jsonArray.add(logJson)
      })
      (key, jsonArray.toJSONString)
    })

    // todo write the result back into Hive
    val rdd2222: RDD[Person] = rdd.map(x => {
      val deviceid = x._1
      val days = day
      val didnum = getPartitionNum(deviceid)
      val infos = x._2
      println("didnum = " + didnum)
      Person(deviceid, infos, days, didnum)
    })

    import spark.implicits._
    rdd2222.toDF().createOrReplaceTempView("tempTest")
    rdd2222.toDF().show()

    spark.sql("create table if not exists test_impala3(deviceid string, infos string) partitioned by (day string, didnum string)")
    val testSql =
      s"""
         |insert into table aijiami.test_impala3 partition(day,didnum)
         |select deviceid,infos,day,didnum from tempTest
         |""".stripMargin
    LOGGER.error("========== insert into the new table ==========")
    spark.sql(testSql)
    LOGGER.info("---------------- insert finished ----------------")

    spark.sparkContext.stop()
    spark.stop()
  }

  // todo tuning reference: https://blog.csdn.net/weixin_42348946/article/details/83376442
  // map a device id to a fixed-width hash bucket used as the didnum partition value
  def getPartitionNum(deviceId: String): String = {
    // todo NUMBER should come from the config file; default is 10
    // take the modulo before abs so Integer.MIN_VALUE cannot yield a negative bucket
    val getMod = Math.abs(deviceId.hashCode % NUMBER)
    String.format("%03d", Integer.valueOf(getMod))
  }

  case class Person(deviceid: String, infos: String, day: String, didnum: String)
}
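The didnum partition column is just a hash bucket of the device id, so a given day can end up with at most NUMBER didnum partitions. A minimal sanity check of the helper, run as a scratch object or from a REPL (the device ids below are made up, and NUMBER has to be assigned first because it is normally only set inside main):

import com.dianyou.offline.action.UserAction

object BucketCheck {
  def main(args: Array[String]): Unit = {
    UserAction.NUMBER = 100 // same value that main() assigns
    // made-up device ids; each one maps to a zero-padded bucket between 000 and 099
    Seq("dev-0001", "dev-0002", "ABCDEF123456").foreach { id =>
      println(s"$id -> ${UserAction.getPartitionNum(id)}")
    }
  }
}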
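After the job finishes, one way to confirm that the dynamic-partition insert actually spread devices across buckets is to count rows per partition. A sketch, run in any SparkSession with Hive support enabled (spark-shell provides one as spark); it assumes the table lives in the aijiami database, as in the insert statement above:

val partitionCounts = spark.sql(
  """
    |select day, didnum, count(*) as device_cnt
    |from aijiami.test_impala3
    |group by day, didnum
    |order by day, didnum
    |""".stripMargin)
partitionCounts.show(200, truncate = false)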