Spark SQL: reading local files and writing them to Hive

1. For business reasons, local log files need to be imported into Hive according to the corresponding tables. The code is as follows:


import com.alibaba.fastjson.{JSON, JSONException, JSONObject}
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}

object KafkaLogFileToHdfs {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("KafkaLogFileToHdfs3")
      .master("local[*]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.network.timeout", "600")
      .config("spark.sql.broadcastTimeout", "36000")
      .config("spark.local.dir", "/bigdata02/sparklog")
      .config("spark.sql.broadcastTimeout", "36000")
      .getOrCreate()
    // topic name -> number of partitions to use when reading/writing that topic
    val topics = Map(
      "batch_log1" -> 10,
      "batch_log2" -> 20,
      "batch_log3" -> 20
    )
    val day = args(0)
    topics.foreach(topic => {
      // Read the raw log file for this topic and day, spread the lines across the
      // configured number of partitions, and parse each line into its bean type.
      val rdd = spark.read.text(s"file:///bigdata04/log2file/data/${topic._1}-$day")
        .repartition(topic._2)
        .rdd
        .map(a => changeToBean(topic._1, a.getString(0)))
      saveToHdfs(rdd, topic._1, day, spark)
    })
  }

  // Parse one raw log line into the bean that matches its topic. Unknown topics
  // and lines that are not valid JSON fall through as non-bean values, which the
  // type filters in saveToHdfs silently drop.
  def changeToBean(topic: String, data: String): Any = {
    try {
      val value = JSON.parseObject(data)
      topic match {
        case "batch_log1" => PorLog(value.getString("event"), value.getString("t_mac").toUpperCase, value.getString("t_ip"), value.getLong("time"), value.getString("t_type"), value.getString("t_Osverison"), value.getString("generate_url"), value.getString("user_agent"), value.getString("service_version"), value.getString("download_url"), value.getString("is_cache"), value.getString("train_id"))
        case "batch_log2" => FlowReport(value)
        case "batch_log3" => macProbe(value)
        case _ => null
      }
    } catch {
      case _: JSONException => "invalid data"
    }
  }
  // batch_log3: probe records carry a JSON string in "message", which in turn
  // carries another JSON string in "info" holding the station/AP MAC addresses.
  val macProbe = (value: JSONObject) => {
    val message = JSON.parseObject(value.getString("message"))
    val info = JSON.parseObject(message.getString("info"))
    UserProbe(message.getString("trainId"), message.getLong("localTimeStamp"),
      info.getString("sta_mac"), info.getString("ap_mac"))
  }

  // batch_log2: flow reports carry a "users" array; only the first entry is kept.
  // Reports without users yield null and are dropped by the filters in saveToHdfs.
  val FlowReport = (value: JSONObject) => {
    val users = value.getJSONArray("users")
    if (users != null && users.size() > 0) {
      val userJson = JSON.parseObject(users.get(0).toString)
      CSFlowReportLog(
        value.getString("carid"), value.getLong("utime"),
        userJson.getString("uuid"), userJson.getLong("upBytes"),
        userJson.getLong("downBytes"), userJson.getLong("usedBytes"),
        userJson.getString("bssid"), userJson.getString("mac").toUpperCase,
        userJson.getString("ip"), userJson.getInteger("status"),
        userJson.getString("uphone"), userJson.getString("options")
      )
    } else {
      null
    }
  }
  // Remove any previous output for this topic/day, then write the parsed beans
  // as ORC. Only one of the three type filters matches records for a given topic;
  // the other two produce empty DataFrames.
  def saveToHdfs(rdd: RDD[Any], topic: String, day: String, spark: SparkSession): Unit = {
    import spark.implicits._
    val path = s"/logfile/$topic/$day"
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    val hdfs = org.apache.hadoop.fs.FileSystem.get(hadoopConf)
    if (hdfs.exists(new Path(path))) {
      hdfs.delete(new Path(path), true)
    }
    rdd.filter(_.isInstanceOf[PorLog]).map(_.asInstanceOf[PorLog]).toDF().write.mode(SaveMode.Append).orc(path)
    rdd.filter(_.isInstanceOf[CSFlowReportLog]).map(_.asInstanceOf[CSFlowReportLog]).toDF().write.mode(SaveMode.Append).orc(path)
    rdd.filter(_.isInstanceOf[UserProbe]).map(_.asInstanceOf[UserProbe]).toDF().write.mode(SaveMode.Append).orc(path)
  }
}
case class PorLog(event: String, t_mac: String, t_ip: String, utime: Long, t_type: String, t_Osverison: String, generate_url: String, user_agent: String, service_version: String, download_url: String, is_cache: String, train_id: String)

case class CSFlowReportLog(carid: String, utime: Long, uuid: String, upBytes: Long, downBytes: Long, usedBytes: Long, bssid: String, mac: String, ip: String, status: Int, uphone: String, options: String)

case class UserProbe(trainId: String, localTimeStamp: Long, sta_mac: String, ap_mac: String)
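
The job itself only lays ORC files down under /logfile/<topic>/<day>; to actually query them from Hive, those directories still have to be registered in the metastore. A minimal sketch of that follow-up step, assuming a Hive-enabled SparkSession and a hypothetical database/table (ods.por_log) for the batch_log1 output, might look like this:

import org.apache.spark.sql.SparkSession

// Hypothetical registration step; the database, table, and example day value
// are assumptions, and the columns mirror the PorLog case class above.
val hive = SparkSession.builder()
  .appName("RegisterLogTables")
  .enableHiveSupport()
  .getOrCreate()

hive.sql(
  """CREATE EXTERNAL TABLE IF NOT EXISTS ods.por_log (
    |  event STRING, t_mac STRING, t_ip STRING, utime BIGINT,
    |  t_type STRING, t_Osverison STRING, generate_url STRING,
    |  user_agent STRING, service_version STRING, download_url STRING,
    |  is_cache STRING, train_id STRING
    |) PARTITIONED BY (dt STRING)
    |STORED AS ORC
    |LOCATION '/logfile/batch_log1'""".stripMargin)

// Attach the directory written for one day as a partition of the table.
hive.sql("ALTER TABLE ods.por_log ADD IF NOT EXISTS PARTITION (dt='2019-01-01') " +
  "LOCATION '/logfile/batch_log1/2019-01-01'")

Alternatively, enabling Hive support in the job itself and writing with saveAsTable or insertInto would avoid the separate DDL step, at the cost of coupling the import job to the metastore.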

 

2. The launch script is as follows:

nohup spark-submit --master local --conf spark.driver.memory=4G --class com.bigdata.KafkaLogFileToHdfs --conf spark.dynamicAllocation.enabled=false --executor-memory 2G --num-executors 2 --executor-cores 1 /gtdata04/liqiang/log2hdfs.jar  $lastdate  >> /gtdata01/log/KafkaLogFileToHdfs.log 2>&1 &
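
In local mode the --num-executors / --executor-memory flags are ignored; the whole job runs inside the driver JVM, so spark.driver.memory is the setting that actually matters here. $lastdate is expected to be set by the surrounding shell script and is passed through as args(0). After a run, a quick spot check (a hypothetical snippet, e.g. from spark-shell; the day value is an assumption) can compare what was written against the source file:

// Hypothetical verification: row count of one topic/day's ORC output vs. the
// number of input lines. Counts may differ if some lines failed JSON parsing.
val day = "2019-01-01"
val written = spark.read.orc(s"/logfile/batch_log1/$day").count()
val source  = spark.read.text(s"file:///bigdata04/log2file/data/batch_log1-$day").count()
println(s"batch_log1 $day: $written rows written from $source input lines")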

 
