1. For business reasons, local log files need to be loaded into Hive according to the corresponding tables. The code is as follows:
import com.alibaba.fastjson.{JSON, JSONException, JSONObject}
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}
object KafkaLogFileToHdfs {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("KafkaLogFileToHdfs3")
      .master("local[*]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.network.timeout", "600")
      .config("spark.sql.broadcastTimeout", "36000")
      .config("spark.local.dir", "/bigdata02/sparklog")
      .getOrCreate()
    // Topic name -> number of partitions to use when repartitioning its data.
    val topics = Map(
      "batch_log1" -> 10,
      "batch_log2" -> 20,
      "batch_log3" -> 20
    )
    val day = args(0) // date suffix of the input files, passed on the command line
    topics.foreach { case (topic, partitions) =>
      // Read the topic's local log file, parse each line into its bean, then persist to HDFS.
      val beans = spark.read.text(s"file:///bigdata04/log2file/data/$topic-$day").rdd
        .map(row => changeToBean(topic, row.getString(0)))
        .repartition(partitions)
      saveToHdfs(beans, topic, day, spark)
    }
  }
  // Parse one raw JSON line into the bean matching its topic. Unknown topics
  // and unparseable lines yield non-bean values that saveToHdfs filters out.
  def changeToBean(topic: String, data: String): Any = {
    try {
      val value = JSON.parseObject(data)
      topic match {
        case "batch_log1" => PorLog(
          value.getString("event"), value.getString("t_mac").toUpperCase,
          value.getString("t_ip"), value.getLong("time"),
          value.getString("t_type"), value.getString("t_Osverison"),
          value.getString("generate_url"), value.getString("user_agent"),
          value.getString("service_version"), value.getString("download_url"),
          value.getString("is_cache"), value.getString("train_id"))
        case "batch_log2" => FlowReport(value)
        case "batch_log3" => macProbe(value)
        case _ => null
      }
    } catch {
      case _: JSONException => "invalid data" // malformed line; dropped downstream
    }
  }
  // batch_log3: the payload nests JSON twice (message -> info); parse each level once.
  val macProbe = (value: JSONObject) => {
    val message = JSON.parseObject(value.getString("message"))
    val info = JSON.parseObject(message.getString("info"))
    UserProbe(message.getString("trainId"), message.getLong("localTimeStamp"),
      info.getString("sta_mac"), info.getString("ap_mac"))
  }
  // batch_log2: flatten the first entry of the "users" array into a flow-report bean;
  // records with an empty users array yield null and are filtered out downstream.
  val FlowReport = (value: JSONObject) => {
    val users = value.getJSONArray("users")
    if (users.size() > 0) {
      val userJson = JSON.parseObject(users.get(0).toString)
      CSFlowReportLog(
        value.getString("carid"), value.getLong("utime"),
        userJson.getString("uuid"), userJson.getLong("upBytes"),
        userJson.getLong("downBytes"), userJson.getLong("usedBytes"),
        userJson.getString("bssid"), userJson.getString("mac").toUpperCase,
        userJson.getString("ip"), userJson.getInteger("status"),
        userJson.getString("uphone"), userJson.getString("options")
      )
    } else null
  }
  // Replace this topic's daily directory, then write whichever bean type the RDD
  // actually contains; the other two collects match nothing and write no rows.
  def saveToHdfs(rdd: RDD[Any], topic: String, day: String, spark: SparkSession): Unit = {
    import spark.implicits._
    val path = s"/logfile/$topic/$day"
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    val hdfs = org.apache.hadoop.fs.FileSystem.get(hadoopConf)
    if (hdfs.exists(new Path(path))) {
      hdfs.delete(new Path(path), true)
    }
    rdd.collect { case b: PorLog => b }.toDF().write.mode(SaveMode.Append).orc(path)
    rdd.collect { case b: CSFlowReportLog => b }.toDF().write.mode(SaveMode.Append).orc(path)
    rdd.collect { case b: UserProbe => b }.toDF().write.mode(SaveMode.Append).orc(path)
  }
}
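// Bean definitions: one case class per topic, giving each log type its ORC schema.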
case class PorLog(event: String, t_mac: String, t_ip: String, utime: Long, t_type: String, t_Osverison: String, generate_url: String, user_agent: String, service_version: String, download_url: String, is_cache: String, train_id: String)
case class CSFlowReportLog(carid: String, utime: Long, uuid: String, upBytes: Long, downBytes: Long, usedBytes: Long, bssid: String, mac: String, ip: String, status: Int, uphone: String, options: String)
case class UserProbe(trainId: String, localTimeStamp: Long, sta_mac: String, ap_mac: String)
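The job only lands ORC files under /logfile/<topic>/<day>; to query them from Hive, the directories still have to be registered as a table. Below is a minimal sketch for the batch_log1 output, assuming a Hive-enabled SparkSession and a hypothetical ods.por_log table (the column list mirrors the PorLog case class):
import org.apache.spark.sql.SparkSession
object RegisterLogTables {
  def main(args: Array[String]): Unit = {
    val day = args(0) // same date argument the import job receives
    val spark = SparkSession.builder()
      .appName("RegisterLogTables")
      .enableHiveSupport() // required for Hive DDL
      .getOrCreate()
    // External table over the batch_log1 output; columns mirror the PorLog case class.
    spark.sql(
      """CREATE EXTERNAL TABLE IF NOT EXISTS ods.por_log (
        |  event STRING, t_mac STRING, t_ip STRING, utime BIGINT,
        |  t_type STRING, t_Osverison STRING, generate_url STRING, user_agent STRING,
        |  service_version STRING, download_url STRING, is_cache STRING, train_id STRING
        |) PARTITIONED BY (day STRING)
        |STORED AS ORC
        |LOCATION '/logfile/batch_log1'""".stripMargin)
    // The writer uses plain /logfile/<topic>/<day> directories rather than day=<day>,
    // so attach each day's directory as an explicit partition location.
    spark.sql(
      s"""ALTER TABLE ods.por_log ADD IF NOT EXISTS
         |PARTITION (day='$day') LOCATION '/logfile/batch_log1/$day'""".stripMargin)
    spark.stop()
  }
}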
2. The launch script is as follows:
nohup spark-submit --master local --conf spark.driver.memory=4G --class com.bigdata.KafkaLogFileToHdfs --conf spark.dynamicAllocation.enabled=false --executor-memory 2G --num-executors 2 --executor-cores 1 /gtdata04/liqiang/log2hdfs.jar $lastdate >> /gtdata01/log/KafkaLogFileToHdfs.log 2>&1 &
Note that the code hard-codes .master("local[*]"), which takes precedence over --master local, so the job runs in local mode; the --executor-memory, --num-executors, and --executor-cores flags therefore have no effect, and only spark.driver.memory controls the available memory.
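After a run, a quick read-back is an easy way to confirm the output; a sketch, assuming spark-shell on the same cluster and one topic:
// Hypothetical spot check from spark-shell; use the same date that was passed as $lastdate.
val day = "2019-07-01" // example value only
spark.read.orc(s"/logfile/batch_log1/$day").show(5, truncate = false)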