Syncing Data from Amazon S3 into a Hive Partitioned Table
This post walks through using a Spark job to sync data collected by AppsFlyer from Amazon S3 onto HDFS and then into a Hive table, so that the data can be analyzed and computed on.
1. Writing the Spark Code
① Build the SparkSession and configure S3A access to the bucket:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  //.master("local[*]")
  .config("spark.eventLog.enabled", "false")
  .config("spark.driver.memory", "2g")
  .config("spark.executor.memory", "2g")
  .appName("SparkDemoFromS3")
  .getOrCreate()

// AWS Access Key: *******
// Home Folder: ******
// Bucket Name: *******
// Bucket Secret: ******
spark.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", "******")
spark.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", "******")
spark.sparkContext.hadoopConfiguration.set("fs.s3a.endpoint", "s3-eu-west-1.amazonaws.com")

// args(0) is the report type (the t= path segment), args(1) is the date (the dt= segment)
ImportData.Import(spark, args(0), args(1))
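Note that the s3a:// scheme is implemented by the hadoop-aws module, which does not ship on Spark's classpath by default. A minimal build.sbt sketch (the version numbers are illustrative placeholders; hadoop-aws must match the cluster's Hadoop version):

// build.sbt (sketch): versions are placeholders, pick the ones matching your cluster
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"  % "2.4.8" % "provided",
  "org.apache.hadoop" %  "hadoop-aws" % "2.8.5"
)

hadoop-aws pulls in the matching AWS SDK transitively; alternatively, the two jars can be supplied to spark-submit with --jars.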
② Define the import function. Here name is the report type (the t= partition in the S3 path) and dt is the date (the dt= partition); each hour of the day sits under its own h= directory:

def Import(spark: SparkSession, name: String, dt: String): Unit = {
  val rdd0 = spark.sparkContext
    .textFile(s"s3a://af-ext-reports/216c-acc-iHe6xzdF-216c/data-locker-hourly/t=$name/dt=$dt/h=0")
  val rdd1 = spark.sparkContext
    .textFile(s"s3a://af-ext-reports/216c-acc-iHe6xzdF-216c/data-locker-hourly/t=$name/dt=$dt/h=1")
  val rdd2 = spark.sparkContext
    .textFile(s"s3a://af-ext-reports/216c-acc-iHe6xzdF-216c/data-locker-hourly/t=$name/dt=$dt/h=2")
  val rdd3 = spark.sparkContext
    .textFile(s"s3a://af-ext-reports/216c-acc-iHe6xzdF-216c/data-locker-hourly/t=$name/dt=$dt/h=3")
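The pattern above repeats for each remaining hour of the day, which gets verbose. Below is a more compact sketch of the same function, assuming the goal is to union all hourly directories for the date and load them into a date-partitioned Hive table; the staging path, the ods database, and the LOAD DATA statement are assumptions for illustration, not taken from the original code, and the SparkSession would also need .enableHiveSupport() for the Hive statement to run:

import org.apache.spark.sql.SparkSession

object ImportData {
  // Sketch only: unions the 24 hourly directories and loads them into a Hive partition.
  // Assumes a partitioned text table ods.<name> already exists.
  def Import(spark: SparkSession, name: String, dt: String): Unit = {
    val base = s"s3a://af-ext-reports/216c-acc-iHe6xzdF-216c/data-locker-hourly/t=$name/dt=$dt"
    // textFile accepts a comma-separated list of paths, so one call covers all hours
    val day = spark.sparkContext.textFile((0 to 23).map(h => s"$base/h=$h").mkString(","))

    // Stage the day's data on HDFS (hypothetical path), then move it into the Hive partition
    val staging = s"/tmp/appsflyer/$name/$dt"
    day.saveAsTextFile(staging)
    spark.sql(s"LOAD DATA INPATH '$staging' OVERWRITE INTO TABLE ods.$name PARTITION (dt='$dt')")
  }
}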