SparkSession plus Spark SQL is a pleasure to use: queries are far easier to write than the equivalent RDD or DataFrame code.
Of course, the job can also be written directly against the DataFrame, as the commented-out lines at the end of the listing show.
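To illustrate the point before the full listing, a minimal sketch comparing the two styles, assuming a SparkSession named spark and a registered table t with columns k and v (all hypothetical):

// Spark SQL: the aggregation reads as the query it is.
val bySql = spark.sql("select k, sum(v) as total from t group by k")

// DataFrame API: the same aggregation spread across method calls.
import org.apache.spark.sql.functions.sum
val byDf = spark.table("t").groupBy("k").agg(sum("v").as("total"))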
package XXX

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}

object adsDataToHDFS extends Logging {

  case class adsInfo(adId: Long, settlementType: Int, billing_type: Int, ad_campaign_id: Long, app_id: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName(this.getClass.getName).enableHiveSupport().getOrCreate()
    import spark.implicits._

    // args(0): catalog holding the ad_dim database; args(1): HDFS output directory
    val catalog = args(0)
    val targetPath = args(1)

    // saveAsTextFile throws if the output directory already exists, so remove it first
    val fileSystem = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    val path = new Path(targetPath)
    if (fileSystem.exists(path)) {
      fileSystem.delete(path, true)
    }

    // Join ad creatives to their ad groups and map the rows onto the case class
    val adsInfoDF: Dataset[adsInfo] = spark.sql(
      s"""
         |select
         |  cast(b.id as bigint) as adId,
         |  a.settlement_type as settlementType,
         |  a.billing_type,
         |  a.ad_campaign_id,
         |  a.app_id
         |from (select id, settlement_type, billing_type, ad_campaign_id, app_id from $catalog.ad_dim.dim_ad_group) a
         |join (select id, ad_group_id from $catalog.ad_dim.dim_ad_creative) b on a.id = b.ad_group_id
         |""".stripMargin).as[adsInfo]

    // Emit one comma-separated text file under targetPath
    adsInfoDF.rdd
      .map(f => f.adId + "," + f.settlementType + "," + f.billing_type + "," + f.ad_campaign_id + "," + f.app_id)
      .repartition(1)
      .saveAsTextFile(targetPath)

    // Equivalent DataFrame write; see the sketch after the listing:
    // adsInfoDF.repartition(1).write.mode(SaveMode.Overwrite)
    //   .csv(targetPath)
  }
}
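As the commented-out lines suggest, the DataFrame writer can replace the RDD route entirely: SaveMode.Overwrite deletes the target directory itself, so the manual FileSystem.exists/delete step above becomes unnecessary. A minimal sketch, reusing adsInfoDF and targetPath from the listing:

// Overwrite handles a pre-existing directory; repartition(1) still
// yields a single part file, and csv writes comma-separated by default.
adsInfoDF
  .repartition(1)
  .write
  .mode(SaveMode.Overwrite)
  .csv(targetPath)

Either way the output lands under targetPath as part-* files. Note that repartition(1) funnels all data through a single task, so for large results it is worth weighing against coalesce(1) or simply keeping multiple output files.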