val df3 = df1.join(df2, joinColumns, joinType)  // general form: the join columns and the join type
1 When the two tables' join column names are the same
val df = a11.join(a22, Seq("receive_time", "channel_code"))
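With the Seq form, each join key appears only once in the result, so there are no ambiguous duplicate columns. A minimal sketch under that assumption, using two hypothetical toy DataFrames (left/right are illustrative names, and a SparkSession named spark is assumed):

import spark.implicits._
val left  = Seq((1, "a"), (2, "b")).toDF("receive_time", "v1")
val right = Seq((1, "x"), (2, "y")).toDF("receive_time", "v2")
// the result contains a single receive_time column
left.join(right, Seq("receive_time")).show()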
2 When the two tables' join column names differ (note the three equals signs)
val h5_1 = h10_lev3.join(h10_lev2, h10_lev3("parentid_3") === h10_lev2("node2id"), "inner")
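Unlike the Seq form, an expression join keeps both key columns (parentid_3 and node2id) in the output; if only one is needed, the redundant one can be dropped after the join. A sketch under that assumption, reusing the DataFrames above:

val h5_1_dedup = h10_lev3
  .join(h10_lev2, h10_lev3("parentid_3") === h10_lev2("node2id"), "inner")
  .drop(h10_lev2("node2id"))  // remove the duplicate right-side key column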
3 Summary of common operations
import org.apache.spark.sql.SaveMode
val r88 = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/test")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "test.ccc")
  .option("user", "xxx")
  .option("password", "xxx")
  .load()
import org.apache.spark.sql.functions.desc
r88.select("itemid", "dl_count_30")
  .groupBy("itemid")  // groupBy takes column names directly, not a bracketed list
  .sum("dl_count_30")
  .orderBy(desc("sum(dl_count_30)"))
  .withColumnRenamed("sum(dl_count_30)", "playcount18Q1")  // the generated column is sum(dl_count_30)
  .limit(500000)
  .write.mode(SaveMode.Overwrite)
  .save("/home/gmd/tmp/playcount18Q1")
r88.write.mode(SaveMode.Overwrite).format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/test")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "test.cms20190618")
  .option("user", "xx")
  .option("password", "xxx")
  .save()
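For a large table, a single-connection JDBC read like the one above can be slow; Spark's JDBC source can read in parallel when given a numeric partition column and its bounds. A hedged sketch (the id column, the bounds, and the partition count are assumptions, not taken from the original table):

val r88_par = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/test")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "test.ccc")
  .option("user", "xxx")
  .option("password", "xxx")
  .option("partitionColumn", "id")   // assumed numeric primary key
  .option("lowerBound", "1")
  .option("upperBound", "1000000")
  .option("numPartitions", "8")      // 8 parallel read connections
  .load()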
4 Aggregation
import org.apache.spark.sql.functions.sum
val tempos2 = tempos.join(r91, Seq("relate_itemid"), "inner")
  .agg(sum("counts") as "ultimate_top_play_counts")
  .first().getAs[Long]("ultimate_top_play_counts")
val tempos2_playcounts2 = tempos2.toDouble  // convert the Long directly, no string round-trip needed
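agg can also compute several aggregates in one pass rather than one sum at a time. A minimal sketch, assuming the same tempos/r91 join as above (the n_rows alias is hypothetical):

import org.apache.spark.sql.functions.{sum, count}
val stats = tempos.join(r91, Seq("relate_itemid"), "inner")
  .agg(
    sum("counts").as("ultimate_top_play_counts"),
    count("relate_itemid").as("n_rows")  // row count computed in the same pass
  )
stats.show()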
5 When writing text output, the DataFrame must contain exactly one column, and that column must be of string type
import org.apache.spark.sql.SaveMode
a.filter("appid = 2980").selectExpr("cast(userid as string)").write.mode(SaveMode.Overwrite).text("/home/gmd/userid2980.txt")  // the path becomes a directory of part files
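If more than one column needs to go into a text file, the columns must first be collapsed into a single string column, for example with concat_ws. A sketch under that assumption (the tab separator, the output path, and the column choice are illustrative):

import org.apache.spark.sql.functions.{concat_ws, col}
a.filter("appid = 2980")
  .select(concat_ws("\t", col("userid"), col("appid")).as("line"))  // collapse to one string column
  .write.mode(SaveMode.Overwrite)
  .text("/home/gmd/userid2980_with_appid.txt")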