Spark DataFrame Study Notes

import org.apache.spark.sql.{Row, SaveMode, SQLContext}
import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.functions.{col, desc, monotonically_increasing_id, regexp_replace}
import org.apache.spark.storage.StorageLevel

val sqlContext = new HiveContext(sc)
// implicits can only be imported after sqlContext exists
import sqlContext.implicits._

// Run a SQL statement directly (note: the date patterns must be 'yyyyMMdd' / 'yyyy-MM-dd';
// lowercase 'mm' means minutes in Java date formats)
val test_data = sqlContext.sql("select load_id, visits_id, cookie, first_visit_dt, if(event_id = '' or event_id is null, 'event_id_null', event_id) as event_id, datediff(from_unixtime(unix_timestamp(dt, 'yyyyMMdd'), 'yyyy-MM-dd'), sdate) as diff_date from ad.aaa_use_leads_log where dt=20190301 and series_flag='3691' limit 5000")
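// The same query can also be written with the DataFrame API; a rough sketch,
// assuming the table and column names from the SQL above:
import org.apache.spark.sql.functions.{when, datediff, from_unixtime, unix_timestamp}
val test_data_api = sqlContext.table("ad.aaa_use_leads_log")
  .where("dt = 20190301 and series_flag = '3691'")
  .withColumn("event_id",
    when(col("event_id").isNull || col("event_id") === "", "event_id_null").otherwise(col("event_id")))
  .withColumn("diff_date",
    datediff(from_unixtime(unix_timestamp(col("dt"), "yyyyMMdd"), "yyyy-MM-dd"), col("sdate")))
  .select("load_id", "visits_id", "cookie", "first_visit_dt", "event_id", "diff_date")
  .limit(5000)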
// Print sample data with collect().foreach(println) or show()
test_data.collect().foreach(println)
test_data.show()
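// show() truncates long cells to 20 characters by default; pass a row count and a
// truncate flag to see full values:
test_data.show(50, false)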
// Aggregations: count, sum, count(distinct), and ordering
// (train_data is assumed to be a DataFrame loaded the same way as test_data above)
val pv_number = train_data.groupBy("cookie", "is_leads").count()
pv_number.show()
val duration = train_data.groupBy("cookie", "is_leads").sum("duration_int")
duration.show()
val visits_id = train_data.select("cookie", "is_leads", "visits_id").distinct().groupBy("cookie", "is_leads").count()
visits_id.show()
val event_data = train_data.groupBy("event_id").count().orderBy(desc("count"))
event_data.show()
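// The distinct-then-count pattern above can also use the countDistinct aggregate;
// a sketch assuming the same train_data columns:
import org.apache.spark.sql.functions.countDistinct
train_data.groupBy("cookie", "is_leads").agg(countDistinct("visits_id").as("visits_cnt")).show()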
// Join and where
val tmp_te = test_data.select("cookie", "event_id")
val tmp_te_1 = tmp_te.join(event_data, Seq("event_id"), "left").where("count is not null")
// Add a sequence column with unique ids to a DataFrame
val event_data_new = event_data.select("event_id").withColumn("cookie", monotonically_increasing_id())
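// Note: monotonically_increasing_id() is unique but NOT consecutive across partitions.
// For consecutive ids, one option (a sketch) is zipWithIndex on the underlying RDD:
import org.apache.spark.sql.types.LongType
val base = event_data.select("event_id")
val event_data_idx = sqlContext.createDataFrame(
  base.rdd.zipWithIndex.map { case (row, idx) => Row.fromSeq(row.toSeq :+ idx) },
  StructType(base.schema.fields :+ StructField("cookie", LongType, nullable = false)))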
// Trick: nested regexp_replace rewrites the whole column value (inner call blanks
// the string, outer call replaces the empty result with a constant)
val tmp_te_2 = tmp_te.join(event_data, Seq("event_id"), "left").where("count is null").withColumn("event_id", regexp_replace(regexp_replace(col("event_id"), ".*", ""), "", "event_others"))
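// The left join + "count is null" filter is really an anti-join; Spark 2.0+ supports
// it directly as a join type, and lit() is a simpler way to set a constant column
// (a sketch; not available on a 1.x HiveContext):
import org.apache.spark.sql.functions.lit
val tmp_te_2_alt = tmp_te.join(event_data, Seq("event_id"), "left_anti").withColumn("event_id", lit("event_others"))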
// Union
val tmp_te_all = tmp_te_1.unionAll(tmp_te_2)
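// unionAll matches columns by position, not by name. In Spark 2.x it is deprecated in
// favor of union, and unionByName (2.3+) aligns columns by name:
// val tmp_te_all = tmp_te_1.union(tmp_te_2)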
// Group, pivot, and sum (sum() with no arguments aggregates all numeric columns)
val tmp_all = event_data_new.unionAll(tmp_te_all.select("event_id", "cookie"))
tmp_all.groupBy("cookie").pivot("event_id").sum()
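// If the pivot values are known ahead of time, passing them avoids an extra scan to
// compute the distinct event_ids (the values below are hypothetical):
tmp_all.groupBy("cookie").pivot("event_id", Seq("event_click", "event_view", "event_others")).sum()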
// Select the needed columns and save (text() requires a single string column)
cookie_dict_tr.select("is_leads").write.mode(SaveMode.Append).text("/user/ad_tj/aaa/leads_project/aaa")
// Convert the DataFrame to an RDD, strip the surrounding brackets, then save
cookie_dict_tr.rdd.map(_.toString().replaceAll("\\[", "").replaceAll("\\]", "")).saveAsTextFile("/user/ad_tj/aaa/leads_project/aaa5")
// Repartition to reduce small files, and join columns with a custom delimiter:
// cookie_dict_tr.repartition(100).rdd.map(_.mkString("\t")).saveAsTextFile("/data/tmp/dev/20190603/bds/bds_dev_app_installation_list_w")
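// In Spark 2.x the DataFrameWriter can produce the same tab-delimited output without
// the RDD round-trip (a sketch; the output path is hypothetical):
// cookie_dict_tr.repartition(100).write.option("sep", "\t").mode(SaveMode.Overwrite).csv("/user/ad_tj/aaa/leads_project/aaa_tsv")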
// Replace all nulls in the DataFrame with 0 (affects numeric columns only; returns a new DataFrame)
val test_data_filled = test_data.na.fill(0)
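// na.fill also accepts a Map for per-column defaults (the column names here are
// examples taken from the query above):
val test_data_filled2 = test_data.na.fill(Map("diff_date" -> 0, "event_id" -> "event_id_null"))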