Iterating over a DataFrame in Spark

This post shows two ways to traverse a Spark DataFrame by dropping down to its underlying RDD: reading each Row field by name with getAs, and reading the fields by position.

  import java.time.LocalDate
  import scala.collection.mutable.ArrayBuffer
  import scala.util.control.Breaks._
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("p2") //.master("local")
      .enableHiveSupport().getOrCreate()
    import spark.implicits._
    val nowdate = LocalDate.now()
    // yesterday's date, formatted as yyyyMMdd
    val statis_day = nowdate.minusDays(1).toString.split("-").mkString("")
    // one day earlier; date_before is a user-defined helper (sketched below)
    val pre01_day = date_before(statis_day, "1")
    //val pre01_day=args(0)
    // earlier window boundary (named pre31_day, though the code subtracts 7 days)
    val pre31_day = date_before(pre01_day, "7")
    // Per-category approximate quantiles of qvalue over the trailing window;
    // non-positive quantiles fall back to the string "1.0".
    val df = spark.sql(
      s"""
         |select idea_third_categ,
         |case when percentile_approx(qvalue, 0.2)>0 then percentile_approx(qvalue, 0.2)
         | else "1.0" end as x1,
         |case when percentile_approx(qvalue, 0.4)>0 then percentile_approx(qvalue, 0.4)
         | else "1.0" end as  x2,
         |case when percentile_approx(qvalue, 0.6)>0 then percentile_approx(qvalue, 0.6)
         | else "1.0" end as x3,
         |case when percentile_approx(qvalue, 0.8)>0 then percentile_approx(qvalue, 0.8)
         | else "1.0" end as x4,
         | "1.0" as x5
         |from aps.cpc_pv_log
         |where statis_date<=${pre01_day} and
         |statis_date>${pre31_day}
         |group by idea_third_categ
      """.stripMargin).rdd
    //df.createOrReplaceTempView("df")

    val resultDF = spark.createDataFrame(df.map(item => {
      val s1 = item.getAs[String]("idea_third_categ")
      val ab = new ArrayBuffer[String]()
      // read the five quantile columns by name; avoid shadowing the List type
      val quantiles = new Array[String](5)
      quantiles(0) = item.getAs[String]("x1")
      quantiles(1) = item.getAs[String]("x2")
      quantiles(2) = item.getAs[String]("x3")
      quantiles(3) = item.getAs[String]("x4")
      quantiles(4) = item.getAs[String]("x5")
      for (i <- 0 to 3) {
        // breakable around the loop body acts as a "continue":
        // skip fallback values and values equal to the next quantile
        breakable {
          if (quantiles(i) == "1.0" || quantiles(i + 1) == quantiles(i)) {
            break()
          }
          ab += quantiles(i)
        }
      }
      val s2 = ab.mkString("#")
      (s1, s2)
    })).toDF("Card3_Id", "quantile")

    resultDF.show(20)
    resultDF.createOrReplaceTempView("dframe")
    // INSERT OVERWRITE executes eagerly and returns an empty DataFrame,
    // so there is nothing to repartition afterwards
    spark.sql(
      s"""insert overwrite table predict.pctr_quantile partition (statis_date='${statis_day}')
         |select Card3_Id,quantile
         |from dframe
       """.stripMargin)
    println("dataframe imported successfully")
  }
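
The date_before helper called at the top of main is not defined in the post. A minimal sketch, assuming it takes a yyyyMMdd date string plus a day count passed as a string and returns the date that many days earlier:

  // Hypothetical reconstruction; the author's actual helper is not shown.
  def date_before(day: String, days: String): String = {
    import java.time.format.DateTimeFormatter
    val fmt = DateTimeFormatter.ofPattern("yyyyMMdd")
    // parse the yyyyMMdd string, subtract the given number of days, re-format
    LocalDate.parse(day, fmt).minusDays(days.toLong).format(fmt)
  }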

An equivalent version maps the RDD directly and reads the Row fields by position instead of by name:

   val df = spark.sql(
      s"""
         |select idea_third_categ,
         |case when percentile_approx(qvalue, 0.2)>0 then percentile_approx(qvalue, 0.2)
         | else "1.0" end as x1,
         |case when percentile_approx(qvalue, 0.4)>0 then percentile_approx(qvalue, 0.4)
         | else "1.0" end as  x2,
         |case when percentile_approx(qvalue, 0.6)>0 then percentile_approx(qvalue, 0.6)
         | else "1.0" end as x3,
         |case when percentile_approx(qvalue, 0.8)>0 then percentile_approx(qvalue, 0.8)
         | else "1.0" end as x4,
         | "1.0" as x5
         |from aps.cpc_pv_log
         |where statis_date<=${pre01_day} and
         |statis_date>${pre31_day}
         |group by idea_third_categ
      """.stripMargin).rdd.map(x=>{
      val s1 = x(0).toString
      val ab = new ArrayBuffer[String]()
      // read the five quantile columns by position; avoid shadowing the List type
      val quantiles = new Array[String](5)
      quantiles(0) = x(1).toString
      quantiles(1) = x(2).toString
      quantiles(2) = x(3).toString
      quantiles(3) = x(4).toString
      quantiles(4) = x(5).toString
      for (i <- 0 to 3) {
        // breakable acts as a "continue": skip fallback values and duplicates
        breakable {
          if (quantiles(i) == "1.0" || quantiles(i + 1) == quantiles(i)) {
            break()
          }
          ab += quantiles(i)
        }
      }
      val s2 = ab.mkString("#")
      (s1, s2)
    }).toDF("Card3_Id", "quantile")
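
For completeness: since the grouped result here is small (one row per category), it can also be traversed on the driver after collect. A minimal sketch, using the resultDF built in the first version:

    // Driver-side traversal: only safe when the result fits in driver memory
    resultDF.collect().foreach { row =>
      val categ = row.getAs[String]("Card3_Id")
      val quantile = row.getAs[String]("quantile")
      println(s"$categ -> $quantile")
    }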
