以下方法使用Scala, df类型为DataFrame
常用的包:
import org.apache.spark.sql.functions._
import spark.implicits._
1. 新增一列
// Add a new column named "new col", filled with nulls.
// Fixed: removed the stray trailing comma that made the original line invalid Scala.
// NOTE: lit(null) produces NullType; use lit(null).cast("string") if a concrete type is needed.
df = df.withColumn("new col", lit(null))
2. 列名重新命名
// Rename a column: withColumnRenamed(existingName, newName); per Spark docs this is a no-op if the old name is absent.
df = df.withColumnRenamed("旧列名", "新列名")
3. 根据条件进行判断来对Dataframe中某列的值进行填充
// Conditional fill: when(predicate, value-if-true).otherwise(value-if-false) overwrites the target column.
df = df.withColumn("需要进行填充的列名", when(条件判断, $"用于填充的列名")
.otherwise($"用于填充的列名"))
4. 删除一列
// Drop one or more columns by name (names that don't exist are silently ignored).
df = df.drop("列a", "列B")
5. 两个dataframe进行关联
// Left-join df1 with df2 on a shared column; the Seq(...) overload keeps a single copy of the join column.
// Fixed: Seq must be capitalized — lowercase `seq` is not a Scala/Spark identifier.
df3 = df1.join(df2, Seq("关联的列"), "left")
6. 分组之后计算最小值
// Per-group minimum of diff_Date. Uses the type-checked min("col") aggregate function
// instead of the stringly (column -> aggName) tuple form; the result column is still
// named "min(diff_Date)" in both variants.
df = df.groupBy("order_no", "seq_no", "product_code").agg(min("diff_Date"))
7. 筛选符合条件的数据
// Keep only rows where cancel_dt is not null.
df = df.where($"cancel_dt".isNotNull)
8. 选出指定列
// Project only the listed column(s).
df = df.select("cancel_dt")
9. udf
// Apply a UDF that maps a province code to its full name; unknown codes get the fallback label.
// `addressClean` and `provinceFullMap` are defined elsewhere in the project.
dfNull = dfNull.withColumn("province_fill", addressClean.proCodeToProName(col("province_fill")))
val proCodeToProName: UserDefinedFunction = udf((provinceCode: String) =>
  // Direct expression instead of the original needless `var cleanProvince` temporary.
  provinceFullMap.getOrElse(provinceCode, "未匹配省市")
)
10. case class
// Case class backing the typed Dataset; it must declare every field the mapping reads.
// Fixed: added the missing req_id field (the map below reads lines.req_id, which would
// not compile against the original single-field class), made the fields immutable
// (case-class convention), and changed the never-reassigned DataFrame to a val.
case class ChargeByPolicyPremData(req_id: String, order_no: String)
val reqPolicyBaseDataDs = reqPolicyDf.as[ChargeByPolicyPremData]
val reqPolicyBaseDataDf: DataFrame = reqPolicyBaseDataDs.map(lines => (
  lines.req_id,
  lines.order_no
)).toDF("req_id", "order_no")
11. 开窗函数
把需要比较的字段进行拼接
// Window per partition key "A", newest create_at first.
// Renamed WindowSpec -> windowSpec: vals use lowerCamelCase.
val windowSpec = Window.partitionBy("A").orderBy($"create_at".desc)
// Fold the fields to compare into one base64 key so rows can be compared as a single value.
dfProcessed = dfProcessed.withColumn("concatString", base64(concat($"B")))
// lead(col, 1, "99") yields the NEXT row's key ("99" past the last row of the partition);
// keep only rows that differ from their successor, then drop the helper columns.
dfProcessed = dfProcessed.withColumn("concatString_next", lead($"concatString", 1, "99").over(windowSpec))
.where($"concatString_next" =!= $"concatString")
.drop("concatString", "concatString_next")
// Keep only the newest row per partition key "A" (row_number() == 1 ordered by create_at desc).
// Fixed: corrected the typo `dfProceesed`; named the result `dfDeduped` (distinct from the
// `dfProcessed` used in the previous example) and made it a val since it is never reassigned.
val dfDeduped = df.withColumn("rank", row_number() over Window.partitionBy("A")
.orderBy($"create_at".desc))
.where($"rank" === 1)
.drop("rank")