// 导入所需的包
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
// 窗口函数
import org.apache.spark.sql.expressions.Window
运算方法
用 DataFrame API 实现行列转换
情况一
数据:
user,friends
3197468391,1346449342 3873244116 4226080662 1222907620 547730952 1052032722 2138119761 417295859 1872292079 984265443 2535686531 3703382700
代码实现:
// One output row per (user, friend): rename the key column, split the
// space-separated friends string into an array, and explode it.
val df = dfUserFriends
  .withColumnRenamed("user", "user_id")
  .withColumn("friends_id", explode(split($"friends", " ")))
  .drop("friends")
情况二
数据:
+----------+--------------------+--------------------+--------------------+--------------------+
| event| yes| maybe| invited| no|
+----------+--------------------+--------------------+--------------------+--------------------+
|1159822043|1975964455 252302...|2733420590 517546...|1723091036 379587...|3575574655 107729...|
| 686467261|2394228942 268611...|1498184352 645689...|1788073374 733302...| null|
|1186208412| null|3320380166 381079...|1379121209 440668682|1728988561 295072...|
+----------+--------------------+--------------------+--------------------+--------------------+
转成
+----------+----------+-----------+
| event_id| user_id|attend_type|
+----------+----------+-----------+
|1159822043|1975964455| yes|
|1159822043| 252302513| yes|
|1159822043|4226086795| yes|
|1159822043|3805886383| yes|
|1159822043|1420484491| yes|
+----------+----------+-----------+
代码实现:
法一:
// Method 1: explode each attendance column ("yes"/"maybe"/"invited"/"no")
// into (event_id, user_id, attend_type) rows, then union the four results.
// NOTE: union matches columns by position, so every branch must keep the
// same column order (event_id, user_id, attend_type) after the drop.
// NOTE(review): rows where a column is null produce no output rows, because
// split(null) yields null and explode drops null arrays — confirm that is
// the intended handling of the nulls visible in the sample data.
val dfYes = dfEventAttendess.select("event", "yes").
  withColumnRenamed("event", "event_id").
  withColumn("user_id", explode(split($"yes", " "))).
  withColumn("attend_type", lit("yes")).
  drop("yes") // was drop($"yes"): use the string form for consistency with the other three branches
val dfMayBe = dfEventAttendess.select("event", "maybe").
  withColumnRenamed("event", "event_id").
  withColumn("user_id", explode(split($"maybe", " "))).
  withColumn("attend_type", lit("maybe")).
  drop("maybe")
val dfInvited = dfEventAttendess.select("event", "invited").
  withColumnRenamed("event", "event_id").
  withColumn("user_id", explode(split($"invited", " "))).
  withColumn("attend_type", lit("invited")).
  drop("invited")
val dfNo = dfEventAttendess.select("event", "no").
  withColumnRenamed("event", "event_id").
  withColumn("user_id", explode(split($"no", " "))).
  withColumn("attend_type", lit("no")).
  drop("no")
val dfResult = dfYes.union(dfMayBe).union(dfInvited).union(dfNo)
法二:
-- 使用 Seq 构建 Seq("yes","maybe","invited","no"),用 map 对序列中的每个列名依次处理,
-- 再对处理结果依次合并(reduce + union)
// Method 2: drive the same explode over the four attendance columns with a
// Seq + map, producing one (event_id, user_id, attend_type) DataFrame per
// column, then fold them together with union.
val dfFinal: Seq[DataFrame] = Seq("yes", "maybe", "invited", "no").map { attendType =>
  dfEventAttendess
    .select($"event".as("event_id"), col(attendType))
    .withColumn("user_id", explode(split(col(attendType), " ")))
    .withColumn("attend_type", lit(attendType))
    .drop(col(attendType))
}
val dfFinal2: DataFrame = dfFinal.reduce(_ union _)
查看时间格式是否有效
数据:
+---------+----------+------------------------+
|event_id |user_id |start_time |
+---------+----------+------------------------+
|684921758|3647864012|2012-10-31T00:00:00.001Z|
|244999119|3476440521|2012-11-03T00:00:00.001Z|
+---------+----------+------------------------+
代码实现:
#正则表达式
// Count events whose start_time matches the ISO-8601 shape
// "yyyy-MM-ddTHH:mm:ss..." (note the literal 'T' separator, as in the sample data).
spark.sql("""
select count(*) from events
where start_time regexp '^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.*'
""").show()
// DataFrame variant: count rows whose timestamp matches "yyyy-MM-dd HH:mm:ss...".
// NOTE(review): this pattern expects whitespace (\\s) between date and time,
// while the events sample above uses a 'T' separator — presumably this targets
// a different column/format (e.g. the train timestamp below); confirm.
dataFrame.filter($"timestamp".rlike("^\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*")).count()
查询最新的时间
数据
+----------+----------+-------+--------------------+----------+--------------+
| user| event|invited| timestamp|interested|not_interested|
+----------+----------+-------+--------------------+----------+--------------+
|1895679477|2019748690| 0|2012-11-13 07:59:...| 0| 0|
|1895679477|2019748690| 0|2012-11-15 19:36:...| 0| 0|
|1895679477|2019748690| 0|2012-11-12 12:22:...| 0| 0|
+----------+----------+-------+--------------------+----------+--------------+
代码实现
#方法一:
// Keep only the most recent record per (user, event): rank rows inside each
// pair by timestamp descending, keep rank 1, then discard the helper column.
val latest = Window.partitionBy($"user", $"event").orderBy($"timestamp".desc)
dfTrain
  .withColumn("rn", row_number().over(latest))
  .filter($"rn" === lit(1))
  .drop("rn")
#方法二:
// Same deduplication expressed as a SQL window expression inside selectExpr:
// rank rows per (user, event) by timestamp descending and keep the first.
dfTrain
  .selectExpr("*","row_number()over(partition by user,event order by timestamp desc)rn")
  .filter($"rn" === 1)
  .show()
结果:
+---------+----------+-------+--------------------+----------+--------------+
| user| event|invited| timestamp|interested|not_interested|
+---------+----------+-------+--------------------+----------+--------------+
| 48380986|4125754193| 0|2012-10-19 17:27:...| 0| 0|
|123290209| 463131402| 0|2012-10-26 18:04:...| 0| 0|
|139210489|3911341942| 0|2012-11-03 19:02:...| 0| 0|
|156079457| 406835364| 0|2012-10-26 05:23:...| 0| 0|
|156603823|2073609284| 0|2012-10-30 14:15:...| 0| 0|
+---------+----------+-------+--------------------+----------+--------------+