# Keep only the most recent row (largest dt) for each username.
#
# Do NOT dedupe with
#     orderBy("dt", ascending=False).drop_duplicates(subset=["username"])
# drop_duplicates does not promise to keep the first row of a preceding sort,
# so an arbitrary (possibly older) row per username may be the one retained.
#
# Instead, number each username's rows by dt descending and keep number 1.
# row_number() is used rather than rank(): rank() assigns 1 to every row tied
# on the latest dt, which would leave several rows for one username.
# NOTE(review): when several rows share the latest dt, which of them is kept
# is arbitrary; add a tie-breaker column to the window ordering if that matters.
rank_window = Window.partitionBy("username").orderBy(F.col("dt").desc())
user_df = (
    user_df.withColumn("rank", F.row_number().over(rank_window))
    .where(F.col("rank") == 1)
    .drop("rank")
)
# cache() only marks the DataFrame for caching; count() is the action that
# actually computes and materializes it.
user_df.cache().count()