from pyspark.sql import Row
from pyspark.sql import functions as F
def main(sparkSession):
    """Demonstrate PySpark dropDuplicates behavior.

    Shows three variants on a small demo DataFrame:
      1. ``dropDuplicates()`` — removes fully identical rows only.
      2. ``dropDuplicates(['name', 'time'])`` — dedupes on a column subset.
      3. ``orderBy(...desc).dropDuplicates(['name'])`` — attempts to keep
         the latest row per name.

    :param sparkSession: an active ``pyspark.sql.SparkSession``.
    """
    # BUG FIX: the original body referenced the undefined global ``sc``
    # (NameError at runtime). Derive the SparkContext from the session
    # that is passed in instead.
    rows = [
        Row(name='A', time='20200221', age='18'),
        Row(name='A', time='20200221', age='18'),
        Row(name='A', time='20200223', age='28'),
        Row(name='A', time='20200223', age='30'),
        Row(name='A', time='20200330', age='30'),
        Row(name='B', time='20200225', age='21'),
        Row(name='B', time='20200226', age='21'),
        Row(name='C', time='20200228', age='21'),
        Row(name='C', time='20200226', age='21'),
    ]
    df = sparkSession.sparkContext.parallelize(rows).toDF()

    # Full-row dedup: only the rows that are identical in every column
    # collapse (the two ('A', '20200221', '18') rows).
    df.dropDuplicates().show()

    # Subset dedup: one row kept per (name, time) pair; which 'age'
    # survives for ('A', '20200223') is arbitrary.
    ddf = df.dropDuplicates(['name', 'time'])
    ddf.show()

    # NOTE(review): sorting before dropDuplicates does NOT guarantee the
    # first (latest) row per name is the one kept — dropDuplicates makes
    # no ordering promise after a shuffle. For a reliable "latest per
    # group", use a Window partitioned by name ordered by time desc with
    # row_number() == 1. Kept as-is here to preserve the demo's behavior.
    ddf0 = ddf.orderBy(F.col('time').desc())
    ddf1 = ddf0.dropDuplicates(['name'])
    ddf1.show()
from pyspark.sql import Row
from pyspark.sql import functions as F
def main(sparkSession):
df = sc.parallelize([\
Row(name='A',time='20200221',age='18'),\
Row(name='A',time='20200221',age='18'),\
Row(name='A',time='20200223',age='28'),\
Row(name='A',time='20200223',age='30'),\
Row(name='A',time='20200330',age='30'),\
Row(name='B',time='20200225',age='21'),\
Row(name='B',time='20200226',age='21'),\
Row(name='C',time='20200228',age='21'),\
Row(name='C',time='20200226',age='21')\
]).toDF()
df.dropDuplicates().show()
[2021-04-18 09:01:59.459] [INFO] - +---+----+--------+
|age|name| time|
+---+----+--------+
| 30| A|20200330|
| 28| A|20200223|
| 21| B|20200226|
| 21| B|20200225|
| 21| C|20200228|
| 21| C|20200226|
| 30| A|20200223|
| 18| A|20200221|
+---+----+--------+
ddf=df.dropDuplicates(['name','time'])
ddf.show()
[2021-04-18 09:02:00.507] [INFO] - +---+----+--------+
|age|name| time|
+---+----+--------+
| 21| C|20200226|
| 21| B|20200226|
| 21| C|20200228|
| 18| A|20200221|
| 30| A|20200330|
| 21| B|20200225|
| 28| A|20200223|
+---+----+--------+
ddf0=ddf.orderBy(F.col('time').desc())
ddf1= ddf0.dropDuplicates(['name'])
ddf1.show()
[2021-04-18 09:02:04.534] [INFO] - +---+----+--------+
|age|name| time|
+---+----+--------+
| 21| B|20200226|
| 21| C|20200228|
| 30| A|20200330|
+---+----+--------+