"""Demonstrate handling missing (null) data in PySpark: dropping and filling."""
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean

spark = SparkSession.builder.appName('aggs').getOrCreate()

# Read the CSV, letting Spark infer column types and use the first row as header.
df = spark.read.csv('sales_info.csv', inferSchema=True, header=True)
df.printSchema()
df.show()

# --- Drop missing data ---
# Default: drop a row if ANY of its values is null.
df.na.drop().show()
# Keep only rows with at least 2 non-null values.
df.na.drop(thresh=2).show()
# Drop only rows where ALL values are null.
df.na.drop(how='all').show()
# Drop rows where any value is null (explicit form of the default).
df.na.drop(how='any').show()
# Drop rows based on a subset of columns: only null 'Sales' triggers a drop.
df.na.drop(subset=['Sales']).show()

# --- Fill missing data ---
# A string fill value only replaces nulls in string-typed columns.
df.na.fill('FILL VALUE').show()
# A numeric fill value only replaces nulls in numeric-typed columns.
df.na.fill(0).show()
# Restrict the fill to a subset of columns.
df.na.fill('No Name', subset=['Name']).show()

# Fill null 'Sales' values with the column mean.
mean_val = df.select(mean(df['Sales'])).collect()
mean_sales = mean_val[0][0]  # extract the scalar from the single Row result
df.na.fill(mean_sales, ['Sales']).show()