# Notes: recently written PySpark EDA (exploratory data analysis) code, as follows:
# Container for per-column EDA results.
# NOTE: namedtuple field names must be valid identifiers that do NOT start
# with an underscore, and must be unique — the original field string listed
# 'n_unique' twice and used '_min', '_25', '_50', '_75', '_max', all of which
# make namedtuple() raise ValueError at definition time. The quantile fields
# are therefore named min_val/q25/q50/q75/max_val.
edaStats = namedtuple(
    'edaStats',
    'is_numeric n_unique n_missing min_val q25 q50 q75 max_val n_freq mean')
def eda_stats(dataframe, field_name, field_type):
    """Compute basic EDA statistics for one column of a Spark DataFrame.

    Parameters
    ----------
    dataframe : pyspark.sql.DataFrame
        The full input DataFrame (used to count total rows / missing values).
    field_name : str
        Name of the column to profile.
    field_type : str
        Spark type name of the column; only used to decide numeric-ness
        via ``is_numeric_type`` (defined elsewhere in this project).

    Returns
    -------
    edaStats
        Named tuple of counts, rank-based quantiles, mode and mean.
        Quantile/mode/mean fields are None when the column is all-NULL;
        ``mean`` is also None for non-numeric columns.
    """
    is_numeric = is_numeric_type(field_type)
    # All optional stats default to None (all-NULL column leaves them unset).
    n_unique = _min = _25 = _50 = _75 = _max = n_freq = _mean = None

    # Total row count including NULLs — the original referenced an undefined
    # global `tot_cnt`; compute it from the DataFrame that was passed in.
    tot_cnt = dataframe.count()
    colDF = dataframe.select(field_name).where(col(field_name).isNotNull())
    col_cnt = colDF.cache().count()
    n_missing = tot_cnt - col_cnt

    if col_cnt > 0:
        # Rank rows 1..col_cnt by value so quantiles can be read by index.
        # monotonically_increasing_id() preserves the sort order but is not
        # dense; row_number() over it yields consecutive ranks.
        sortedDF = colDF.sort(asc(field_name)).withColumn(
            "idx", monotonically_increasing_id())
        windowSpec = W.orderBy("idx")
        sortedDF = sortedDF.withColumn(
            "idx", row_number().over(windowSpec)).cache()

        _min = sortedDF.where('idx = 1').select(field_name).first()[0]
        _max = sortedDF.where(f'idx = {col_cnt}').select(field_name).first()[0]
        # max(1, ...) keeps the rank in range: int(col_cnt * 0.25) is 0 for
        # col_cnt < 4, which would match no row and crash on first()[0].
        _25 = sortedDF.where(
            f'idx = {max(1, int(col_cnt * 0.25))}').select(field_name).first()[0]
        _50 = sortedDF.where(
            f'idx = {max(1, int(col_cnt * 0.5))}').select(field_name).first()[0]
        _75 = sortedDF.where(
            f'idx = {max(1, int(col_cnt * 0.75))}').select(field_name).first()[0]

        # distinct() must run on the value column alone: sortedDF carries the
        # unique 'idx' column, so distinct() there would count every row.
        n_unique = colDF.distinct().count()

        if n_unique < 200:
            # Mode (most frequent value). The alias and the sort column must
            # match — the original aliased '_count' but sorted 'count_'.
            mode_row = (colDF.groupBy(field_name)
                        .agg(count(lit(1)).alias('_count'))
                        .sort(desc('_count'))
                        .first())
            n_freq = mode_row[0]

        if is_numeric:
            # Use the DataFrame argument, not a global `df`. The local is
            # named `_mean` so it does not shadow pyspark's `mean` aggregate
            # (the original's `mean = None` made `mean(...)` a TypeError).
            _mean = colDF.select(mean(col(field_name))).first()[0]

        sortedDF.unpersist()

    colDF.unpersist()
    return edaStats(is_numeric, n_unique, n_missing,
                    _min, _25, _50, _75, _max, n_freq, _mean)
# Profile every column of `df`: one edaStats record per (name, type) pair
# reported by dtypes.
edaStats_list = [
    eda_stats(df, field_name, field_type)
    for field_name, field_type in dtypes
]