我用 perfplot 测试了多种方法。在较大的数据帧中,最快的方法是按列循环,并通过 `Series.dropna` 删除缺失值或 None,或者使用 `Series.notna` 配合布尔索引。
在较小的数据帧中,最快的是字典推导式:它利用 NaN != NaN 的技巧检测缺失值,同时也检测 None。
# Seed NumPy's global RNG so the np.random.choice draws in make_df are
# reproducible from run to run.
np.random.seed(2020)
import perfplot
def comp_notnull(df1):
    """Convert a DataFrame to nested dicts, skipping cells that are null.

    The frame is first converted with ``DataFrame.to_dict()`` and each cell is
    then tested individually with ``pd.notnull``.

    Args:
        df1: DataFrame to convert.

    Returns:
        A dict of the form ``{column: {index_label: value}}`` holding only the
        cells for which ``pd.notnull(value)`` is true.
    """
    return {
        column: {idx: value for idx, value in cells.items() if pd.notnull(value)}
        for column, cells in df1.to_dict().items()
    }
def comp_NaNnotNaN_None(df1):
    """Convert a DataFrame to nested dicts, skipping NaN and ``None`` cells.

    Works on the plain Python objects produced by ``DataFrame.to_dict()`` and
    filters them without calling into pandas per cell.

    Args:
        df1: DataFrame to convert.

    Returns:
        A dict of the form ``{column: {index_label: value}}`` holding only the
        cells that are equal to themselves and are not ``None``.
    """
    return {
        column: {
            idx: value
            for idx, value in cells.items()
            # NaN is the value that is not equal to itself, so ``value == value``
            # rejects it; ``None == None`` is true, hence the explicit identity
            # test for None.
            if value == value and value is not None
        }
        for column, cells in df1.to_dict().items()
    }
def comp_dropna(df1):
    """Convert a DataFrame to nested dicts, dropping missing values per column.

    Iterates over the columns and calls ``Series.dropna()`` on each one before
    converting it with ``Series.to_dict()``.

    Args:
        df1: DataFrame to convert.

    Returns:
        A dict of the form ``{column: {index_label: value}}`` built from each
        column after ``dropna()``.
    """
    return {column: series.dropna().to_dict() for column, series in df1.items()}
def comp_bool_indexing(df1):
    """Convert a DataFrame to nested dicts, filtering columns by a notna mask.

    Iterates over the columns, keeps the entries selected by the boolean mask
    ``Series.notna()``, and converts the result with ``Series.to_dict()``.

    Args:
        df1: DataFrame to convert.

    Returns:
        A dict of the form ``{column: {index_label: value}}`` built from the
        entries of each column where ``notna()`` is true.
    """
    return {
        column: series[series.notna()].to_dict() for column, series in df1.items()
    }
def make_df(n):
    """Build a random benchmark DataFrame with ``n`` rows and columns A-E.

    Every cell is drawn with ``np.random.choice`` from ``[1, 2, np.nan]``, so
    the frame contains missing values for the kernels to filter out. The draw
    uses NumPy's global random state.

    Args:
        n: Number of rows to generate.

    Returns:
        A DataFrame of shape ``(n, 5)`` with columns ``'A'`` through ``'E'``.
    """
    values = np.random.choice([1, 2, np.nan], size=(n, 5))
    return pd.DataFrame(values, columns=list('ABCDE'))
# Time every kernel on frames of increasing length and plot the results.
perfplot.show(
    # make_df is called with each value of n_range to build the input frame.
    setup=make_df,
    n_range=[10**exponent for exponent in range(1, 7)],  # 10 rows up to 10**6
    kernels=[comp_dropna, comp_bool_indexing, comp_notnull, comp_NaNnotNaN_None],
    # NOTE(review): presumably turns off perfplot's comparison of the kernels'
    # return values -- confirm against the perfplot documentation.
    equality_check=False,
    xlabel='len(df)',
    logx=True,
    logy=True,
)