# coding: utf-8
"""Basic descriptive statistics (EDA) for a numeric tabular dataset.

For every column the summary reports: unique/zero counts, mean, median,
mode and its frequency, min/max, selected percentiles, the five most
frequent values with their counts, and a sentinel-missing-value count.
"""
import numpy as np
import pandas as pd
import scipy.stats


def fill_fre_top_5(x):
    """Pad (or trim) a 1-D numeric array-like to exactly 5 float entries.

    Shorter inputs are right-padded with NaN; inputs longer than 5 are
    truncated to their first five elements (the original implicitly
    returned None in that case, which was a latent bug).
    """
    arr = np.asarray(x, dtype=float)
    if arr.size >= 5:
        return arr[:5]
    out = np.full(5, np.nan)
    out[:arr.size] = arr
    return out


def eda_analysis(missSet=None, df=None):
    """Return a per-column EDA summary DataFrame (one row per column).

    Parameters
    ----------
    missSet : list, optional
        Sentinel values treated as missing. Defaults to
        ``[np.nan, 9999999999, -999999]``. NOTE(review): ``np.isin``
        does not match NaN, so the np.nan entry never filters anything;
        kept for compatibility with the original call signature.
    df : pandas.DataFrame
        Numeric data to summarize.
    """
    if missSet is None:
        missSet = [np.nan, 9999999999, -999999]

    def _valid(col):
        # Drop sentinel "missing" codes before computing a statistic.
        return col[~np.isin(col, missSet)]

    # 1. Unique-value count (sentinels included, matching the original).
    count_un = df.apply(lambda x: len(x.unique()))
    count_un.name = 'count_unique'

    # 2. Zero-value count.
    count_zero = df.apply(lambda x: np.sum(x == 0))
    count_zero.name = 'count_zero'

    # 3-4. Mean / median over non-missing values.
    df_mean = df.apply(lambda x: np.mean(_valid(x)))
    df_mean.name = 'mean'
    df_median = df.apply(lambda x: np.median(_valid(x)))
    df_median.name = 'median'

    # 5-6. Mode and its frequency. np.atleast_1d keeps this working on
    # both old scipy (array-valued ModeResult) and scipy >= 1.11, where
    # the 1-D result is a scalar and the original [0][0] would raise.
    df_mode = df.apply(
        lambda x: np.atleast_1d(scipy.stats.mode(_valid(x))[0])[0])
    df_mode.name = 'mode'
    df_mode_count = df.apply(
        lambda x: np.atleast_1d(scipy.stats.mode(_valid(x))[1])[0])
    df_mode_count.name = 'mode_count'
    # Share of rows (including missing ones) taken by the modal value.
    df_mode_perct = (df_mode_count / df.shape[0]).rename('mode_perct')

    # 7-8. Min / max over non-missing values.
    df_min = df.apply(lambda x: np.min(_valid(x)))
    df_min.name = 'min'
    df_max = df.apply(lambda x: np.max(_valid(x)))
    df_max.name = 'max'

    # 9. Selected percentiles.
    quantiles = {
        name: np.percentile(_valid(df[name]), (1, 5, 25, 50, 75, 95, 99))
        for name in df.columns
    }
    df_quantile = pd.DataFrame(quantiles)[df.columns].T
    df_quantile.columns = ['quan01', 'quan05', 'quan25', 'quan50',
                           'quan75', 'quan95', 'quan99']

    # 10. Five most frequent values and their counts, NaN-padded to 5.
    json_fre_name = {}
    json_fre_count = {}
    for name in df.columns:
        vc = _valid(df[name]).value_counts()
        json_fre_name[name] = fill_fre_top_5(vc.iloc[:5].index.values)
        json_fre_count[name] = fill_fre_top_5(vc.iloc[:5].values)
    df_fre_name = pd.DataFrame(json_fre_name)[df.columns].T
    df_fre_count = pd.DataFrame(json_fre_count)[df.columns].T
    df_fre = pd.concat([df_fre_name, df_fre_count], axis=1)
    df_fre.columns = ['value1', 'value2', 'value3', 'value4', 'value5',
                      'freq1', 'freq2', 'freq3', 'freq4', 'freq5']

    # 11. Count of sentinel "missing" values per column.
    df_miss = df.apply(
        lambda x: np.sum(np.isin(x, missSet))).to_frame('freq_miss')

    # 12. Assemble the summary: one row per original column.
    return pd.concat(
        [count_un, count_zero, df_mean, df_median, df_mode, df_mode_count,
         df_mode_perct, df_min, df_max, df_fre, df_miss, df_quantile],
        axis=1)


if __name__ == "__main__":
    # Script entry: summarize the Kaggle-style 'train.csv' dataset.
    # (Was unguarded module-level code; guarded so importing this module
    # no longer performs file I/O.)
    data = pd.read_csv('train.csv')
    label = data['TARGET']  # kept for parity with the original script
    data = data.drop(['ID', 'TARGET'], axis=1)
    print(eda_analysis(df=data))
# 用Python进行常见的描述统计 (Common descriptive statistics with Python)
# 最新推荐文章于 2024-09-08 09:00:00 发布 (site boilerplate: "latest recommended article published 2024-09-08 09:00:00")