用Python进行常见的描述统计

# coding = utf-8 #
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats
from scipy.stats import mode
from datetime import datetime
# Load the training set, peel the target column off as the label vector,
# then discard the row identifier so `df` holds only the remaining columns.
df = pd.read_csv('train.csv')
label = df.pop('TARGET')
df = df.drop('ID', axis=1)

def fill_fre_top_5(x):
    """Return the first five entries of ``x`` as a length-5 float array.

    Inputs with fewer than five entries are right-padded with NaN; inputs
    with more than five entries are truncated to their first five.

    Args:
        x: A sized, sliceable sequence of numeric values (for example a
            list or a 1-D NumPy array).

    Returns:
        numpy.ndarray: Array of shape ``(5,)`` created by ``np.full`` with
        NaN as the fill value.
    """
    # Truncate rather than fall through, so inputs longer than five
    # entries never yield an implicit None.
    head = x[:5]
    new_array = np.full(5, np.nan)
    new_array[:len(head)] = head
    return new_array
def _missing_mask(column, sentinels, nan_is_missing):
    """Return a boolean ndarray flagging the entries of ``column`` that are missing."""
    mask = np.isin(column, sentinels)
    if nan_is_missing:
        # NaN never compares equal to itself, so np.isin cannot match a NaN
        # sentinel; test for nulls explicitly instead.
        mask = mask | np.asarray(pd.isnull(column))
    return mask


def _mode_and_count(valid):
    """Return ``(mode, count)`` for ``valid``; ties resolve to the smallest value."""
    uniques, counts = np.unique(np.asarray(valid), return_counts=True)
    # np.unique returns sorted values and argmax returns the first maximum,
    # so the smallest of several equally frequent values is chosen.
    top = counts.argmax()
    return uniques[top], counts[top]


def _top_frequencies(valid, size=5):
    """Return the ``size`` most frequent values and their counts, NaN-padded."""
    top = valid.value_counts().iloc[:size]
    padding = [np.nan] * (size - len(top))
    return list(top.index) + padding, list(top.values) + padding


def eda_analysis(missSet=[np.nan, 9999999999, -999999], df=None):
    """Build a per-column descriptive-statistics summary of ``df``.

    Entries equal to any value in ``missSet`` are treated as missing and are
    excluded from every statistic except ``count_unique`` and ``count_zero``,
    which are computed on the full column. A NaN in ``missSet`` marks null
    entries as missing.

    Args:
        missSet: Iterable of sentinel values that denote a missing entry.
            The default list is only read, never modified.
        df: ``pandas.DataFrame`` to summarise.

    Returns:
        pandas.DataFrame: One row per column of ``df`` (indexed by column
        name) with the columns, in order: ``count_unique``, ``count_zero``,
        ``mean``, ``median``, ``mode``, ``mode_count``, ``mode_perct``
        (``mode_count`` divided by the total row count), ``min``, ``max``,
        ``value1``..``value5`` and ``freq1``..``freq5`` (the five most
        frequent values and their counts, NaN-padded), ``freq_miss`` and the
        percentiles ``quan01``, ``quan05``, ``quan25``, ``quan50``,
        ``quan75``, ``quan95``, ``quan99``. A column with no non-missing
        entries gets NaN statistics, a ``mode_count`` of 0 and a
        ``mode_perct`` of 0.0.
    """
    quantile_levels = (1, 5, 25, 50, 75, 95, 99)
    summary_columns = (
        ['count_unique', 'count_zero', 'mean', 'median', 'mode',
         'mode_count', 'mode_perct', 'min', 'max']
        + ['value%d' % rank for rank in range(1, 6)]
        + ['freq%d' % rank for rank in range(1, 6)]
        + ['freq_miss']
        + ['quan%02d' % level for level in quantile_levels]
    )

    sentinels = list(missSet)
    nan_is_missing = bool(np.any(pd.isnull(np.asarray(sentinels, dtype=object))))
    n_rows = df.shape[0]

    records = []
    for name in df.columns:
        column = df[name]
        # Compute the missing mask once per column and reuse it everywhere.
        missing = _missing_mask(column, sentinels, nan_is_missing)
        valid = column[~missing]

        record = [len(column.unique()), np.sum(column == 0)]
        if valid.empty:
            # Nothing left to describe: np.percentile and the mode lookup
            # cannot handle an empty selection, so report NaN instead.
            record += [np.nan, np.nan, np.nan, 0, 0.0, np.nan, np.nan]
            quantiles = [np.nan] * len(quantile_levels)
        else:
            mode_value, mode_count = _mode_and_count(valid)
            record += [
                np.mean(valid),
                np.median(valid),
                mode_value,
                mode_count,
                # float() keeps this a true division on every interpreter.
                mode_count / float(n_rows),
                np.min(valid),
                np.max(valid),
            ]
            quantiles = list(np.percentile(valid, quantile_levels))

        top_values, top_counts = _top_frequencies(valid)
        record += top_values + top_counts
        record.append(np.sum(missing))
        record += quantiles
        records.append(record)

    return pd.DataFrame(records, index=df.columns, columns=summary_columns)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值