How to use SimpleImputer
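
Before walking through the helper module below, here is a minimal, self-contained sketch of SimpleImputer on its own, so the fit_transform pattern used by the helpers is clear up front. The toy dataframe and its column names are invented purely for illustration.

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# toy dataframe with missing values (illustrative data only)
df = pd.DataFrame({"age": [25.0, np.nan, 40.0], "income": [3000.0, 5200.0, np.nan]})

# replace each NaN with the column mean; other built-in strategies are "median", "most_frequent" and "constant"
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print(imputed)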

###### importing libraries ################################################
import numpy as np
import pandas as pd
import utilities as util
import impyute as impy 
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

###### data imputation ####################################################
def __sklearn_imputation(dataframes, strategy):
    """Imputes missing values found in pandas dataframe/s using a sklearn SimpleImputer with the given strategy.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to impute missing values for.
        strategy (str): Imputation strategy passed to SimpleImputer ("mean", "median" or "most_frequent").
    Returns:
        list of pandas dataframe: A list of pandas dataframes imputed using the given strategy.
    """
    dfs = util.df_to_dfs(dataframes)
    imp_sklearn_dfs = []
    sklearn_imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    for i in range(len(dfs)):
        # impute, rebuild the dataframe with its original columns and restore its original dtypes
        imp_sklearn_dfs.append(
            pd.DataFrame(
                sklearn_imputer.fit_transform(dfs[i]), columns = dfs[i].columns
                ).astype(dfs[i].dtypes.to_dict()))

    return imp_sklearn_dfs

def mean_imputation(dataframes):
    """Imputes missing values found in pandas dataframe/s using sklearn mean imputation.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to impute missing values for.
    Returns:
        list of pandas dataframe: A list of pandas dataframes imputed using mean imputation.
    """
    return __sklearn_imputation(dataframes, "mean")

def median_imputation(dataframes):
    """Imputes missing values found in pandas dataframe/s using sklearn median imputation.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to impute missing values for.
    Returns:
        list of pandas dataframe: A list of pandas dataframes imputed using median imputation.
    """
    return __sklearn_imputation(dataframes, "median")

def mode_imputation(dataframes):
    """Imputes missing values found in pandas dataframe/s using sklearn mode imputation.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to impute missing values for.
    Returns:
        list of pandas dataframe: A list of pandas dataframes imputed using mode imputation.
    """
    return __sklearn_imputation(dataframes, "most_frequent")
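
A quick usage sketch for the three sklearn-based wrappers above. It assumes util.df_to_dfs from the local utilities module accepts either a single dataframe or a list of dataframes and always returns a list; the sample data is invented for illustration.

# hypothetical usage of the wrappers above
raw = pd.DataFrame({"age": [21.0, np.nan, 35.0, 50.0], "score": [0.5, 0.9, np.nan, 0.7]})
imputed_mean = mean_imputation(raw)[0]      # NaNs replaced by column means
imputed_median = median_imputation(raw)[0]  # NaNs replaced by column medians
imputed_mode = mode_imputation(raw)[0]      # NaNs replaced by the most frequent values
print(imputed_mean)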


def em_imputation(dataframes, dtype, loops = 50):
    """Imputes missing values found in pandas dataframe/s using impyute expectation maximization.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to impute missing values for.
        dtype (str, 'int' or 'float'): Type of data.
        loops (int, optional): Number of expectation maximization iterations to run before breaking.  Defaults to 50.
    Returns:
        list of pandas dataframe: A list of pandas dataframes imputed using expectation maximization.
    """
    dfs = util.df_to_dfs(dataframes)
    imp_em_dfs = []
    for i in range(len(dfs)):
        tmp_em_df = impy.imputation.cs.em(dfs[i].values, loops = loops, dtype = dtype) 
        imp_em_dfs.append(
            pd.DataFrame(
                tmp_em_df, columns = dfs[i].columns
                ).astype(dfs[i].dtypes.to_dict()))

    return imp_em_dfs

def mice_imputation(dataframes, dtype):
    """Imputes missing values found in pandas dataframe/s using impyute expectation maximization.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to impute missing values for.
        dtype (str(‘int’,’float’)): Type of data.
        loops (int, optional): Number of expectation maximization iterations to run before breaking.  Defaults to 50.
    Returns:
        list of pandas dataframe: A list of pandas dataframe imputted using expectation maximization.
    """
    dfs = util.df_to_dfs(dataframes)
    imp_mice_dfs = []
    for i in range(len(dfs)):
        tmp_mice_df = impy.imputation.cs.mice(dfs[i].values, dtype = dtype) 
        imp_mice_dfs.append(
            pd.DataFrame(
                tmp_mice_df, columns = dfs[i].columns
                ).astype(dfs[i].dtypes.to_dict()))

    return imp_mice_dfs

def knn_imputation(dataframes, dtype, k = 100):
    """Imputes missing values found in pandas dataframe/s using impyute knn.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to impute missing values for.
        dtype (str, 'int' or 'float'): Type of data.
        k (int): Number of neighbours used in KNN. 
    Returns:
        list of pandas dataframe: A list of pandas dataframes imputed using KNN.
    """
    dfs = util.df_to_dfs(dataframes)
    imp_knn_dfs = []
    for i in range(len(dfs)):
        tmp_knn_df = impy.imputation.cs.fast_knn(dfs[i].values, k = k, dtype = dtype) 
        imp_knn_dfs.append(
            pd.DataFrame(
                tmp_knn_df, columns = dfs[i].columns
                ).astype(dfs[i].dtypes.to_dict()))

    return imp_knn_dfs
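
A usage sketch for the impyute-based wrappers, under the same util.df_to_dfs assumption as above and assuming the installed impyute version accepts the loops/dtype/k arguments exactly as documented in the docstrings; the data and parameter values are illustrative only.

# hypothetical usage of the impyute-based wrappers above
raw = pd.DataFrame({"x1": [1.0, np.nan, 3.0, 4.0, 5.0, 6.0],
                    "x2": [10.0, 20.0, np.nan, 40.0, 50.0, 60.0]})
imputed_em = em_imputation(raw, dtype = "float", loops = 25)[0]
imputed_mice = mice_imputation(raw, dtype = "float")[0]
imputed_knn = knn_imputation(raw, dtype = "float", k = 3)[0]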

###### over sampling  ####################################################
def oversample_smote(dataframes, sampling_strategy = "auto", random_state = 40, k = 8, columns = None, verbose = False):
    """Oversamples the minority class/es found in pandas dataframe/s using imbalanced-learn SMOTE.
    Args:
        dataframes (pandas dataframe or list of dataframes): The dataframe/s to oversample; the label is assumed to be the last column.
        sampling_strategy (str or dict, optional): Sampling strategy passed to SMOTE. Defaults to "auto".
        random_state (int, optional): Random seed passed to SMOTE. Defaults to 40.
        k (int, optional): Number of nearest neighbours used by SMOTE. Defaults to 8.
        columns (list, optional): Column names for the returned dataframe/s. Defaults to None.
        verbose (bool, optional): If True, prints the label distribution before and after oversampling. Defaults to False.
    Returns:
        list of pandas dataframe: A list of oversampled pandas dataframes.
    """
    # convert df to dataframes
    dfs = util.df_to_dfs(dataframes)
    # initialize smote object
    smote = SMOTE(sampling_strategy = sampling_strategy, random_state = random_state, k_neighbors = k)
    
    # loop in each dataframe 
    oversampled_dfs = []
    for i in range(len(dfs)):
        n = dfs[i].shape[1] - 1
        
        # get the features for the df
        x = dfs[i].iloc[:,0:n] 
        # get the labels for the df
        y = dfs[i].iloc[:,n]
        
        # output log (original)
        if(verbose):
            group, occurrences = np.unique(y, return_counts = True)
            outcomes = dict(zip(group, occurrences))
            print("original dataset (labels): " + str(outcomes))
            print("total: " + str(sum(outcomes.values())))
        
        # apply smote 
        x_resampled, y_resampled = smote.fit_resample(x, y)
             
        # output log (oversampled)
        if(verbose):
            group, occurrences = np.unique(y_resampled, return_counts = True)
            outcomes = dict(zip(group, occurrences))
            print("resampled dataset (labels): " + str(outcomes))
            print("total: " + str(sum(outcomes.values())) + "\n")
        
        # convert oversampled arrays back to dataframes
        oversampled_instances = np.concatenate((np.asarray(x_resampled), np.asarray(y_resampled).reshape(-1, 1)), axis=1)
        oversampled_df = pd.DataFrame(data = oversampled_instances, columns = columns)
        oversampled_df.iloc[:,n] = oversampled_df.iloc[:,n].astype(int)
        oversampled_dfs.append(oversampled_df)
        
    # return oversampled dataframes
    return oversampled_dfs
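
A usage sketch for oversample_smote, assuming the last column holds a binary class label and that util.df_to_dfs behaves as described above; the feature values, label counts and parameter choices are invented for illustration (k must stay below the minority-class count).

# hypothetical usage of oversample_smote: the last column is the label
raw = pd.DataFrame({
    "f1": np.random.rand(30),
    "f2": np.random.rand(30),
    "label": [0] * 25 + [1] * 5,
})
balanced = oversample_smote(raw, k = 3, columns = raw.columns, verbose = True)[0]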
                                     
###### re-scaling data  ##################################################
def scale_range(x, new_min, new_max):
    """Linearly rescales the values in a numpy array to the range [new_min, new_max]."""
    return np.interp(x, (x.min(), x.max()), (new_min, new_max))

def standardization(x):
    """Scales values in array using standardization and replaces the values by their Z scores (x - x_mean / std). 
    This technique redistributes the array with mean = 0 and STD = 1. 
    Args:
        x (numpy array): A 1D numpy numeric array which will be scaled using standardization.
    Returns:
        numpy array: A 1D numpy numeric array scaled using standardization.
    """

    return ((x - np.mean(x)) / np.std(x))
##########################################################################
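
A quick check of the two re-scaling helpers on illustrative data:

# rescale to [0, 1] and standardize to zero mean / unit standard deviation
values = np.array([2.0, 4.0, 6.0, 8.0])
print(scale_range(values, 0, 1))   # approximately [0.0, 0.33, 0.67, 1.0]
print(standardization(values))     # Z scores with mean ~0 and std ~1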
