调包实现Multiple imputation by chained equations（MICE）

最新推荐文章于 2023-03-16 11:01:32 发布

DeniuHe

最新推荐文章于 2023-03-16 11:01:32 发布

阅读量904

点赞数

分类专栏：算法 | 文章标签：python、机器学习

本文链接：https://blog.csdn.net/DeniuHe/article/details/122219980

版权

算法专栏收录该内容

193 篇文章 2 订阅

订阅专栏

Public functions — imputena documentation: https://imputena.readthedocs.io/en/latest/functions.html#multiple-imputation-by-chained-equations

参考测试代码：https://github.com/macarro/imputena/blob/master/test/multiple_imputation/test_mice.py

import logging
import unittest
import numpy as np
import pandas as pd
from imputena import mice

def generate_df_breast_cancer():
    """
    Build the 30-row example dataframe used to test logistic regression.

    Adapted from: Breast Cancer Wisconsin (Diagnostic) Data Set (UCI Machine
    Learning Repository).

    The frame holds 15 missing values in total: 2 in ``thickness``, 3 in
    ``uniformity``, 3 in ``size`` (all NaN) and 7 in ``class`` (None).

        thickness  uniformity  adhesion  size  nucleoli  mitoses class
    0         1.0         1.0         1   1.0        1        1     B
    1         1.0         1.0         1   2.0        1        1     B
    2         8.0         4.0         3   3.0        3        1     B
    3         4.0         1.0         1   2.0        6        1     B
    4        10.0         8.0         4   4.0       10        4  None
    5         5.0         1.0         1   2.0        2        1     B
    6         NaN        10.0        10   3.0        6        1  None
    7         3.0         3.0         1   2.0        1        1     B
    8         3.0         NaN         1   2.0        1        1  None
    9         2.0         3.0         1   5.0        1        1     B
    10        NaN         1.0         1   NaN        1        1  None
    11        5.0         2.0         2   1.0        1        1     B
    12       10.0         NaN         2   NaN        7        1  None
    13        7.0         8.0         2   4.0        8        2     M
    14        8.0         4.0         1   3.0        9        2  None
    15        1.0         1.0         1   2.0        1        1     B
    16        4.0         1.0         1   2.0        1        1  None
    17        1.0         2.0         1   2.0        1        1     B
    18       10.0         NaN         4   NaN       10        1     M
    19        1.0         1.0         1   2.0        1        1     B
    20        3.0         1.0         1   2.0        1        1     B
    21        5.0         1.0         1   2.0        1        1     B
    22        4.0         1.0         1   2.0        1        1     B
    23        8.0         4.0         1   2.0        3        1     M
    24        8.0         7.0         4   5.0       10        1     M
    25       10.0         4.0        10   4.0        1        1     M
    26        8.0         3.0         9   3.0        3        1     M
    27        8.0        10.0         8   7.0        7        1     M
    28        6.0         1.0         1   2.0        1        1     B
    29        4.0         1.0         1   2.0        1        1     B

    Returns:
        pandas.DataFrame with the seven columns shown above, in that order.
    """
    nan = np.nan

    # Each list below is laid out ten rows per line (rows 0-9, 10-19, 20-29).
    thickness = [
        1.0, 1.0, 8.0, 4.0, 10.0, 5.0, nan, 3.0, 3.0, 2.0,
        nan, 5.0, 10.0, 7.0, 8.0, 1.0, 4.0, 1.0, 10.0, 1.0,
        3.0, 5.0, 4.0, 8.0, 8.0, 10.0, 8.0, 8.0, 6.0, 4.0,
    ]
    uniformity = [
        1.0, 1.0, 4.0, 1.0, 8.0, 1.0, 10.0, 3.0, nan, 3.0,
        1.0, 2.0, nan, 8.0, 4.0, 1.0, 1.0, 2.0, nan, 1.0,
        1.0, 1.0, 1.0, 4.0, 7.0, 4.0, 3.0, 10.0, 1.0, 1.0,
    ]
    adhesion = [
        1, 1, 3, 1, 4, 1, 10, 1, 1, 1,
        1, 2, 2, 2, 1, 1, 1, 1, 4, 1,
        1, 1, 1, 1, 4, 10, 9, 8, 1, 1,
    ]
    size = [
        1.0, 2.0, 3.0, 2.0, 4.0, 2.0, 3.0, 2.0, 2.0, 5.0,
        nan, 1.0, nan, 4.0, 3.0, 2.0, 2.0, 2.0, nan, 2.0,
        2.0, 2.0, 2.0, 2.0, 5.0, 4.0, 3.0, 7.0, 2.0, 2.0,
    ]
    nucleoli = [
        1, 1, 3, 6, 10, 2, 6, 1, 1, 1,
        1, 1, 7, 8, 9, 1, 1, 1, 10, 1,
        1, 1, 1, 3, 10, 1, 3, 7, 1, 1,
    ]
    mitoses = [
        1, 1, 1, 1, 4, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 2, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    ]
    labels = [
        'B', 'B', 'B', 'B', None, 'B', None, 'B', None, 'B',
        None, 'B', None, 'M', None, 'B', None, 'B', 'M', 'B',
        'B', 'B', 'B', 'M', 'M', 'M', 'M', 'M', 'B', 'B',
    ]

    # Numeric columns go through np.array so their dtypes are decided by
    # NumPy; the label column stays a plain Python list.
    numeric_columns = (
        ('thickness', thickness),
        ('uniformity', uniformity),
        ('adhesion', adhesion),
        ('size', size),
        ('nucleoli', nucleoli),
        ('mitoses', mitoses),
    )
    data = {name: np.array(values) for name, values in numeric_columns}
    data['class'] = labels
    return pd.DataFrame(data)




# Alternative input: the built-in example frame.
# df = generate_df_breast_cancer()

# Load the incomplete data set; the CSV is read without a header row and
# without an index column.
df = pd.read_csv(
    r"F:\Latex_IDA\example_incomplete.csv",
    header=None,
    index_col=None,
)
# The original author marks this rename as mandatory: the default integer
# column labels are turned into strings ("s0", "s1", ...).
# NOTE(review): presumably imputena's mice() needs string column names - confirm.
df.columns = [f"s{i}" for i in df.columns]

# Request three imputed versions of the data.
dfs = mice(df, imputations=3)

# To inspect a single imputation, print e.g. dfs[0] together with
# dfs[0].isna().sum().sum() (the number of cells still missing).

data_0, data_1, data_2 = dfs[0], dfs[1], dfs[2]

# Pool the three imputations by averaging them cell by cell.
data_mean = (data_0 + data_1 + data_2) / 3
print(data_mean)

非调包

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split

class MiceImputer(object):
    """Hand-written MICE (Multiple Imputation by Chained Equations) imputer.

    Missing cells first receive starting values, either from a
    ``SimpleImputer`` or from regressions on the fully observed columns.
    Afterwards every column that originally had missing cells is repeatedly
    re-predicted from all other columns, and only its originally-missing
    cells are overwritten with the predictions.

    Attributes:
        strategy: ``strategy`` handed to ``SimpleImputer``; one of 'mean',
            'median', 'most_frequent', 'constant'.
        seed_values: if true, starting values come from ``SimpleImputer``;
            otherwise from regressions on the fully observed columns.
        copy: forwarded to ``SimpleImputer``; when ``seed_values`` is false
            it also decides whether ``X`` is copied or modified in place.
        imp: the ``SimpleImputer`` instance used for seeding.
        model_score_: set by ``fit_transform``; maps the iteration number to
            the mean validation score of the models fitted in that iteration
            (0 when no model was fitted).
    """

    # Accepted values for the ``method`` argument of ``fit_transform``.
    _METHODS = ('Linear', 'Ridge')

    def __init__(self, seed_values = True, seed_strategy="mean", copy=True):
        """Store the seeding options and build the ``SimpleImputer``."""
        self.strategy = seed_strategy # seed_strategy in ['mean','median','most_frequent', 'constant']
        self.seed_values = seed_values # seed_values = False initializes missing_values using not_null columns
        self.copy = copy
        self.imp = SimpleImputer(strategy=self.strategy, copy=self.copy)

    @staticmethod
    def _make_estimator(method, is_binary):
        """Return a fresh, unfitted model for one target column."""
        if method == 'Ridge':
            return RidgeClassifier() if is_binary else Ridge()
        if is_binary:
            return LogisticRegression(n_jobs = -1, solver = 'lbfgs')
        return LinearRegression(n_jobs = -1)

    def _seed_by_regression(self, new_X, null_X, null_cols, verbose):
        """Fill the missing cells of ``new_X`` in place, column by column.

        Each incomplete column is regressed on the columns that are complete
        at that point; once a column has been filled it joins the predictors
        for the columns that follow.
        """
        # Only columns WITHOUT any missing value can serve as predictors
        # (``notna().any()`` would also select the incomplete columns, the
        # target itself included, and feed NaN into the models).
        predictors = new_X.columns[new_X.notna().all()].tolist()

        if verbose:
            print('Initilization of missing-values using regression on non-null columns')

        for column in null_cols:
            null_rows = null_X[column]
            train_y = new_X.loc[~null_rows, column]
            n_unique = train_y.nunique()
            if n_unique < 2:
                # Nothing to learn from an empty or constant column; it is
                # left unseeded, exactly as before.
                continue
            if not predictors:
                raise ValueError(
                    'seed_values=False needs at least one column without '
                    'missing values to regress on')

            # The seeding stage always uses the plain linear / logistic
            # models, independent of the ``method`` chosen for the sweeps.
            m = self._make_estimator('Linear', n_unique == 2)
            m.fit(new_X.loc[~null_rows, predictors], train_y)
            # Assign the raw prediction array: wrapping it in a pd.Series
            # would align on a fresh 0..k-1 index instead of on the rows
            # that are actually missing.
            new_X.loc[null_rows, column] = m.predict(new_X.loc[null_rows, predictors])
            predictors.append(column)

    def fit_transform(self, X, method = 'Linear', iter = 5, verbose = True):
        """Impute the missing values of ``X`` and return the completed frame.

        Args:
            X: pandas DataFrame to impute.
                NOTE(review): the models are scikit-learn linear models, so
                the columns are presumably expected to be numeric - confirm.
            method: 'Linear' (LinearRegression / LogisticRegression) or
                'Ridge' (Ridge / RidgeClassifier). The classifier variant is
                used for columns with exactly two distinct values.
            iter: number of sweeps over the incomplete columns.
            verbose: print progress and per-column validation scores.

        Returns:
            DataFrame with the same columns and index as ``X`` in which the
            originally-missing cells hold the imputed values.

        Raises:
            ValueError: if ``method`` is not 'Linear' or 'Ridge', or if
                ``seed_values`` is false and no fully observed column exists
                to start the regressions from.
        """
        # Fail early: an unknown method used to surface only inside the loop
        # (as an unbound variable) or silently reuse a previously fitted model.
        if method not in self._METHODS:
            raise ValueError(
                "method must be one of " + str(self._METHODS) + ", got " + repr(method))

        # Why use Pandas?
        # http://gouthamanbalaraman.com/blog/numpy-vs-pandas-comparison.html
        # Pandas < Numpy if X.shape[0] < 50K
        # Pandas > Numpy if X.shape[0] > 500K

        # Remember where the gaps were: only those cells are ever overwritten.
        null_cols = X.columns[X.isna().any()].tolist()
        null_X = X.isna()[null_cols]

        ### Initialize missing_values

        if self.seed_values:
            # Impute all missing values using SimpleImputer
            if verbose:
                print('Initilization of missing-values using SimpleImputer')
            new_X = pd.DataFrame(self.imp.fit_transform(X))
            new_X.columns = X.columns
            new_X.index = X.index
        else:
            # Work on a copy unless the caller asked for in-place imputation.
            new_X = X.copy() if self.copy else X
            self._seed_by_regression(new_X, null_X, null_cols, verbose)

        ### Begin iterations of MICE

        model_score = {}

        for i in range(iter):
            if verbose:
                print('Beginning iteration ' + str(i) + ':')

            scores = []

            for column in null_cols:
                n_unique = new_X[column].nunique()
                if n_unique < 2:
                    # Constant column: no model is fitted, so do not split
                    # the data for it either.
                    continue

                null_rows = null_X[column]
                features = new_X.drop(column, axis = 1)

                # One third of the observed rows is held out purely to
                # report a validation score for the fitted model.
                train_x, val_x, train_y, val_y = train_test_split(
                    features.loc[~null_rows], new_X.loc[~null_rows, column],
                    test_size=0.33, random_state=42)

                m = self._make_estimator(method, n_unique == 2)
                m.fit(train_x, train_y)
                score = m.score(val_x, val_y)
                scores.append(score)

                # Predict only the missing rows and assign the raw array so
                # the result does not depend on X having a 0..n-1 index.
                new_X.loc[null_rows, column] = m.predict(features.loc[null_rows])
                if verbose:
                    print('Model score for ' + str(column) + ': ' + str(score))

            model_score[i] = sum(scores)/len(scores) if scores else 0

        # Keep the per-iteration scores available instead of discarding them.
        self.model_score_ = model_score

        return new_X

# Impute the incomplete CSV with the hand-written MiceImputer and plot the
# first two imputed columns.
df = pd.read_csv(
    r"F:\Latex_IDA\example_incomplete.csv",
    header=None,
    index_col=None,
)
df.columns = [f"s{i}" for i in df.columns]

# NOTE: this rebinds the name `mice`; if the imputena example above was run
# in the same session, its `mice` function is no longer reachable afterwards.
mice = MiceImputer()
Z = mice.fit_transform(df)
print(Z)

# The last column of the complete data set is used to colour the points.
# NOTE(review): presumably this column is the class label - confirm.
data_complete = np.array(
    pd.read_csv(
        r"F:\Latex_IDA\example_orginal.csv",
        header=None,
        index_col=None,
    )
)
y = data_complete[:, -1]

# First-quadrant arcs of the circles with radius 5 and radius 9.
n1 = np.linspace(0, 5, 1000)
n2 = np.linspace(0, 9, 1000)
m1 = np.sqrt(25 - n1 ** 2)
m2 = np.sqrt(81 - n2 ** 2)

plt.figure(figsize=(6, 6))
plt.scatter(Z["s0"], Z["s1"], c=y, edgecolors='k', linewidths=0.5)
for arc_x, arc_y in ((n1, m1), (n2, m2)):
    plt.plot(arc_x, arc_y, ls='--', lw=1, color='k')
# To save the figure instead of only showing it:
# plt.savefig(r"F:\Latex_IDA\example_ICkNNI.pdf",dpi=400,bbox_inches='tight')
plt.show()
# print(df)

import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from colorama import Fore, Style, init
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Impute the incomplete CSV with scikit-learn's IterativeImputer and plot
# the first two imputed columns.
df = pd.read_csv(
    r"F:\Latex_IDA\example_incomplete.csv",
    header=None,
    index_col=None,
)
df.columns = [f"s{i}" for i in df.columns]

# Linear-regression estimator, mean-seeded, at most 10 rounds.
imp = IterativeImputer(
    estimator=LinearRegression(),
    initial_strategy="mean",
    max_iter=10,
    tol=1e-10,
    random_state=0,
)

# Z is indexed positionally below (Z[:, 0], Z[:, 1]) rather than by column name.
Z = imp.fit_transform(df)
print(Z)

# The last column of the complete data set is used to colour the points.
# NOTE(review): presumably this column is the class label - confirm.
data_complete = np.array(
    pd.read_csv(
        r"F:\Latex_IDA\example_orginal.csv",
        header=None,
        index_col=None,
    )
)
y = data_complete[:, -1]

# First-quadrant arcs of the circles with radius 5 and radius 9.
n1 = np.linspace(0, 5, 1000)
n2 = np.linspace(0, 9, 1000)
m1 = np.sqrt(25 - n1 ** 2)
m2 = np.sqrt(81 - n2 ** 2)

plt.figure(figsize=(6, 6))
plt.scatter(Z[:, 0], Z[:, 1], c=y, edgecolors='k', linewidths=0.5)
for arc_x, arc_y in ((n1, m1), (n2, m2)):
    plt.plot(arc_x, arc_y, ls='--', lw=1, color='k')
plt.show()