GMM based imputation

import pandas as pd
import numpy as np

def MVNImputer(df, epsilon=1e-5, maxiter=100):
    data = df.values
    n_obs, n_var = data.shape
    mu_init = np.nanmean(data, axis=0)
    sigma_init = np.zeros((n_var, n_var))
    for i in range(n_var):
        for j in range(n_var):
            vecs = data[:,[i,j]]
            vecs = vecs[~np.any(np.isnan(vecs), axis=1),:].T
            if len(vecs) > 0:
                cov = np.cov(vecs)
                cov = cov[0,1]
                sigma_init[i,j] = cov
                sigma_init[j,i] = cov
            else:
                sigma_init[i,j] = 1.0
                sigma_init[j,i] = 1.0

    print("Start EM iteration")
    pre_mu = mu_init
    pre_sigma = sigma_init
    pre_lik = -np.inf
    for n_iter in range(maxiter):
        """E step"""
        temp_data = np.copy(data)
        for i in range(n_obs):
            if np.any(np.isnan(temp_data[i,:])):
                nans = np.isnan(temp_data[i,:])
                # conditional distribution of multivariate normal
                offset_mu = np.dot(pre_sigma[nans,:][:,~nans], np.dot(np.linalg.inv(pre_sigma[~nans,:][:,~nans]), (temp_data[i, ~nans] - pre_mu[~nans])[:, np.newaxis]))
                temp_data[i, nans] = pre_mu[nans] + offset_mu
        """M step"""
        new_mu = np.mean(temp_data, axis=0)
        new_sigma = np.cov(temp_data.T)
        new_lik = -0.5 * n_obs * (n_var * np.log(2 * np.pi) + np.log(np.linalg.det(new_sigma)))
        for i in range(n_obs):
            new_lik -= 0.5 * np.dot((temp_data[i, :] - new_mu), np.dot(np.linalg.inv(new_sigma), (temp_data[i, :] - new_mu)[:, np.newaxis]))
        if new_lik - pre_lik < epsilon:
            imputed = temp_data
            break
        pre_mu = new_mu
        pre_sigma = new_sigma
        pre_lik = new_lik
    else:
        imputed = temp_data
    return imputed

if __name__ == '__main__':
 


    d = {'motivations':[3,4,5,2,5,3,2,6,3,3],
         'IQ':[83,85,95,96,103,104,109,112,115,116],
         'aptitude':[np.nan,np.nan,np.nan,np.nan,128,102,111,113,117,133]}
    # true aptitude are: 'aptitude':[93,99,98,103,128,102,111,113,117,133]}

    df = pd.DataFrame(d, index=range(1,11))
    print(df)
    imputed = MVNImputer(df)
    print(imputed)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

DeniuHe

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值