Simple scripts for smoothing and flagging outliers

1D smoothing: Iterative Smoother and Gaussian Process

Iterative Smoother: from this paper (https://arxiv.org/pdf/2009.12045.pdf)
Gaussian Process: from scikit-learn
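The core of the iterative smoother is a weighted-residual update: each iteration corrects the current estimate by a weighted average of the residuals, so the estimate moves toward the data and the iteration count controls the smoothness. A minimal single-step sketch, mirroring _iter_smooth_1step in the script below (the toy data and delta value are illustrative):

import numpy as np

def one_smoother_step(xi, yi, y_hat, delta=0.3):
    # Gaussian weights between all pairs of data points
    W = np.exp(-(xi[:, None] - xi[None, :])**2/2./delta**2)
    dy = yi - y_hat                              # residuals of the current estimate
    return y_hat + np.dot(dy, W)/W.sum(axis=0)   # add the weighted mean residual

xi = np.linspace(0., 1., 50)
yi = np.sin(2*np.pi*xi) + 0.1*np.random.randn(50)
y_hat = np.zeros_like(yi)
for _ in range(50):   # more iterations track the data more closely
    y_hat = one_smoother_step(xi, yi, y_hat, delta=0.05)

The full script is as follows: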

import numpy as np
import matplotlib.pyplot as plt
from numba import njit, prange
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

def centerize(x, xmin, xmax, std=None):
    '''Map x from [xmin, xmax] to [-1, 1]; rescale std accordingly if given.'''
    x = np.asarray(x)
    xcenter = (xmin + xmax)/2.0
    scale = (xmax - xmin)/2.0
    if scale == 0:
        return x, std
    if std is not None:
        std = np.asarray(std)/scale
    return (x-xcenter)/scale, std

def uncenterize(x, xmin, xmax, std=None):
    '''Inverse of centerize: map x from [-1, 1] back to [xmin, xmax].'''
    x = np.asarray(x)
    xcenter = (xmin + xmax)/2.0
    scale = (xmax - xmin)/2.0
    if std is not None:
        std = np.asarray(std)*scale
    return x*scale + xcenter, std

def CRBF_kernel(constant_value=1.0, constant_value_bounds=(1e-3, 1e3), length_scale=0.1, length_scale_bounds=(1e-3, 1e3)):
    '''
    ConstantKernel * RBF see website below for details:
    https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.ConstantKernel.html#sklearn.gaussian_process.kernels.ConstantKernel
    https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.RBF.html#sklearn.gaussian_process.kernels.RBF
    https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels
    '''
    kernel = ConstantKernel(constant_value=constant_value, constant_value_bounds=constant_value_bounds) * RBF(length_scale=length_scale, length_scale_bounds=length_scale_bounds)
    return kernel

def no_optimizer(obj_func, initial_theta, bounds):
    '''
    Do not do any optimization; equivalent to setting bounds='fixed' in the kernel.
    '''
    return initial_theta, 1.e-6

def gaussian_process(x, y, kernel=CRBF_kernel(), alpha=0.0, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, get_obj=False, normalize_y=True, centerize_x=True, **kwargs):
    '''
    Wrapper for GaussianProcessRegressor in scikit-learn, see https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html#sklearn.gaussian_process.GaussianProcessRegressor
    kernel: kernel instance, default=CRBF_kernel(); kernel arguments are typically (parameter, bounds), where bounds can be a tuple or 'fixed'
    alpha: float or ndarray of shape (n_samples,), default=0.0 here (sklearn's own default is 1e-10). Value added to the diagonal of the kernel matrix during fitting, to avoid numerical problems; it can also be interpreted as the variance of additional Gaussian measurement noise on the training observations. NOTE: in this function, if normalize_y, alpha is normalized at the same time.
    normalize_y: normalize the target values y by removing the mean and scaling to unit variance
    centerize_x: rescale x so that x.min() maps to -1 and x.max() maps to 1
    optimizer: 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'; optimizes the kernel's parameters. Signature:
        def optimizer(obj_func, initial_theta, bounds):
            return theta_opt, func_min
    n_restarts_optimizer: int, default 0.
    return:
        if get_obj:
            fit_func, gp_obj
        else:
            fit_func
        NOTE: fit_func accepts return_std and return_cov, both default False
    '''
    x = np.asarray(x)
    xmin = x.min()
    xmax = x.max()
    if centerize_x:
        x, _ = centerize(x, xmin, xmax)
    y = np.asarray(y)
    x = x[:,None]
    if normalize_y:
        # sklearn rescales y to unit variance internally, so rescale the
        # noise variance alpha by the same factor
        alpha = alpha/y.std(axis=0)**2
    gp = GaussianProcessRegressor(kernel=kernel, alpha=alpha, optimizer=optimizer, n_restarts_optimizer=n_restarts_optimizer, normalize_y=normalize_y, **kwargs)
    gp.fit(x, y)
    def _predict(xin, return_std=False, return_cov=False):
        if centerize_x:
            xin, _ = centerize(xin, xmin, xmax)
        else:
            xin = np.asarray(xin)
        xin = xin[:,None]
        return gp.predict(xin, return_std=return_std, return_cov=return_cov)
    if get_obj:
        return _predict, gp
    else:
        return _predict
        
def lognormal_weight(xi, x, delta=0.3):
    '''
    lognormal weighting function for different points, from https://arxiv.org/pdf/2009.12045.pdf
    xi: array_like, (M,), data points
    x: array_like, (N,), points to evaluate 
    delta: correlation length, larger delta, more smooth
    return: ndarray, (M, N), the weight matrix
    '''
    xi, x = np.meshgrid(xi, x, indexing='ij')
    return np.exp(-np.log(x/xi)**2/2./delta**2)

def normal_weight(xi, x, delta=0.3):
    '''
    normal weighting function for different points
    xi: array_like, (M,), data points
    x: array_like, (N,), points to evaluate 
    delta: correlation length, larger delta, more smooth
    return: ndarray, (M, N), the weight matrix
    '''
    xi, x = np.meshgrid(xi, x, indexing='ij')
    return np.exp(-(x - xi)**2/2./delta**2)

def _iter_smooth_1step(yi, yi_hat, W, covinv):
    dy = yi - yi_hat
    if covinv is not None:
        if W is not None:
            covinv = np.dot(covinv, W)
        dy = np.dot(dy, covinv)
        deno = covinv.sum(axis=0)
        dy = dy/deno
    elif W is not None:
        dy = np.dot(dy, W)
        deno = W.sum(axis=0)
        dy = dy/deno
    return dy

def get_chi2(dy, covinv):
    if covinv is None:
        return np.dot(dy, dy)
    else:
        return np.dot(dy, np.dot(covinv, dy))

@njit(parallel=True)
def fill_array(x, y, xout, out):
    n_x = x.shape[0]
    for ii in prange(n_x):
        inds = np.where(xout==x[ii])[0]
        out[inds] = y[ii]

def iter_smoother(yi, xi, x=None, yini=None, covinv=None, weight_func=None, max_iter=1000, min_chi2=None):
    '''
    From https://arxiv.org/pdf/2009.12045.pdf.
    yi: data to smooth
    xi: independent variable corresponding to yi
    x: independent variable to predict; if given and not equal to xi, weight_func must not be None
    yini: initial guess (the result is not sensitive to it)
    covinv: inverse of the covariance matrix of yi; if None, treated as the unit matrix
    weight_func: describes the correlation between x points. Signature:
        def weight_func(xi, x):
            return W # shape == xi.shape + x.shape
        see normal_weight for an example
    max_iter: maximum number of iterations
    min_chi2: if not None, compute chi2 with get_chi2 and break once chi2 <= min_chi2; if None, break only when max_iter is reached. NOTE: the chi2 is calculated from the step dy, not from yi - yi_pred
    '''
    yi = np.asarray(yi)
    xi = np.asarray(xi)
    n_xi = xi.shape[0]
    if x is None:
        x = xi
    else:
        x = np.asarray(x)
    if x is xi or np.array_equal(x, xi):
        print('x and xi are equivalent; skip concatenation!')
        append = False
    else:
        print('Concatenating x and xi!')
        x0 = x.copy()
        x = x[~np.isin(x, xi)]
        x = np.append(x, xi)
        append = True
    if yini is None:
        yini = np.zeros_like(x)
    if weight_func is not None:
        W = weight_func(xi, x)
        W = np.asarray(W)
    elif append:
        raise Exception('weight_func must be given when x is not None and differs from xi!')
    else:
        W = None
    if covinv is not None:
        covinv = np.asarray(covinv)
    n_iter = 0
    while True:
        dy = _iter_smooth_1step(yi, yini[-n_xi:], W, covinv)
        yini = yini + dy
        n_iter += 1
        if min_chi2 is not None:
            chi2 = get_chi2(dy[-n_xi:], covinv)
            if chi2<=min_chi2:
                print('Break at iteration %d with chi2 = %s <= %s!'%(n_iter, chi2, min_chi2))
                break
        if n_iter>=max_iter:
            chi2 = get_chi2(dy[-n_xi:], covinv)
            print('Break at iteration %d with chi2 = %s!'%(n_iter, chi2))
            break
    if not append:
        yi_hat = yini.copy()
        y = yini
    else:
        yi_hat = yini[-n_xi:]
        y = np.zeros_like(x0)
        fill_array(x, yini, x0, y)
    return [y, yi_hat]

if __name__ == '__main__':

    # test for iter_smoother and gp, gaussian noise

    from functools import partial
    xi = np.linspace(-10, 10, 201)
    yi0 = np.sin(xi)
    amp = 0.1
    yi = yi0 + np.random.randn(*yi0.shape)*amp

    covinv = np.ones_like(xi)/amp**2
    covinv = np.diag(covinv)
    x = np.linspace(-10, 10, 301)
    weight_func=partial(normal_weight, delta=0.8)

    #weight_func = None # if weight_func is None, the result coincides with the input
    #covinv = None
    #x = xi

    gp_pred = gaussian_process(xi, yi, alpha=amp**2)
    gp_yi = gp_pred(xi)
    gp_chi2 = get_chi2(gp_yi - yi, covinv)
    print(gp_chi2)

    y, yini = iter_smoother(yi, xi, x=x, weight_func=weight_func, max_iter=1000, covinv=covinv, min_chi2=amp**2*len(xi))
    y2, yini2 = iter_smoother(yi, xi, x=x, weight_func=weight_func, max_iter=1000, covinv=covinv, min_chi2=None)


    plt.plot(xi, yi0, '-', label='input')
    plt.plot(xi, yi, '.', label='data')
    plt.plot(x, y, '--', label='smoother with minimum chi2')
    plt.plot(x, y2, '--', label='smoother')
    plt.plot(x, gp_pred(x), '--', label='gp')
    plt.legend(fontsize=12)
    plt.show()
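The no_optimizer helper above is not exercised by the test. A short sketch of using it to keep the kernel hyperparameters fixed; the hyperparameter and noise values here are illustrative, and note that with centerize_x=True the length scale lives in the rescaled [-1, 1] units:

import numpy as np

xi = np.linspace(-10, 10, 201)
yi = np.sin(xi) + 0.1*np.random.randn(xi.size)

# passing no_optimizer makes GaussianProcessRegressor skip the
# marginal-likelihood optimization and keep the kernel as given
kernel = CRBF_kernel(constant_value=1.0, length_scale=0.2)
gp_fixed = gaussian_process(xi, yi, kernel=kernel, alpha=0.1**2,
                            optimizer=no_optimizer)
x = np.linspace(-10, 10, 301)
y_fixed, y_std = gp_fixed(x, return_std=True)   # fit_func also returns the std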

Outlier flagging:

sigma_clip: obtain a smooth profile via cenfunc and an estimate of sigma via stdfunc, then flag values that deviate by more than a given number of sigma.
fft_fit: take the FFT to get the spectrum, zero out the components below some fraction of the highest peak, then transform back with the inverse FFT.
NOTE: the 2D case could be tested later; in principle these methods should also apply (see the sketch at the end of this section). Methods such as SVD and SumThreshold could also be implemented (a rough sketch of the SumThreshold idea follows below), or existing implementations by others could be used.
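For reference, a minimal, illustrative 1D sketch of the SumThreshold idea: flag runs of consecutive samples whose windowed mean exceeds a threshold that shrinks with window size. The parameter values are arbitrary and this is a rough two-sided variant, not a faithful implementation:

import numpy as np

def sum_threshold_1d(y, chi1, windows=(1, 2, 4, 8), rho=1.5):
    # chi1 is the threshold for a single sample; a window of length m
    # uses the smaller threshold chi1/rho**log2(m)
    mask = np.zeros(y.size, dtype=bool)
    for m in windows:
        chi = chi1/rho**np.log2(m)
        work = np.where(mask, chi, y)  # replace already-flagged samples by the threshold
        csum = np.cumsum(np.concatenate(([0.], work)))
        win_mean = (csum[m:] - csum[:-m])/m
        for i in np.where(np.abs(win_mean) > chi)[0]:
            mask[i:i+m] = True         # flag every sample in a bad window
    return mask

The full flagging script: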

import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import medfilt
from scipy.ndimage import convolve1d
import warnings
from tqdm import tqdm

def extra_mask(mask, axis, n_extend):
    '''
    Extend the masked regions along axis by roughly n_extend samples.
    NOTE: this is just an approximate approach; narrow regions may not be extended by exactly n_extend.
    '''
    mask = np.asarray(mask)
    kernel = np.ones(n_extend+1, dtype=np.float64)
    mask = mask.astype(np.float64)
    return convolve1d(mask, kernel, axis=axis)>0

def median_std(dy, correct=True):
    '''
    More robust than std: the mean is replaced by the median. Supports masked arrays.
    If correct, the result is divided by 0.674 so that the function returns the std of zero-mean Gaussian random variables.
    '''
    if correct:
        return np.ma.median(dy**2.)**0.5/0.674
    else:
        return np.ma.median(dy**2.)**0.5
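# A quick sanity check of the correction factor (a sketch): 0.674 is roughly
# the median of |z| for a standard normal z, so for zero-mean Gaussian noise
# median_std should agree with np.std, e.g.
#     noise = np.random.default_rng(0).standard_normal(100000)
#     print(median_std(noise), noise.std())   # both close to 1.0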

def sigma_clip(data, sigma_lower=5., sigma_upper=5., cenfunc=medfilt, stdfunc=median_std, max_iter=1, get_sigma=False, verbose=True, **kwargs):
    '''
    cenfunc: returns an array that can be broadcast with data, or a float
    kwargs: passed to cenfunc
    return: masked array (and sigma, if get_sigma)
    '''
    if isinstance(data, np.ma.MaskedArray):
        mask = data.mask
        data = data.data
    else:
        # plain (or list-like) input carries no pre-existing mask; this branch
        # must also set mask, otherwise the check below raises NameError
        mask = None
        data = np.asarray(data)

    valid = np.isfinite(data)
    if mask is not None:
        valid = valid & (~mask)
    n_iter = 0
    n_mask = 0
    sigma = np.nan # defined up front for get_sigma, in case the loop breaks before stdfunc runs
    while True:
        if n_iter >= max_iter:
            if verbose:
                print('Break at iteration %d! Masked %d points!'%(n_iter, n_mask))
            break
        if not np.any(valid):
            if n_iter == 0:
                warnings.warn('No valid points at the first iteration!')
            break
        new_data = data[valid]
        cen_data = cenfunc(new_data, **kwargs)
        diff = new_data - cen_data
        sigma = stdfunc(diff)
        if sigma == 0:
            if n_iter == 0:
                warnings.warn('Sigma=0 at the first iteration! No points will be masked! You may need to change cenfunc to avoid overfitting!')
            if verbose:
                print('Break at iteration %d because sigma=0! Masked %d points!'%(n_iter, n_mask))
            break

        this_valid = (diff>=-sigma_lower*sigma)&(diff<=sigma_upper*sigma)

        valid[valid] = this_valid
        n_iter += 1
        n_mask += (~this_valid).sum()
        if np.all(this_valid):
            if verbose:
                print('Converged at iteration %d! Masked %d points!'%(n_iter, n_mask))
            break
        elif verbose:
            print('Masked %d points at iteration %d, with sigma %s!'%((~this_valid).sum(), n_iter, sigma))
    if get_sigma:
        return np.ma.array(data, mask=~valid), sigma
    else:
        return np.ma.array(data, mask=~valid)

def sigma_clip_along_axis(data, axis, sigma_lower=5., sigma_upper=5., cenfunc=medfilt, stdfunc=median_std, max_iter=1, verbose=False, **kwargs):
    '''
    return mask, sigma
    '''
    if not isinstance(data, (np.ndarray, np.ma.MaskedArray)):
        data = np.asarray(data)
    if axis<0:
        axis = len(data.shape) + axis
    Ni, Nk = data.shape[:axis], data.shape[axis+1:]
    mask = np.zeros_like(data, dtype=bool)
    shp = list(data.shape)
    del shp[axis]
    sigma = np.zeros(shp, dtype=data.dtype)
    for ii in tqdm(np.ndindex(Ni), total=int(np.prod(Ni, dtype=int))):
        for kk in np.ndindex(Nk):
            y = data[ii+np.s_[:,]+kk]
            ymask, this_sigma = sigma_clip(y, sigma_lower=sigma_lower, sigma_upper=sigma_upper, cenfunc=cenfunc, stdfunc=stdfunc, max_iter=max_iter, get_sigma=True, verbose=verbose, **kwargs)
            mask[ii+np.s_[:,]+kk] = ymask.mask
            sigma[ii+kk] = this_sigma
    return mask, sigma

def block_average_sigma_clip(data, block_size, average_axis, clip_axis, sigma_lower=5., sigma_upper=5., cenfunc=medfilt, stdfunc=median_std, max_iter=1, verbose=False, **kwargs):
    '''
    NOTE: the returned sigma is the sigma before averaging
    return mask, sigma
    '''
    if type(data) is not np.ma.core.MaskedArray:
        data = np.ma.array(data, mask=~np.isfinite(data))
    ntotal = data.shape[average_axis]
    nblock = ntotal//block_size
    if nblock*block_size < ntotal:
        nblock += 1
    mask = np.zeros_like(data.data, dtype=bool)
    sigma = np.zeros_like(data.data)
    shp = list(data.shape)
    if average_axis<0:
        average_axis = len(shp) + average_axis
    if clip_axis<0:
        clip_axis = len(shp) + clip_axis
    if clip_axis>average_axis:
        clip_axis -= 1
    del shp[average_axis]
    slice1 = (slice(None),)*average_axis
    slice2 = (slice(None),)*(len(shp)-average_axis-1)
    ist = 0
    for _ in range(nblock):
        slice0 = (slice(ist, ist+block_size), )
        this_slice = slice1 + slice0 + slice2
        ist = ist + block_size
        this_data = data[this_slice]
        n_mean = this_data.shape[average_axis]
        this_data = this_data.mean(axis=average_axis)

        this_mask, this_sigma = sigma_clip_along_axis(this_data, clip_axis, sigma_lower=sigma_lower, sigma_upper=sigma_upper, cenfunc=cenfunc, stdfunc=stdfunc, max_iter=max_iter, verbose=verbose, **kwargs)
        this_mask = np.expand_dims(this_mask, axis=average_axis)
        this_sigma = np.expand_dims(this_sigma, axis=clip_axis)
        this_sigma = np.expand_dims(this_sigma, axis=average_axis)
        mask[this_slice] = this_mask
        sigma[this_slice] = this_sigma*np.sqrt(n_mean) # before averaging
    return mask, sigma
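# A sketch of calling block_average_sigma_clip on a 2D (e.g. time x frequency)
# array, illustrative only and not exercised by the tests below:
#     data = np.random.randn(16, 201)
#     mask, sigma = block_average_sigma_clip(data, block_size=4,
#                                            average_axis=0, clip_axis=1,
#                                            kernel_size=7)
#     # mask and sigma have the same shape as data; sigma is the pre-average sigma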

def fft_fit(y, axis=-1, max_th=0.01):
    y = np.asarray(y)
    # mean along the transform axis, with keepdims so it broadcasts back
    ymean = y.mean(axis=axis, keepdims=True)
    fft = np.fft.fft(y - ymean, axis=axis)
    psd = np.abs(fft)**2
    # zero out every component whose power is below max_th of the highest peak
    th = psd.max(axis=axis)*max_th
    th = np.expand_dims(th, axis)
    fft[psd<th] = 0
    npk = (psd>=th).sum()
    print('Number of components: %d'%npk)
    yfit = np.fft.ifft(fft, axis=axis) # invert along the same axis
    return yfit.real + ymean

if __name__ == '__main__':
#
#    # test for sigma_clip, flat signal with gaussian noise and spikes
#    xi = np.linspace(-10, 10, 201)
#    yi0 = np.zeros_like(xi)
#    amp = 0.5
#    spike = np.random.normal(0, 15, size=10)
#    spike_inds = np.random.choice(np.arange(xi.shape[0]), spike.shape[0], replace=True)
#    yi = yi0 + np.random.randn(*yi0.shape)*amp
#    yi[spike_inds] = spike
#    y = sigma_clip(yi, max_iter=1, cenfunc=np.median)
#    plt.plot(xi, yi0, label='input')
#    plt.plot(xi, yi, '.', label='data')
#    plt.plot(xi, y, '--', label='sigma clip')
#    plt.plot(xi[y.mask], y[y.mask].data, '*', label='outlier')
#    plt.legend(fontsize=10)
#    plt.show()
#

    # test for sigma_clip and fft_fit, structured signal with gaussian noise and spikes
    def f(x):
        """The function to predict."""
        return (x-0.8)*(np.sin(x) + 2*np.cos(3*x) + 0.5*np.sin(5*x) + 0.7*np.cos(7*x))
    xi = np.linspace(-10, 10, 201)
    yi0 = f(xi)
    amp = 0.5
    spike = np.random.normal(0, 15, size=10)
    spike_inds = np.random.choice(np.arange(xi.shape[0]), spike.shape[0], replace=True)
    yi = yi0 + np.random.randn(*yi0.shape)*amp
    yi[spike_inds] = spike

    y = sigma_clip(yi, max_iter=1, kernel_size=7)
    y2 = fft_fit(yi, max_th=0.05)
    plt.plot(xi, yi0, label='input')
    plt.plot(xi, yi, '.', label='data')
    plt.plot(xi, y, '--', label='sigma clip')
    plt.plot(xi, y2, '--', label='fft')
    plt.plot(xi[y.mask], y[y.mask].data, '*', label='outlier')
    plt.legend(fontsize=10)
    plt.show()
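As a follow-up to the NOTE above, a sketch of the 2D case using sigma_clip_along_axis together with extra_mask; the shapes and parameters are illustrative and untested:

import numpy as np

data = np.random.randn(16, 201)
data[np.random.rand(*data.shape) > 0.99] += 20.   # inject some outliers
# clip each row along axis=1, with a median filter of width 7 as cenfunc
mask, sigma = sigma_clip_along_axis(data, axis=1, max_iter=2, kernel_size=7)
mask = extra_mask(mask, axis=1, n_extend=2)       # widen the flagged regions
print('%d samples flagged'%mask.sum())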
