1D Smoothing: Iterative Smoother and Gaussian Process
Iterative Smoother: from this paper: https://arxiv.org/pdf/2009.12045.pdf (the update step is sketched below)
Gaussian Process: from scikit-learn
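The core of the iterative smoother (my reading of the paper, matching _iter_smooth_1step in the code below) is to start from an initial guess y_hat_0 and repeatedly add a weighted average of the current residuals:

    y_hat_{n+1}(x) = y_hat_n(x) + sum_i W(x_i, x) * [y_i - y_hat_n(x_i)] / sum_i W(x_i, x)

where W is a weight kernel such as the lognormal one from the paper (lognormal_weight below); if an inverse covariance covinv is supplied, W is effectively replaced by covinv·W.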
The code is as follows:
import numpy as np
import matplotlib.pyplot as plt
from numba import njit, prange
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
def centerize(x, xmin, xmax, std=None):
    '''
    Map x (and optionally std) from [xmin, xmax] to the interval [-1, 1].
    '''
    x = np.asarray(x)
    xcenter = (xmin + xmax)/2.0
    scale = (xmax - xmin)/2.0
    if scale == 0:
        return x, std
    if std is not None:
        std = np.asarray(std)/scale
    return (x - xcenter)/scale, std
def uncenterize(x, xmin, xmax, std=None):
    '''
    Inverse of centerize: map x (and optionally std) from [-1, 1] back to [xmin, xmax].
    '''
    x = np.asarray(x)
    xcenter = (xmin + xmax)/2.0
    scale = (xmax - xmin)/2.0
    if std is not None:
        std = np.asarray(std)*scale
    return x*scale + xcenter, std
def CRBF_kernel(constant_value=1.0, constant_value_bounds=(1e-3, 1e3), length_scale=0.1, length_scale_bounds=(1e-3, 1e3)):
    '''
    ConstantKernel * RBF, see the pages below for details:
    https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.ConstantKernel.html#sklearn.gaussian_process.kernels.ConstantKernel
    https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.RBF.html#sklearn.gaussian_process.kernels.RBF
    https://scikit-learn.org/stable/modules/gaussian_process.html#gp-kernels
    '''
    kernel = ConstantKernel(constant_value=constant_value, constant_value_bounds=constant_value_bounds) * RBF(length_scale=length_scale, length_scale_bounds=length_scale_bounds)
    return kernel
def no_optimizer(obj_func, initial_theta, bounds):
    '''
    Do not do any optimization, equivalent to setting bounds='fixed' in the kernel function
    '''
    return initial_theta, 1.e-6
def gaussian_process(x, y, kernel=CRBF_kernel(), alpha=0.0, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, get_obj=False, normalize_y=True, centerize_x=True, **kwargs):
    '''
    Wrapper for GaussianProcessRegressor in scikit-learn, see https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html#sklearn.gaussian_process.GaussianProcessRegressor
    kernel: kernel instance, default=CRBF_kernel(); arguments of the kernel functions are typically (parameter, bounds), where bounds can be a tuple or 'fixed'\n
    alpha: float or ndarray of shape (n_samples,). Value added to the diagonal of the kernel matrix during fitting, to avoid numerical problems; it can also be interpreted as the variance of additional Gaussian measurement noise on the training observations. NOTE: in this function, if normalize_y, alpha is normalized at the same time.\n
    normalize_y: normalize the target values y by removing the mean and scaling to unit variance\n
    centerize_x: rescale x so that x.min() is -1 and x.max() is 1\n
    optimizer: 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b', for optimizing the kernel's parameters. Signature:\n
        def optimizer(obj_func, initial_theta, bounds):\n
            return theta_opt, func_min\n
    n_restarts_optimizer: int, default 0.\n
    return:\n
        if get_obj:\n
            fit_func, gp_obj\n
        else:\n
            fit_func\n
    NOTE: fit_func accepts return_std and return_cov keywords, both default to False\n
    '''
    x = np.asarray(x)
    xmin = x.min()
    xmax = x.max()
    if centerize_x:
        x, _ = centerize(x, xmin, xmax)
    y = np.asarray(y)
    x = x[:,None]
    if normalize_y:
        alpha = alpha/y.std(axis=0)**2
    gp = GaussianProcessRegressor(kernel=kernel, alpha=alpha, optimizer=optimizer, n_restarts_optimizer=n_restarts_optimizer, normalize_y=normalize_y, **kwargs)
    gp.fit(x, y)
    def _predict(xin, return_std=False, return_cov=False):
        if centerize_x:
            xin, _ = centerize(xin, xmin, xmax)
        else:
            xin = np.asarray(xin)
        xin = xin[:,None]
        return gp.predict(xin, return_std=return_std, return_cov=return_cov)
    if get_obj:
        return _predict, gp
    else:
        return _predict
def lognormal_weight(xi, x, delta=0.3):
    '''
    lognormal weighting function for different points, from https://arxiv.org/pdf/2009.12045.pdf
    xi: array_like, (M,), data points
    x: array_like, (N,), points to evaluate
    delta: correlation length, larger delta gives smoother results
    return: ndarray, (M, N), the weight matrix
    '''
    xi, x = np.meshgrid(xi, x, indexing='ij')
    return np.exp(-np.log(x/xi)**2/2./delta**2)
def normal_weight(xi, x, delta=0.3):
    '''
    normal weighting function for different points
    xi: array_like, (M,), data points
    x: array_like, (N,), points to evaluate
    delta: correlation length, larger delta gives smoother results
    return: ndarray, (M, N), the weight matrix
    '''
    xi, x = np.meshgrid(xi, x, indexing='ij')
    return np.exp(-(x - xi)**2/2./delta**2)
def _iter_smooth_1step(yi, yi_hat, W, covinv):
    # one update step: dy_j = sum_i (yi_i - yi_hat_i) M_ij / sum_i M_ij, with M = covinv.W
    dy = yi - yi_hat
    if covinv is not None:
        if W is not None:
            covinv = np.dot(covinv, W)
        dy = np.dot(dy, covinv)
        deno = covinv.sum(axis=0)
        dy = dy/deno
    elif W is not None:
        dy = np.dot(dy, W)
        deno = W.sum(axis=0)
        dy = dy/deno
    return dy
def get_chi2(dy, covinv):
    if covinv is None:
        return np.dot(dy, dy)
    else:
        return np.dot(dy, np.dot(covinv, dy))
@njit(parallel=True)
def fill_array(x, y, xout, out):
    # for each x[ii], copy y[ii] to all positions in out where xout equals x[ii]
    n_x = x.shape[0]
    for ii in prange(n_x):
        inds = np.where(xout == x[ii])[0]
        for jj in inds:
            out[jj] = y[ii]
def iter_smoother(yi, xi, x=None, yini=None, covinv=None, weight_func=None, max_iter=1000, min_chi2=None):
    '''
    From https://arxiv.org/pdf/2009.12045.pdf.
    yi: data to smooth
    xi: independent variable corresponding to yi
    x: independent variable to predict; if given and not equivalent to xi, weight_func must not be None
    yini: initial guess, not important
    covinv: inverse of the covariance matrix for yi; if None, treated as the unit matrix
    weight_func: describes the correlation between x points. Signature:
        def weight_func(xi, x):
            return W # shape == xi.shape + x.shape
    see normal_weight for an example
    max_iter: maximum number of iterations
    min_chi2: if not None, compute chi2 with get_chi2 and break once chi2<=min_chi2; if None, break only when max_iter is reached. NOTE: the chi2 is calculated from the step dy, not from yi - yi_pred
    '''
    yi = np.asarray(yi)
    xi = np.asarray(xi)
    n_xi = xi.shape[0]
    if x is None:
        x = xi
    else:
        x = np.asarray(x)
    if x is xi or np.array_equal(x, xi):
        print('Equivalent x and xi! Do not concatenate them!')
        append = False
    else:
        print('Concatenate x and xi!')
        x0 = x.copy()
        x = x[~np.isin(x, xi)]
        x = np.append(x, xi)
        append = True
    if yini is None:
        yini = np.zeros_like(x)
    if weight_func is not None:
        W = weight_func(xi, x)
        W = np.asarray(W)
    elif append:
        raise Exception('Input weight_func if x is not None and x is not equal to xi!')
    else:
        W = None
    if covinv is not None:
        covinv = np.asarray(covinv)
    n_iter = 0
    while True:
        dy = _iter_smooth_1step(yi, yini[-n_xi:], W, covinv)
        yini = yini + dy
        n_iter += 1
        if min_chi2 is not None:
            chi2 = get_chi2(dy[-n_xi:], covinv)
            if chi2 <= min_chi2:
                print('Break at iteration %d with chi2 = %s <= %s'%(n_iter, chi2, min_chi2))
                break
        if n_iter >= max_iter:
            chi2 = get_chi2(dy[-n_xi:], covinv)
            print('Break at iteration %d with chi2 = %s!'%(n_iter, chi2))
            break
    if not append:
        yi_hat = yini.copy()
        y = yini
    else:
        yi_hat = yini[-n_xi:]
        y = np.zeros_like(x0)
        fill_array(x, yini, x0, y)
    return [y, yi_hat]
if __name__ == '__main__':
    # test for iter_smoother and gp, gaussian noise
    from functools import partial
    xi = np.linspace(-10, 10, 201)
    yi0 = np.sin(xi)
    amp = 0.1
    yi = yi0 + np.random.randn(*yi0.shape)*amp
    covinv = np.ones_like(xi)/amp**2
    covinv = np.diag(covinv)
    x = np.linspace(-10, 10, 301)
    weight_func = partial(normal_weight, delta=0.8)
    #weight_func = None # if weight_func is None, the result coincides with the input
    #covinv = None
    #x = xi
    gp_pred = gaussian_process(xi, yi, alpha=amp**2)
    gp_yi = gp_pred(xi)
    gp_chi2 = get_chi2(gp_yi - yi, covinv)
    print(gp_chi2)
    y, yini = iter_smoother(yi, xi, x=x, weight_func=weight_func, max_iter=1000, covinv=covinv, min_chi2=amp**2*len(xi))
    y2, yini2 = iter_smoother(yi, xi, x=x, weight_func=weight_func, max_iter=1000, covinv=covinv, min_chi2=None)
    plt.plot(xi, yi0, '-', label='input')
    plt.plot(xi, yi, '.', label='data')
    #plt.plot(xi, yini)
    plt.plot(x, y, '--', label='smoother with minimum chi2')
    plt.plot(x, y2, '--', label='smoother')
    plt.plot(x, gp_pred(x), '--', label='gp')
    plt.legend(fontsize=12)
    plt.show()
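To keep the kernel hyperparameters fixed instead of letting scikit-learn optimize them, pass the no_optimizer defined above. A minimal sketch (the length_scale value is only an assumed example; with centerize_x=True it refers to x rescaled to [-1, 1]):

fit = gaussian_process(xi, yi, kernel=CRBF_kernel(length_scale=0.5), optimizer=no_optimizer, alpha=amp**2)
mean, std = fit(x, return_std=True)  # fit also accepts return_std/return_cov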
Outlier flagging:
sigma_clip: obtain a smooth profile with cenfunc and the scatter sigma with stdfunc, then flag values that deviate by more than a given number of sigma.
fft_fit: take the fft to get the spectrum, zero out the peaks below some threshold relative to the highest peak, then transform back with the ifft (typical calls to both are sketched below).
NOTE: the 2D case should be tested later; in principle the same approach applies there too. Methods like SVD and sum threshold could also be implemented, or one could look for similar code written by others.
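A minimal sketch of typical calls to the two functions defined in the code below (kernel_size and max_th are assumed example values; yi is noisy data with spikes, as in the test at the bottom):

y_clip = sigma_clip(yi, max_iter=1, kernel_size=7)  # masked array, y_clip.mask flags the outliers
y_fit = fft_fit(yi, max_th=0.05)  # keep only fourier components above 5% of the peak power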
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import medfilt
from scipy.ndimage import convolve1d
import warnings
from tqdm import tqdm
def extra_mask(mask, axis, n_extra):
    '''
    Extend the masked area along axis by roughly n_extra points.\n
    NOTE: this is just an approximate approach, the mask area may not be extended by n_extra for narrow areas\n
    '''
    mask = np.asarray(mask)
    kernel = np.ones(n_extra + 1, dtype=np.float64)
    mask = mask.astype(np.float64)
    return convolve1d(mask, kernel, axis=axis) > 0
def median_std(dy, correct=True):
    '''
    More robust than std, by replacing the mean with the median. Supports masked arrays.\n
    If correct, the result is divided by 0.674 so that this function returns the std for gaussian random variables with zero mean.\n
    '''
    if correct:
        return np.ma.median(dy**2.)**0.5/0.674
    else:
        return np.ma.median(dy**2.)**0.5
def sigma_clip(data, sigma_lower=5., sigma_upper=5., cenfunc=medfilt, stdfunc=median_std, max_iter=1, get_sigma=False, verbose=True, **kwargs):
    '''
    cenfunc: returns an array that can be broadcast with data, or a float number\n
    kwargs: passed to cenfunc\n
    return a masked_array (and sigma if get_sigma)\n
    '''
    if isinstance(data, np.ma.MaskedArray):
        mask = data.mask
        data = data.data
    else:
        mask = None
        data = np.asarray(data)
    valid = np.isfinite(data)
    if mask is not None:
        valid = valid & (~mask)
    n_iter = 0
    n_mask = 0
    sigma = 0. # in case the loop breaks before sigma is computed
    while True:
        if n_iter >= max_iter:
            if verbose:
                print('Break at iteration %d! Masked %d points!'%(n_iter, n_mask))
            break
        if not np.any(valid):
            if n_iter == 0:
                warnings.warn('No valid points at the first iteration!')
            break
        new_data = data[valid]
        cen_data = cenfunc(new_data, **kwargs)
        diff = new_data - cen_data
        sigma = stdfunc(diff)
        if sigma == 0:
            if n_iter == 0:
                warnings.warn('Sigma=0 at the first iteration! No points will be masked! You may need to change cenfunc to avoid overfitting!')
            if verbose:
                print('Break at iteration %d for sigma=0! Masked %d points!'%(n_iter, n_mask))
            break
        #plt.figure()
        #plt.plot(diff)
        #xlim = plt.xlim()
        #plt.hlines(-sigma_lower*sigma, *xlim)
        #plt.hlines(sigma_upper*sigma, *xlim)
        #print(sigma)
        #plt.show()
        this_valid = (diff >= -sigma_lower*sigma) & (diff <= sigma_upper*sigma)
        valid[valid] = this_valid
        n_iter += 1
        n_mask += (~this_valid).sum()
        if np.all(this_valid):
            if verbose:
                print('Converged at iteration %d! Masked %d points!'%(n_iter, n_mask))
            break
        elif verbose:
            print('Masked %d points at iteration %d, with sigma %s!'%((~this_valid).sum(), n_iter, sigma))
    if get_sigma:
        return np.ma.array(data, mask=~valid), sigma
    else:
        return np.ma.array(data, mask=~valid)
def sigma_clip_along_axis(data, axis, sigma_lower=5., sigma_upper=5., cenfunc=medfilt, stdfunc=median_std, max_iter=1, verbose=False, **kwargs):
    '''
    Apply sigma_clip to every 1d slice of data along the given axis.\n
    return mask, sigma
    '''
    if (type(data) is not np.ndarray) and (type(data) is not np.ma.core.MaskedArray):
        data = np.asarray(data)
    if axis < 0:
        axis = len(data.shape) + axis
    Ni, Nk = data.shape[:axis], data.shape[axis+1:]
    mask = np.zeros_like(data, dtype=bool)
    shp = list(data.shape)
    del shp[axis]
    sigma = np.zeros(shp, dtype=data.dtype)
    for ii in tqdm(np.ndindex(Ni)):
        for kk in np.ndindex(Nk):
            y = data[ii+np.s_[:,]+kk]
            ymask, this_sigma = sigma_clip(y, sigma_lower=sigma_lower, sigma_upper=sigma_upper, cenfunc=cenfunc, stdfunc=stdfunc, max_iter=max_iter, get_sigma=True, verbose=verbose, **kwargs)
            mask[ii+np.s_[:,]+kk] = ymask.mask
            sigma[ii+kk] = this_sigma
    return mask, sigma
def block_average_sigma_clip(data, block_size, average_axis, clip_axis, sigma_lower=5., sigma_upper=5., cenfunc=medfilt, stdfunc=median_std, max_iter=1, verbose=False, **kwargs):
    '''
    NOTE: the sigma is the sigma before averaging\n
    return mask, sigma
    '''
    if type(data) is not np.ma.core.MaskedArray:
        data = np.ma.array(data, mask=~np.isfinite(data))
    ntotal = data.shape[average_axis]
    nblock = ntotal//block_size
    if nblock*block_size < ntotal:
        nblock += 1
    mask = np.zeros_like(data.data, dtype=bool)
    sigma = np.zeros_like(data.data)
    shp = list(data.shape)
    if average_axis < 0:
        average_axis = len(shp) + average_axis
    if clip_axis < 0:
        clip_axis = len(shp) + clip_axis
    if clip_axis > average_axis:
        clip_axis -= 1
    del shp[average_axis]
    slice1 = (slice(None),)*average_axis
    slice2 = (slice(None),)*(len(shp)-average_axis-1)
    ist = 0
    for _ in range(nblock):
        slice0 = (slice(ist, ist+block_size), )
        this_slice = slice1 + slice0 + slice2
        ist = ist + block_size
        this_data = data[this_slice]
        n_mean = this_data.shape[average_axis]
        this_data = this_data.mean(axis=average_axis)
        this_mask, this_sigma = sigma_clip_along_axis(this_data, clip_axis, sigma_lower=sigma_lower, sigma_upper=sigma_upper, cenfunc=cenfunc, stdfunc=stdfunc, max_iter=max_iter, verbose=verbose, **kwargs)
        this_mask = np.expand_dims(this_mask, axis=average_axis)
        this_sigma = np.expand_dims(this_sigma, axis=clip_axis)
        this_sigma = np.expand_dims(this_sigma, axis=average_axis)
        mask[this_slice] = this_mask
        sigma[this_slice] = this_sigma*np.sqrt(n_mean) # before averaging
    return mask, sigma
def fft_fit(y, axis=-1, max_th=0.01):
    '''
    Keep only the fourier components whose power is at least max_th times the peak power, and transform back.
    '''
    y = np.asarray(y)
    fft = np.fft.fft(y - y.mean(), axis=axis)
    psd = np.abs(fft)**2
    th = psd.max(axis=axis)*max_th
    th = np.expand_dims(th, axis)
    fft[psd < th] = 0
    npk = (psd >= th).sum()
    print('Number of components: %d'%npk)
    yfit = np.fft.ifft(fft, axis=axis)
    #plt.plot(np.fft.fftshift(psd), '.')
    #plt.plot(np.fft.fftshift(np.abs(fft)**2), '--')
    #plt.show()
    return yfit.real + y.mean()
if __name__ == '__main__':
    #
    # # test for sigma_clip, flat signal with gaussian noise and spikes
    # xi = np.linspace(-10, 10, 201)
    # yi0 = np.zeros_like(xi)
    # amp = 0.5
    # spike = np.random.normal(0, 15, size=10)
    # spike_inds = np.random.choice(np.arange(xi.shape[0]), spike.shape[0], replace=True)
    # yi = yi0 + np.random.randn(*yi0.shape)*amp
    # yi[spike_inds] = spike
    # y = sigma_clip(yi, max_iter=1, cenfunc=np.median)
    # plt.plot(xi, yi0, label='input')
    # plt.plot(xi, yi, '.', label='data')
    # plt.plot(xi, y, '--', label='sigma clip')
    # plt.plot(xi[y.mask], y[y.mask].data, '*', label='outlier')
    # plt.legend(fontsize=10)
    # plt.show()
    #
    # test for sigma_clip and fft_fit, gaussian noise with spikes
    def f(x):
        """The function to predict."""
        return (x-0.8)*(np.sin(x) + 2*np.cos(3*x) + 0.5*np.sin(5*x) + 0.7*np.cos(7*x))
    xi = np.linspace(-10, 10, 201)
    yi0 = f(xi)
    amp = 0.5
    spike = np.random.normal(0, 15, size=10)
    spike_inds = np.random.choice(np.arange(xi.shape[0]), spike.shape[0], replace=True)
    yi = yi0 + np.random.randn(*yi0.shape)*amp
    yi[spike_inds] = spike
    y = sigma_clip(yi, max_iter=1, kernel_size=7)
    y2 = fft_fit(yi, max_th=0.05)
    plt.plot(xi, yi0, label='input')
    plt.plot(xi, yi, '.', label='data')
    plt.plot(xi, y, '--', label='sigma clip')
    plt.plot(xi, y2, '--', label='fft')
    plt.plot(xi[y.mask], y[y.mask].data, '*', label='outlier')
    plt.legend(fontsize=10)
    plt.show()
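As a first step toward the 2D case mentioned in the NOTE, sigma_clip_along_axis and block_average_sigma_clip already accept N-d input and clip each 1d slice independently. A minimal sketch on a toy 2D array (all values assumed):

data2d = np.random.randn(64, 128)
rr = np.random.randint(0, 64, 20)
cc = np.random.randint(0, 128, 20)
data2d[rr, cc] = np.random.normal(0, 15, 20)  # inject some spikes
mask2d, sigma2d = sigma_clip_along_axis(data2d, axis=-1, max_iter=1, kernel_size=7)
print(mask2d.sum(), sigma2d.shape)  # one sigma per row, shape (64,)
mask2, sigma2 = block_average_sigma_clip(data2d, block_size=16, average_axis=0, clip_axis=-1, max_iter=1, kernel_size=7)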