Composite Quantile Regression
Similar to quantile regression, composite quantile regression (CQR) starts from a sequence of quantile levels $0<\tau_1<\tau_2<\cdots<\tau_K<1$. Its goal is no longer to minimize the check function at a single quantile level, but to minimize it at several quantile levels simultaneously. The regression coefficient $\boldsymbol{\beta}$ is then estimated through the following objective function:
$$(\hat{b}_1,\dots,\hat{b}_K,\hat{\boldsymbol{\beta}}^{\mathrm{CQR}})=\underset{b_1,\dots,b_K,\boldsymbol{\beta}}{\arg\min}\;\sum_{k=1}^K\left\{ \sum_{i=1}^n\rho_{\tau_k}\bigl(y_i-b_k-\mathbf{x}_i^{\top}\boldsymbol{\beta}\bigr)\right\}$$
Usually we take an equally spaced sequence of quantile levels $\tau_k=\frac{k}{K+1},\ k=1,2,\dots,K$. The asymptotic distribution of the estimator is
$$\sqrt{n}\,(\hat{\boldsymbol{\beta}}^{\mathrm{CQR}}-\boldsymbol{\beta}^*)\to N(0,\boldsymbol{\Sigma}_{\mathrm{CQR}}),$$
where
$$\boldsymbol{\Sigma}_{\mathrm{CQR}}=\mathbf{C}^{-1}\,\frac{\sum_{k,k'=1}^K\min(\tau_k,\tau_{k'})\bigl(1-\max(\tau_k,\tau_{k'})\bigr)}{\bigl(\sum_{k=1}^K f(b^*_{\tau_k})\bigr)^2}$$
and $\mathbf{C}$ is defined by $\lim_{n\to\infty}\frac{1}{n}\mathbf{X}^{\top}\mathbf{X}=\mathbf{C}$.
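Before turning to specific algorithms, a minimal R sketch of this objective may make it concrete (the helper names `check_loss` and `cqr_objective` are illustrative only, not from any package):

# rho_tau(u) = u * (tau - 1{u < 0})
check_loss <- function(u, tau) u * (tau - (u < 0))

# CQR objective: one intercept b[k] per quantile level, a common slope beta,
# and the check losses summed over observations and quantile levels
cqr_objective <- function(b, beta, tau, X, y) {
  sum(sapply(seq_along(tau), function(k) {
    sum(check_loss(y - b[k] - X %*% beta, tau[k]))
  }))
}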
1.1 Python Implementation Based on the MM Algorithm
import numpy as np
import pandas as pd

data = pd.read_csv(r"C:\Users\beida\Desktop\sc\rent.csv")
n = len(data)
x = np.array(data[["cons", "area"]])
y = np.array(data["rent_euro"], dtype=np.float64)

tau = np.arange(1, 6) / 6    # quantile levels tau_k = k/(K+1), K = 5
k = len(tau)
p = 2                        # number of columns of x (constant + area)
maxit = 1000
toler = 0.0001
error = 10000
iteration = 1

u = np.zeros(k)              # intercepts b_k at each quantile level
r = np.zeros((n, k))         # residuals at each quantile level
signw = np.zeros((n, k))     # check-loss weights
z = np.zeros((n, k))
newX = np.zeros((n, k))

beta = np.linalg.pinv(x.T.dot(x)).dot(x.T).dot(y)   # OLS starting value

while (iteration <= maxit) and (error > toler):
    betaold = beta.copy()
    # update the K intercepts as interpolated tau_k-quantiles of the residuals
    uv = np.sort(y - x.dot(beta), axis=0)
    quantile1 = (n - 1) * tau - np.floor((n - 1) * tau)
    for i in range(k):
        u[i] = quantile1[i] * uv[int(np.ceil((n - 1) * tau[i]))] \
            + (1 - quantile1[i]) * uv[int(np.floor((n - 1) * tau[i]))]
    yh = x.dot(beta)
    for i in range(k):
        r[:, i] = y - u[i] - yh
        signw[:, i] = (1 - np.sign(r[:, i])) / 2 * (1 - tau[i]) \
            + (np.sign(r[:, i]) + 1) * tau[i] / 2
    # coordinate-wise update of beta via a weighted median
    for j in range(p):
        xbeta = beta[j] * x[:, j]
        for i in range(k):
            z[:, i] = (r[:, i] + xbeta) / x[:, j]
            newX[:, i] = x[:, j] * signw[:, i]
        vz = z.flatten()
        order = vz.argsort()
        sortz = vz[order]
        vnewX = newX.flatten()
        w = np.abs(vnewX[order])
        index = np.where(np.cumsum(w) > (np.sum(w) / 2))[0][0]
        beta[j] = sortz[index]
    error = np.sum(np.abs(beta - betaold))
    iteration = iteration + 1

print("beta:", beta)
print("tau:", tau)
print("cons:", np.percentile(y - x.dot(beta), tau * 100))
Results:
> beta: [135.63724592 4.3381923 ]
> tau: [0.16666667 0.33333333 0.5 0.66666667 0.83333333]
> cons: [-114.9981428 -40.30912765 16.70578506 75.41938784 165.26588296]
1.2 R Implementation of CQR Based on the MM Algorithm
library(cqrReg)
set.seed(1)
n = 100
p = 2
a = rnorm(n*p, mean = 1, sd = 1)
x = matrix(a, n, p)
beta = rnorm(p, 1, 1)
beta = matrix(beta, p, 1)
y = x %*% beta - matrix(rnorm(n, 0.1, 1), n, 1)
tau = 1:5/6
# x is a 100*2 matrix, y is a 100*1 vector, beta is a 2*1 vector
cqr.mm(x, y, tau)                # reference fit from the cqrReg package
cqrmm(x = x, y = y, tau = tau)   # our MM implementation, defined below
cqrmm = function(x, y, beta, to, m, tau) {
  if (missing(to)) {
    toler = 1e-3
  } else {
    toler = to
  }
  if (missing(m)) {
    maxit = 200
  } else {
    maxit = m
  }
  if (missing(tau)) {
    cat('no tau_k input', '\n')
    tau = 1:5/6
  }
  X = x
  n = nrow(x)
  p = ncol(x)
  k = length(tau)
  error = 10000
  epsilon = 0.9999
  iteration = 1
  u <- numeric(k)        # intercepts b_k at each quantile level
  r <- matrix(0, n, k)   # residuals at each quantile level
  if (missing(beta)) {
    beta = solve(t(x) %*% x, t(x) %*% y)   # OLS starting value
  }
  xt = t(x)
  product <- matrix(1, p, n)
  while (iteration <= maxit && error > toler) {
    betaold = beta
    yh = x %*% beta                  # fitted values
    uv = sort(y - yh)
    # u[i]: interpolated tau_i-quantile of the current residuals
    quantile = (n - 1) * tau - floor((n - 1) * tau)
    for (i in 1:k) {
      u[i] = quantile[i] * uv[ceiling((n - 1) * tau[i])] +
        (1 - quantile[i]) * uv[floor((n - 1) * tau[i])]
    }
    for (i in 1:k) {
      r[, i] = y - u[i] - yh
    }
    # majorization step: smooth quadratic surrogate of the composite check loss
    denominator = 1 / (abs(r) + epsilon)
    W = rowSums(denominator)
    v <- k - 2 * sum(tau) - rowSums(r * denominator)
    for (i in 1:n) {
      product[, i] = xt[, i] * W[i]
    }
    delta = solve(product %*% x, xt %*% v)   # Newton-type update step
    beta = beta - delta
    error = sum(abs(delta))
    iteration = iteration + 1
  }
  b = quantile(y - X %*% beta, tau)
  return(list(beta = beta, b = b))
}
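As a quick sanity check (a sketch assuming the simulated `x`, `y`, `tau`, and the true `beta` from the snippet above are still in the workspace), the fitted coefficients can be compared with the truth:

fit <- cqrmm(x = x, y = y, tau = tau)
cbind(fit$beta, beta)   # estimated vs. true coefficients, both 2 x 1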
1.3 R Implementation Based on the EM Algorithm
library(MASS)
library(cqrReg)
set.seed(NULL)
tau.k = 1:5 / 6
CQREM = function(X, y, tau, betar, weight, maxit, toler) {
  if (missing(betar)) {
    beta = solve(t(X) %*% X, t(X) %*% y)   # OLS starting value
    cat(beta, '\n')
  } else {
    beta = betar
  }
  if (missing(tau)) {
    tau = 1:5 / 6
  }
  K = length(tau)
  if (missing(weight)) {
    weight = rep(1, times = K)
  }
  if (missing(maxit)) {
    maxit = 1000
  }
  if (missing(toler)) {
    toler = 1e-5
  }
  alpha = tau      # initial values for the K intercepts
  n = length(y)
  tau.k = tau
  # parameters of the asymmetric-Laplace location-scale mixture representation
  theta.1 = (1 - 2 * tau.k) / (tau.k * (1 - tau.k))
  theta.2 = 2 / (tau.k * (1 - tau.k))
  error = 10000
  iteration = 1
  while (iteration <= maxit && error > toler) {
    rbar = matrix(NA, n, K)
    mu = matrix(NA, n, K)
    w.ik = matrix(NA, n, K)
    d.ik = matrix(NA, n, K)
    r = y - X %*% beta
    for (k in 1:K) {
      rbar[, k] <- sum(r - alpha[k])
    }
    # E-step: expectations of the latent mixing variables
    delta2 = sqrt((theta.1 ^ 2 + 2 * theta.2)) / abs(rbar)
    delta3 = (theta.2 * weight) / (theta.1 ^ 2 + 2 * theta.2) +
      abs(rbar) / (sqrt(theta.1 ^ 2 + 2 * theta.2))
    for (k in 1:K) {
      w.ik[, k] = delta2[, k] / (theta.2 * weight)[k]
    }
    w.i = rowSums(w.ik)
    W = diag(w.i)
    for (k in 1:K) {
      d.ik[, k] = (theta.1 + alpha * delta2)[, k] / (theta.2 * weight)[k]
    }
    d.i = rowSums(d.ik)
    D = as.vector(d.i)
    # M-step: weighted least squares update of beta
    ybar = y - solve(W) %*% D
    beta_new = solve(t(X) %*% W %*% X) %*% t(X) %*% W %*% ybar
    # update the intercepts alpha_k
    A = matrix(NA, n, K)
    for (k in 1:K) {
      A[, k] = (y - X %*% beta_new) * delta2[, k]
    }
    alpha_new = colSums(A - n * theta.1) / colSums(delta2)
    for (k in 1:K) {
      mu[, k] = X %*% beta_new + alpha_new[k]
    }
    # update the weights
    weight_new = (2 / (3 * n)) * colSums((y - mu) ^ 2 / (2 * theta.2) * delta2 +
      (theta.1 ^ 2 + 2 * theta.2) / (2 * theta.2) * delta3 -
      (theta.1 * (y - mu)) / (theta.2))
    error = sum(abs(beta_new - beta))   # change in beta as convergence criterion
    beta = beta_new
    alpha = alpha_new
    weight = weight_new
    iteration = iteration + 1
  }
  b = quantile(y - X %*% beta, tau.k)
  return(list(beta = beta, b = b))
}
n = 300
p = 3
X = matrix(rnorm(n*p, 0, 1), n, p)
y = rnorm(n)
CQREM(X, y, tau = tau.k, maxit = 1000, toler = 1e-5)
cqr.mm(X, y, tau= tau.k,toler = 1e-5)
Output of CQREM:
$beta
[,1]
[1,] -0.01504609
[2,] -0.14649880
[3,] 0.23344365
$b
16.66667% 33.33333% 50% 66.66667% 83.33333%
-0.881195701 -0.428558273 -0.000221886 0.470748757 1.102072419
Output of cqr.mm:
$beta
[,1]
[1,] 0.05258097
[2,] -0.05178250
[3,] 0.13393040
$b
16.66667% 33.33333% 50% 66.66667% 83.33333%
-0.9269618879 -0.3647195276 -0.0001248183 0.4284496629 1.0367478727
The EM algorithm is noticeably slower than the MM algorithm, and the two estimates also differ slightly.
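A rough way to see the speed difference is to time both fits directly (a sketch assuming `CQREM`, the cqrReg package, and the simulated `X`, `y`, `tau.k` from above are available):

system.time(CQREM(X, y, tau = tau.k, maxit = 1000, toler = 1e-5))
system.time(cqr.mm(X, y, tau = tau.k, toler = 1e-5))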
1.4 Composite Quantile Regression via Convex Optimization (CVX)
There are also some brute-force approaches, such as solving the problem directly by convex optimization (strictly speaking, "convex optimization" is a broad term: many different algorithms fit under this framework, and different solvers may not return exactly the same result).
library(CVXR)
library(cqrReg)
set.seed(1)
n = 100
p = 2
a = rnorm(n*p, mean = 1, sd = 1)
x = matrix(a, n, p)
beta = rnorm(p, 1, 1)
beta = matrix(beta, p, 1)
y = x %*% beta - matrix(rnorm(n, 0.1, 1), n, 1)
tau = 1:5/6
cqr.mm(x, y, tau)
# assume the data have been defined as above
Y <- y            # response variable
X <- x            # design matrix
taus <- tau       # vector of quantile levels
# decision variables (note that beta is redefined here as a CVXR Variable)
beta <- Variable(ncol(X))
b <- Variable(length(taus))
# quantile (check) loss
quantile_loss <- function(residuals, tau) {
  sum(pos(residuals) * tau + pos(-residuals) * (1 - tau))
}
# build each component of the objective
objective_parts <- lapply(1:length(taus), function(i) {
  quantile_loss(Y - b[i] - X %*% beta, taus[i])
})
# combine the components of the objective
objective <- Minimize(Reduce(`+`, objective_parts))
# define the optimization problem
problem <- Problem(objective, list())
# solve
result <- solve(problem)
# extract the results
result$getValue(beta)
> [,1]
> [1,] 1.470763
> [2,] 2.758932
result$getValue(b)
>
> [,1]
> [1,] -1.1859157
> [2,] -0.7531299
> [3,] -0.2539941
> [4,] 0.1203188
> [5,] 0.7931756
> cqr.mm(x,y,tau)
$beta
[,1]
[1,] 1.508830
[2,] 2.749125
$b
16.66667% 33.33333% 50% 66.66667% 83.33333%
-1.21780307 -0.78448530 -0.29961304 0.05043142 0.69900106
As can be seen, there are some differences from the MM algorithm here as well.
1.5 An Optimization Approach Based on Rcpp
Compared with CVXR, Rcpp requires less coding and avoids repeatedly defining variables, which makes it easier to wrap up and call. In simulation experiments, loops in Rcpp are also considerably faster than in base R. On this basis the model can be further extended to composite quantile regression with penalties, weights, hierarchical structure, and other variants. The relevant code follows.
We start from the basic quantile (check) loss.
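In formula form this is the standard Koenker check loss, which the `rho_koenker` function below computes elementwise:

$$\rho_\tau(u)=u\bigl(\tau-\mathbf{1}\{u<0\}\bigr)=\begin{cases}\tau\,u, & u\ge 0,\\ (\tau-1)\,u, & u<0.\end{cases}$$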
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
using namespace Rcpp;
using namespace arma;

arma::vec rho_koenker(arma::vec x, double tau){
  int n = x.n_elem;
  arma::vec y(n);
  for(int i = 0; i < n; ++i){
    if(x(i) < 0){
      y(i) = x(i) * (tau - 1);   // negative residuals weighted by (tau - 1)
    } else {
      y(i) = x(i) * tau;         // non-negative residuals weighted by tau
    }
  }
  return(y);
}
Summing it over the quantile levels (or taking a weighted sum, for weighted composite quantile regression) gives the composite check loss:
arma::vec rho_koenker_composite(arma::vec x, arma::vec taus) {
  int n = x.n_elem;
  int m = taus.n_elem;
  arma::vec total_loss(n, arma::fill::zeros);
  // loop over the quantile levels
  for(int t = 0; t < m; ++t) {
    double tau = taus(t);      // current quantile level
    arma::vec loss(n);         // loss at the current quantile level
    for(int i = 0; i < n; ++i) {
      if(x(i) < 0) {
        loss(i) = x(i) * (tau - 1);
      } else {
        loss(i) = x(i) * tau;
      }
    }
    total_loss += loss;        // accumulate the loss at this quantile level
  }
  return total_loss;           // sum of the losses over all quantile levels
}
One clear advantage of C++ here is that the exact class of every input and return value is explicit: for example, `total_loss` returned above is necessarily a vector, so the R side can be prepared accordingly when calling the function.
// [[Rcpp::export]]
double loss_cqr_composite(arma::vec beta, arma::mat X, arma::vec y,
                          arma::vec taus) {
  int N = X.n_rows;
  double eta = 0;        // total loss
  arma::vec res(N);      // residual vector
  arma::vec rho(N);      // composite loss per observation
  res = y - (X * beta);  // compute the residuals
  rho = rho_koenker_composite(res, taus);
  eta = accu(rho);
  return eta;
}
We define a main function that, given an initial value beta, the design matrix X, the response y, and the quantile sequence taus, evaluates the loss and sums it into a total loss. This completes the definition of the loss function; optimizing it yields the corresponding parameter estimates.
Calling this Rcpp function from R:
# compile the C++ functions above with Rcpp::sourceCpp() first
N = 50
p = 3
X = matrix(rnorm(N*p), nrow = N, ncol = p)
beta = matrix(c(1, 5, 3), p, 1)
eps = rnorm(N)
taus = 1:5/6
y = X %*% beta + eps
Opt <- optim(par = c(0, 0, 0), fn = loss_cqr_composite,
             method = "L-BFGS-B", y = y, X = X, taus = taus)
Opt$par
> [1] 0.9116826 4.8263317 3.1216979
This yields the parameter estimates.
Further Thoughts
Weighted Composite Quantile Regression
Composite quantile regression minimizes a sum of check losses, but it is easy to see that CQR assigns exactly the same weight to every $\rho_{\tau_k}$. It is therefore natural to consider unequal weights, which yields weighted CQR (WCQR):
$$\left(\hat{\alpha}_{1}, \ldots, \hat{\alpha}_{K}, \hat{\boldsymbol{\beta}}^{\mathrm{WCQR}}\right)=\arg\min_{\alpha_{1}, \ldots, \alpha_{K}, \boldsymbol{\beta}} \sum_{i=1}^{N} \sum_{k=1}^{K} \omega_{k} \cdot \rho_{\tau_{k}}\bigl(Y_{i}-X_{i}^{\top} \boldsymbol{\beta}-\alpha_{k}\bigr).$$
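A direct R transcription of this objective can be useful for checking solver output (a sketch; `wcqr_loss` is an illustrative helper, not part of cqrReg or CVXR):

# weighted composite check loss for intercepts alpha (length K), slope beta,
# weights w, and quantile levels tau
wcqr_loss <- function(alpha, beta, w, tau, X, y) {
  sum(sapply(seq_along(tau), function(k) {
    u <- y - X %*% beta - alpha[k]
    w[k] * sum(u * (tau[k] - (u < 0)))   # omega_k * sum_i rho_{tau_k}(u_i)
  }))
}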
The solution principle is the same as for CQR. If we set all weights equal, we should obtain exactly the same result as CQR:
Y <- y            # response variable
X <- x            # design matrix
taus <- tau       # vector of quantile levels
w = c(0.2, 0.2, 0.2, 0.2, 0.2)   # equal weights
# decision variables
beta <- Variable(ncol(X))
b <- Variable(length(taus))
# quantile (check) loss
quantile_loss <- function(residuals, tau) {
  sum(pos(residuals) * tau + pos(-residuals) * (1 - tau))
}
# build each (weighted) component of the objective
objective_parts <- lapply(1:length(taus), function(i) {
  w[i] * quantile_loss(Y - b[i] - X %*% beta, taus[i])
})
# combine the components of the objective
objective <- Minimize(Reduce(`+`, objective_parts))
# define the optimization problem
problem <- Problem(objective, list())
# solve
result <- solve(problem)
# extract the results
beta_val <- result$getValue(beta)
b_val <- result$getValue(b)
beta_val
b_val
> beta_val
[,1]
[1,] 1.470763
[2,] 2.758932
> b_val
[,1]
[1,] -1.1859157
[2,] -0.7531299
[3,] -0.2539941
[4,] 0.1203188
[5,] 0.7931756
Now set unequal weights w:
# generate n random numbers
n <- length(taus)
random_numbers <- runif(n)
# normalize so that they sum to 1
normalized_numbers <- random_numbers / sum(random_numbers)
# check that they indeed sum to 1
sum(normalized_numbers)
Y <- y            # response variable
X <- x            # design matrix
taus <- tau       # vector of quantile levels
w = normalized_numbers
# decision variables
beta <- Variable(ncol(X))
b <- Variable(length(taus))
# quantile (check) loss
quantile_loss <- function(residuals, tau) {
  sum(pos(residuals) * tau + pos(-residuals) * (1 - tau))
}
# build each (weighted) component of the objective
objective_parts <- lapply(1:length(taus), function(i) {
  w[i] * quantile_loss(Y - b[i] - X %*% beta, taus[i])
})
# combine the components of the objective
objective <- Minimize(Reduce(`+`, objective_parts))
# define the optimization problem
problem <- Problem(objective, list())
# solve
result <- solve(problem)
# extract the results
beta_val <- result$getValue(beta)
b_val <- result$getValue(b)
> beta_val
[,1]
[1,] 1.525781
[2,] 2.736585
> b_val
[,1]
[1,] -1.23440023
[2,] -0.79443834
[3,] -0.32373464
[4,] 0.05565281
[5,] 0.79790504
As expected, the estimates now differ.