今天给大家带来了稳健回归。这个命题源自我的博士同学,他在北京的儿科,和我问道,线性回归太不稳定,离群值都处理不了,问我有没有可替代的优异一些的方法,方法是有的,因为临床研究离不开连续变量的分析,所以我推荐稳健回归,稳健回归可以看作是多传统的线性回归的改良。这里介绍的稳健回归包括Theil Sen回归,RANSAC回归,Huber回归,怎么样?单凭名字就可以在论文里出彩吧?
# linear regression on a dataset with outliers
from random import random
from random import randint
from random import seed
from numpy import arange
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from matplotlib import pyplot
# prepare the dataset
def get_dataset():
X, y = make_regression(n_samples=100, n_features=1, tail_strength=0.9, effective_rank=1, n_informative=1, noise=3, bias=50, random_state=1)
# add some artificial outliers
seed(1)
for i in range(10):
factor = randint(2, 4)
if random() > 0.5:
X[i] += factor * X.std()
else:
X[i] -= factor * X.std()
return X, y
# evaluate a model
def evaluate_model(X, y, model):
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
return absolute(scores)
# plot the dataset and the model's line of best fit
def plot_best_fit(X, y, model):
# fut the model on all data
model.fit(X, y)
# plot the dataset
pyplot.scatter(X, y)
# plot the line of best fit
xaxis = arange(X.min(), X.max(), 0.01)
yaxis = model.predict(xaxis.reshape((len(xaxis), 1)))
pyplot.plot(xaxis, yaxis, color='r')
# show the plot
pyplot.title(type(model).__name__)
pyplot.show()
# load dataset
X, y = get_dataset()
# define the model
model = LinearRegression()
# evaluate model
results = evaluate_model(X, y, model)
print('Mean MAE: %.3f (%.3f)' % (mean(results), std(results)))
# plot the line of best fit
plot_best_fit(X, y, model)
################Huber Regression
# huber regression on a dataset with outliers
from rando