在线学习算法:FTRL
下面的程序来自:https://www.kaggle.com/jiweiliu/ftrl-starter-code/code
# --- FTRL-Proximal hyperparameters ---------------------------------------
# Learning rate.
alpha = 0.005
# Smoothing term of the adaptive (per-coordinate) learning rate.
beta = 1.0
# L1 regularization strength; a larger value regularizes more.
L1 = 0.2
# L2 regularization strength; a larger value regularizes more.
L2 = 1.0

# --- Feature hashing ------------------------------------------------------
# Number of weights to use (2 to the power 24).
D = 1 << 24
# Whether poly2 feature interactions are enabled.
interaction = False
class ftrl_proximal(object):
    ''' Our main algorithm: Follow the regularized leader - proximal

        In short,
        this is an adaptive-learning-rate sparse logistic-regression with
        efficient L1-L2-regularization

        Typical usage is one `predict(x)` followed by one `update(x, p, y)`
        per example: `update` reads the weights that `predict` cached.

        Reference:
        http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf
    '''

    def __init__(self, alpha, beta, L1, L2, D, interaction):
        ''' Store the hyperparameters and allocate the model state

            INPUT:
                alpha: learning rate
                beta: smoothing parameter for the adaptive learning rate
                L1: L1 regularization strength
                L2: L2 regularization strength
                D: number of weights (size of the hashed feature space)
                interaction: whether to add pairwise feature interactions
        '''
        # parameters
        self.alpha = alpha
        self.beta = beta
        self.L1 = L1
        self.L2 = L2

        # feature related parameters
        self.D = D
        self.interaction = interaction

        # model
        # n: squared sum of past gradients
        # z: weights
        # w: lazy weights
        self.n = [0.] * D
        # z starts from uniform random values in [0, 1);
        # an all-zero start ([0.] * D) is the alternative initialization.
        self.z = [random() for _ in range(D)]
        # filled by predict(); empty until the first prediction
        self.w = {}

    def _indices(self, x):
        ''' A helper generator that yields the indices in x

            The purpose of this generator is to make the following
            code a bit cleaner when doing feature interaction.

            INPUT:
                x: a sequence of feature indices (len() and sorted() are
                   applied to it when interactions are enabled)

            YIELDS:
                0 for the bias term, then every index in x, then (only if
                self.interaction is true) one hashed index in [0, D) for
                every unordered pair of entries of x
        '''
        # first yield index of the bias term
        yield 0

        # then yield the normal indices
        for index in x:
            yield index

        # now yield interactions (if applicable)
        if self.interaction:
            D = self.D
            L = len(x)
            # sort so that a pair hashes the same regardless of input order
            x = sorted(x)
            # `range` (not the Python-2-only `xrange`) keeps this runnable
            # on Python 3 and matches the `range` used in __init__
            for i in range(L):
                for j in range(i + 1, L):
                    # one-hot encode interactions with hash trick
                    # NOTE: on Python 3 str hashes are salted per process
                    # unless PYTHONHASHSEED is fixed, so these indices are
                    # only guaranteed stable within a single run.
                    yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D

    def predict(self, x):
        ''' Get probability estimation on x

            INPUT:
                x: features, a sequence of indices

            OUTPUT:
                probability of p(y = 1 | x; w)

            MODIFIES:
                self.w: replaced by the weights computed for the indices
                        of x, so that update() can reuse them
        '''
        # parameters
        alpha = self.alpha
        beta = self.beta
        L1 = self.L1
        L2 = self.L2

        # model
        n = self.n
        z = self.z
        w = {}

        # wTx is the inner product of w and x
        # (every active feature has value 1, so it is a plain sum of weights)
        wTx = 0.
        for i in self._indices(x):
            sign = -1. if z[i] < 0 else 1.  # get sign of z[i]

            # build w on the fly using z and n, hence the name - lazy weights
            # we do this at prediction time instead of update time because
            # it lets us avoid storing the complete w
            if sign * z[i] <= L1:
                # w[i] vanishes due to L1 regularization
                w[i] = 0.
            else:
                # apply prediction time L1, L2 regularization to z and get w
                w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2)

            wTx += w[i]

        # cache the current w for update stage
        self.w = w

        # bounded sigmoid function, this is the probability estimation
        # (the clamp to [-35, 35] keeps exp() from overflowing)
        return 1. / (1. + exp(-max(min(wTx, 35.), -35.)))

    def update(self, x, p, y):
        ''' Update model using x, p, y

            Must be called after predict(x) on the same x: the weights are
            read from the self.w cache that predict() fills, and an index
            missing from that cache raises KeyError.

            INPUT:
                x: feature, a list of indices
                p: click probability prediction of our model
                y: answer

            MODIFIES:
                self.n: increase by squared gradient
                self.z: weights
        '''
        # parameter
        alpha = self.alpha

        # model
        n = self.n
        z = self.z
        w = self.w

        # gradient under logloss
        g = p - y

        # update z and n
        for i in self._indices(x):
            # sigma is the change in the per-coordinate 1 / learning-rate
            sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha
            z[i] += g - sigma * w[i]
            n[i] += g * g
参考:
1.《动手学深度学习》
2.http://ruder.io/optimizing-gradient-descent/
3.Ad Click Prediction: a View from the Trenches