# 基于 TensorFlow.NET 的 RAdam 与 Ranger 优化器
## Ranger 优化器（RAdam + Lookahead）
// Ranger optimizer = RAdam (rectified Adam) + Lookahead, built as TF graph ops.
// The inner GradientDescentOptimizer is used only to apply the fully precomputed
// update tensors; all adaptive logic lives in ApplyGradients.
class CustomizeOptimizer_Ranger
{
    private Optimizer Opt = null;                 // inner SGD that applies the precomputed updates
    private ResourceVariable[] G = null;          // per-variable gradient accumulators (accumulation over `Block` micro-batches)
    private List<IVariableV1> VlistT = null;      // trainable variables captured by ComputeGradient
    private ResourceVariable Step = null;         // step counter t; stored as float so tf.pow(beta, Step) works
    private ResourceVariable[] M = null;          // first-moment (mean) estimates, one per variable
    private ResourceVariable[] V = null;          // second-moment (uncentered variance) estimates
    public ResourceVariable[] SlowWeight = null;  // Lookahead slow weights
    private Tensor eps = tf.constant(1e-7f, TF_DataType.TF_FLOAT); // numerical-stability term in the denominator
    private Tensor beta1 = null, beta2 = null, alpha = null;       // alpha = Lookahead interpolation factor
    public ResourceVariable K = null;             // Lookahead countdown; slow/fast sync happens when it drops below 1
    public int Kv;                                // Lookahead period k (K is reset to this after each sync)

    public CustomizeOptimizer_Ranger(Tensor _LearningRate, float _beta1 = 0.9f, float _beta2 = 0.999f, int _k = 5, float _alpha = 0.5f)
    {
        Kv = _k;
        K = tf.Variable(_k, dtype: TF_DataType.TF_INT32);
        alpha = tf.constant(_alpha, TF_DataType.TF_FLOAT);
        beta1 = tf.constant(_beta1, TF_DataType.TF_FLOAT);
        beta2 = tf.constant(_beta2, TF_DataType.TF_FLOAT);
        Opt = tf.train.GradientDescentOptimizer(_LearningRate);
    }

    // Creates the Lookahead slow weights and returns the ops that copy the
    // current (fast) weights into them; run these once before training starts.
    public Tensor[] InitSlowWeight(List<IVariableV1> Vlist)
    {
        SlowWeight = new IVariableV1[Vlist.Count()].Select((s, i) => TFModules.WeightZero(Vlist[i].shape, "G")).ToArray();
        return SlowWeight.Select((s, i) => tf.assign(s, Vlist[i])).ToArray();
    }

    // Builds optimizer state (Step/M/V/G) and returns ops that accumulate the
    // gradients of Loss into G, scaled by 1/Block so that running them over
    // `Block` micro-batches yields the averaged gradient.
    public ITensorOrOperation[] ComputeGradient(Tensor Loss, List<IVariableV1> Vlist, int Block)
    {
        VlistT = Vlist;
        Step = tf.Variable(0, dtype: TF_DataType.TF_FLOAT);
        M = new IVariableV1[Vlist.Count()].Select((s, i) => tf.Variable(0, dtype: TF_DataType.TF_FLOAT, shape: Vlist[i].shape)).ToArray();
        V = new IVariableV1[Vlist.Count()].Select((s, i) => tf.Variable(0, dtype: TF_DataType.TF_FLOAT, shape: Vlist[i].shape)).ToArray();
        G = new IVariableV1[Vlist.Count()].Select((s, i) => tf.Variable(0, dtype: TF_DataType.TF_FLOAT, shape: Vlist[i].shape)).ToArray();
        var gradient = Opt.compute_gradients(Loss, Vlist);
        return G.Select((s, i) => tf.assign_add(s, gradient[i].Item1 / (float)(Block))).ToArray();
    }

    // Returns the update ops; the caller MUST run them in order (G0..G8),
    // because later groups read state written by earlier ones.
    public List<Operation> ApplyGradients()
    {
        var Op_K = tf.assign_add(K, -1);   // Lookahead countdown
        var Op_S = tf.assign_add(Step, 1f); // t <- t + 1
        var Op_M = M.Select((s, i) => tf.assign(s, (s * beta1) + (1f - beta1) * G[i])).ToArray();
        var Op_V = V.Select((s, i) => tf.assign(s, (s * beta2) + (1f - beta2) * tf.square(G[i]))).ToArray();
        var M_Hat = M.Select((s, i) => s / (1f - tf.pow(beta1, Step))).ToArray();          // bias-corrected first moment
        var V_Hat = V.Select((s, i) => tf.sqrt(s / (1f - tf.pow(beta2, Step)))).ToArray(); // sqrt of bias-corrected second moment
        var P_inf = 2f / (1f - beta2) - 1f; // rho_inf: maximum length of the approximated SMA
        var P_t = P_inf - Step * 2f * tf.pow(beta2, Step) / (1f - tf.pow(beta2, Step));    // rho_t for the current step
        // Rectification term r_t; relu guards the radicand, which is negative while rho_t <= 4.
        var R_t = tf.sqrt(tf.nn.relu(((P_t - 4f) * (P_t - 2f) * P_inf) / ((P_inf - 4f) * (P_inf - 2f) * P_t)));
        var T1 = G.Select((s, i) => R_t * M_Hat[i] / (V_Hat[i] + eps)).ToArray(); // rectified adaptive update
        var T2 = G.Select((s, i) => M_Hat[i]).ToArray();                          // plain momentum update (warmup fallback)
        // BUGFIX: the RAdam switch must test the per-step rho_t, not the constant
        // rho_inf (~1999 for beta2 = 0.999). With rho_inf the warmup branch was dead
        // code and early steps produced near-zero updates (R_t ~ 0).
        var Grad = T1.Select((s, i) => s * tf.cast(P_t > 4f, TF_DataType.TF_FLOAT) + T2[i] * tf.cast(P_t <= 4f, TF_DataType.TF_FLOAT)).ToArray();
        var G_V = Grad.Select((g, i) => new Tuple<Tensor, IVariableV1>(g, VlistT[i])).ToArray();
        var Op_A = Opt.apply_gradients(G_V); // SGD step with the precomputed update tensors
        var Op_C = G.Select(s => tf.assign(s, tf.zeros_like(s))).ToArray(); // clear accumulators for the next macro-batch
        // KMask is 1.0 on sync steps (K < 1), else 0.0; `K * 1` lifts the int variable
        // into a Tensor so the comparison builds a graph op.
        var KMask = tf.cast(K * 1 < 1, TF_DataType.TF_FLOAT);
        // Lookahead: slow += alpha * (fast - slow), only on sync steps.
        var Op_SW = SlowWeight.Select((s, i) => tf.assign_add(s, ((ResourceVariable)VlistT[i] - s) * alpha * KMask)).ToArray();
        // On sync steps copy the slow weights back into the fast weights; otherwise keep fast.
        var Grad2 = SlowWeight.Select((s, i) => s * KMask + (ResourceVariable)VlistT[i] * (1f - KMask)).ToArray();
        var Op_StoF = Grad2.Select((s, i) => tf.assign(VlistT[i], s)).ToArray();
        var Op_K2 = tf.assign_add(K, tf.cast(KMask, TF_DataType.TF_INT32) * Kv); // reset countdown after a sync
        var G0 = tf.group(new[] { Op_K });
        var G1 = tf.group(new[] { Op_S });
        var G2 = tf.group(Op_M);
        var G3 = tf.group(Op_V);
        var G4 = tf.group(new[] { Op_A });
        var G5 = tf.group(Op_C);
        var G6 = tf.group(Op_SW);
        var G7 = tf.group(Op_StoF);
        var G8 = tf.group(new[] { Op_K2 });
        // The operations must be executed sequentially, in this order.
        return new List<Operation>() { G0, G1, G2, G3, G4, G5, G6, G7, G8 };
    }
}
## RAdam 优化器
// RAdam (Rectified Adam) optimizer built as TF graph ops on top of
// TensorFlow.NET's GradientDescentOptimizer, which only applies the
// precomputed update tensors.
class CustomizeOptimizer_RAdam
{
    private Optimizer Opt = null;            // inner SGD that applies the precomputed updates
    private ResourceVariable[] G = null;     // per-variable gradient accumulators (accumulation over `Block` micro-batches)
    private List<IVariableV1> VlistT = null; // trainable variables captured by ComputeGradient
    private ResourceVariable Step = null;    // step counter t; stored as float so tf.pow(beta, Step) works
    private ResourceVariable[] M = null;     // first-moment (mean) estimates, one per variable
    private ResourceVariable[] V = null;     // second-moment (uncentered variance) estimates
    private Tensor eps = tf.constant(1e-7f, TF_DataType.TF_FLOAT); // numerical-stability term in the denominator
    private Tensor beta1 = null, beta2 = null;

    public CustomizeOptimizer_RAdam(Tensor _LearningRate, float _beta1 = 0.9f, float _beta2 = 0.999f)
    {
        beta1 = tf.constant(_beta1, TF_DataType.TF_FLOAT);
        beta2 = tf.constant(_beta2, TF_DataType.TF_FLOAT);
        Opt = tf.train.GradientDescentOptimizer(_LearningRate);
    }

    // Builds optimizer state (Step/M/V/G) and returns ops that accumulate the
    // gradients of Loss into G, scaled by 1/Block so that running them over
    // `Block` micro-batches yields the averaged gradient.
    public ITensorOrOperation[] ComputeGradient(Tensor Loss, List<IVariableV1> Vlist, int Block)
    {
        VlistT = Vlist;
        // BUGFIX: removed `Lrt = tf.Variable(...)` — `Lrt` was never declared in
        // this class (copy-paste leftover from the Ranger variant) and never used.
        Step = tf.Variable(0, dtype: TF_DataType.TF_FLOAT);
        M = new IVariableV1[Vlist.Count()].Select((s, i) => tf.Variable(0, dtype: TF_DataType.TF_FLOAT, shape: Vlist[i].shape)).ToArray();
        V = new IVariableV1[Vlist.Count()].Select((s, i) => tf.Variable(0, dtype: TF_DataType.TF_FLOAT, shape: Vlist[i].shape)).ToArray();
        G = new IVariableV1[Vlist.Count()].Select((s, i) => tf.Variable(0, dtype: TF_DataType.TF_FLOAT, shape: Vlist[i].shape)).ToArray();
        var gradient = Opt.compute_gradients(Loss, Vlist);
        return G.Select((s, i) => tf.assign_add(s, gradient[i].Item1 / (float)(Block))).ToArray();
    }

    // Returns the update ops; the caller MUST run them in order (G0..G4),
    // because later groups read state written by earlier ones.
    public List<Operation> ApplyGradients()
    {
        var Op_S = tf.assign_add(Step, 1f); // t <- t + 1
        var Op_M = M.Select((s, i) => tf.assign(s, (s * beta1) + (1f - beta1) * G[i])).ToArray();
        var Op_V = V.Select((s, i) => tf.assign(s, (s * beta2) + (1f - beta2) * tf.square(G[i]))).ToArray();
        var M_Hat = M.Select((s, i) => s / (1f - tf.pow(beta1, Step))).ToArray();          // bias-corrected first moment
        var V_Hat = V.Select((s, i) => tf.sqrt(s / (1f - tf.pow(beta2, Step)))).ToArray(); // sqrt of bias-corrected second moment
        var P_inf = 2f / (1f - beta2) - 1f; // rho_inf: maximum length of the approximated SMA
        var P_t = P_inf - Step * 2f * tf.pow(beta2, Step) / (1f - tf.pow(beta2, Step));    // rho_t for the current step
        // Rectification term r_t; relu guards the radicand, which is negative while rho_t <= 4.
        var R_t = tf.sqrt(tf.nn.relu(((P_t - 4f) * (P_t - 2f) * P_inf) / ((P_inf - 4f) * (P_inf - 2f) * P_t)));
        var T1 = G.Select((s, i) => R_t * M_Hat[i] / (V_Hat[i] + eps)).ToArray(); // rectified adaptive update
        var T2 = G.Select((s, i) => M_Hat[i]).ToArray();                          // plain momentum update (warmup fallback)
        // BUGFIX: the RAdam switch must test the per-step rho_t, not the constant
        // rho_inf (~1999 for beta2 = 0.999). With rho_inf the warmup branch was dead
        // code and early steps produced near-zero updates (R_t ~ 0).
        var Grad = T1.Select((s, i) => s * tf.cast(P_t > 4f, TF_DataType.TF_FLOAT) + T2[i] * tf.cast(P_t <= 4f, TF_DataType.TF_FLOAT)).ToArray();
        var G_V = Grad.Select((g, i) => new Tuple<Tensor, IVariableV1>(g, VlistT[i])).ToArray();
        var Op_A = Opt.apply_gradients(G_V); // SGD step with the precomputed update tensors
        var Op_C = G.Select(s => tf.assign(s, tf.zeros_like(s))).ToArray(); // clear accumulators for the next macro-batch
        var G0 = tf.group(new[] { Op_S });
        var G1 = tf.group(Op_M);
        var G2 = tf.group(Op_V);
        var G3 = tf.group(new[] { Op_A });
        var G4 = tf.group(Op_C);
        // The operations must be executed sequentially, in this order.
        return new List<Operation>() { G0, G1, G2, G3, G4 };
    }
}
```