LSTM in C++ with Adam optimization: predicting Chinese capital numerals

A hand-written C++ implementation of the LSTM algorithm, with explanations. It uses Adam optimization, which speeds up convergence dramatically: compared with my earlier, unoptimized version, convergence is roughly 350 times faster. If you have questions, email fan1974815@126.com or ask in the comments below and I will try to answer.
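For reference, every weight and bias in the code is updated with the same per-parameter Adam rule (first moment, second moment, bias correction by the current epoch), written out inline in train() for each matrix. The helper below is only my own compact summary of that inline update, assuming beta1, beta2, alpha and eps are the #defines from the header; the function adamUpdate itself does not appear in the project:

//Sketch of the per-parameter Adam step that train() applies inline to every weight and bias.
//t is the 1-based step counter (epoch + 1 in the code below).
void adamUpdate(double &w, double &m, double &v, double gradient, int t)
{
	m = beta1 * m + (1.0 - beta1) * gradient;             //first moment m(t)
	v = beta2 * v + (1.0 - beta2) * gradient * gradient;  //second moment v(t)
	double mhat = m / (1.0 - pow(beta1, t));              //bias-corrected first moment
	double vhat = v / (1.0 - pow(beta2, t));              //bias-corrected second moment
	w += alpha * mhat / (sqrt(vhat) + eps);               //the delta already carries the sign, hence +=
}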
Straight to the code, in three parts: the header (.h), the source (.cpp), and how to call it. Built on the VS2015 platform.
1. Header file
#pragma once
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
#include <assert.h>
#include "DataPreProc.h"
#include <string>
using namespace std;

class AdamLSTM
{
public:
AdamLSTM();
~AdamLSTM();
#define innode 1 //number of input nodes (holds the feature vector)
#define hidenode 128 //number of hidden nodes
#define outnode 10 //number of output nodes; each time step's result is one vector
#define alpha 0.0005 //learning rate
#define timesteps 3 //number of time steps
#define epochs 25000
#define trainErrThreshold 0.9
#define beta1 0.8
#define beta2 0.999
#define eps 1e-8

#define random(x) rand()%(x)
#define randval(high) ( (double)rand() / RAND_MAX * high )
#define uniform_plus_minus_one ( (double)( 2.0 * rand() ) / ((double)RAND_MAX + 1.0) - 1.0 ) //uniform random value in [-1, 1)

#define CHINESE

double W_I[innode][hidenode];     //weights from the input to the input gate of the hidden units
double W_I_m[innode][hidenode];     //first moment of the input-gate input weights
double W_I_v[innode][hidenode];     //second moment of the input-gate input weights
double U_I[hidenode][hidenode];   //weights from the previous hidden output to the input gate
double U_I_m[hidenode][hidenode];   //first moment of the input-gate recurrent weights
double U_I_v[hidenode][hidenode];   //second moment of the input-gate recurrent weights
double B_I[hidenode][1];   //bias of the input gate
double B_I_m[hidenode][1];   //first moment of the input-gate bias
double B_I_v[hidenode][1];   //second moment of the input-gate bias

double W_F[innode][hidenode];     //weights from the input to the forget gate of the hidden units
double W_F_m[innode][hidenode];     //first moment of the forget-gate input weights
double W_F_v[innode][hidenode];     //second moment of the forget-gate input weights
double U_F[hidenode][hidenode];   //weights from the previous hidden output to the forget gate
double U_F_m[hidenode][hidenode];   //first moment of the forget-gate recurrent weights
double U_F_v[hidenode][hidenode];   //second moment of the forget-gate recurrent weights
double B_F[hidenode][1];    //bias of the forget gate
double B_F_m[hidenode][1];    //first moment of the forget-gate bias
double B_F_v[hidenode][1];    //second moment of the forget-gate bias

double W_O[innode][hidenode];     //weights from the input to the output gate of the hidden units
double W_O_m[innode][hidenode];     //first moment of the output-gate input weights
double W_O_v[innode][hidenode];     //second moment of the output-gate input weights
double U_O[hidenode][hidenode];   //weights from the previous hidden output to the output gate
double U_O_m[hidenode][hidenode];   //first moment of the output-gate recurrent weights
double U_O_v[hidenode][hidenode];   //second moment of the output-gate recurrent weights
double B_O[hidenode][1];    //bias of the output gate
double B_O_m[hidenode][1];    //first moment of the output-gate bias
double B_O_v[hidenode][1];    //second moment of the output-gate bias

double W_G[innode][hidenode];     //weights from the input for generating the new memory (candidate)
double W_G_m[innode][hidenode];     //first moment of the candidate input weights
double W_G_v[innode][hidenode];     //second moment of the candidate input weights
double U_G[hidenode][hidenode];   //weights from the previous hidden output for generating the new memory
double U_G_m[hidenode][hidenode];   //first moment of the candidate recurrent weights
double U_G_v[hidenode][hidenode];   //second moment of the candidate recurrent weights
double B_G_m[hidenode][1]; //first moment of the candidate bias
double B_G_v[hidenode][1]; //second moment of the candidate bias
double B_G[hidenode][1]; //bias for generating the new memory

double W_out[hidenode][outnode];  //weights from the hidden layer to the output layer
double W_out_m[hidenode][outnode];  //first moment of the output-layer weights
double W_out_v[hidenode][outnode];  //second moment of the output-layer weights
double B_out[outnode][1];  //bias of the output layer
double B_out_m[outnode][1];  //first moment of the output-layer bias
double B_out_v[outnode][1];  //second moment of the output-layer bias


double dsigmoid(double y);
double sigmoid(double x);
//derivative of tanh: returns 1 - tanh(y)^2
double dtanh(double y);
void winit(double w[], int n); //random weight initialization
void winit_withiZero(double w[], int n); //zero weight initialization

void train();
vector<string> encode2Truth(map<string, int> &dsMap, vector<double *> &predictedM);
vector<string> encode2Sample(map<string, int> &dsMap, vector<int> &sample);
vector<int> sample2Encode(map<string, int> &dsMap, vector<string> sample);
void BubbleSort(double  *p, int length, int * ind_diff);
void predict();

};
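Note: the header includes DataPreProc.h, which is not listed in this post. Judging from how it is used below, loadDataset turns a space-separated string of tokens into a map from each token to a 1-based index, and split breaks a string on a delimiter. The following is only my own minimal stand-in written from those assumptions, not the original class:

#pragma once
#include <map>
#include <string>
#include <vector>
using namespace std;

//Hypothetical sketch of DataPreProc, inferred from its usage in AdamLSTM.
class DataPreProc
{
public:
	//Split `s` on `delim`, skipping empty pieces.
	vector<string> split(const string &s, const string &delim)
	{
		vector<string> out;
		size_t start = 0, pos;
		while ((pos = s.find(delim, start)) != string::npos)
		{
			if (pos > start) out.push_back(s.substr(start, pos - start));
			start = pos + delim.size();
		}
		if (start < s.size()) out.push_back(s.substr(start));
		return out;
	}

	//Map every distinct space-separated token to a 1-based index
	//(the training and decoding code below compares against index + 1).
	map<string, int> loadDataset(const char *s)
	{
		map<string, int> ds;
		vector<string> tokens = split(string(s), " ");
		int idx = 1;
		for (size_t i = 0; i < tokens.size(); i++)
		{
			if (ds.find(tokens[i]) == ds.end())
				ds[tokens[i]] = idx++;
		}
		return ds;
	}
};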

2. Source file
/* Uses Adam optimization; the update formulas follow
https://machinelearningmastery.com/adam-optimization-from-scratch/
*/
#include "stdafx.h"
#include "AdamLSTM.h"

AdamLSTM::AdamLSTM()
{
winit((double*)W_I, hidenode * innode);
winit((double*)U_I, hidenode * hidenode);
winit((double*)B_I, hidenode * 1);
winit((double*)W_F, hidenode * innode);
winit((double*)U_F, hidenode * hidenode);
winit((double*)B_F, hidenode * 1);
winit((double*)W_O, hidenode * innode);
winit((double*)U_O, hidenode * hidenode);
winit((double*)B_O, hidenode * 1);
winit((double*)W_G, hidenode * innode);
winit((double*)U_G, hidenode * hidenode);
winit((double*)B_G, hidenode * 1);
winit((double*)W_out, hidenode * outnode);
winit((double*)B_out, outnode * 1);

winit_withiZero((double*)W_I_m, hidenode * innode);
winit_withiZero((double*)W_I_v, hidenode * innode);
winit_withiZero((double*)U_I_m, hidenode * hidenode);
winit_withiZero((double*)U_I_v, hidenode * hidenode);
winit_withiZero((double*)B_I_m, hidenode * 1);
winit_withiZero((double*)B_I_v, hidenode * 1);

winit_withiZero((double*)W_F_m, hidenode * innode);
winit_withiZero((double*)W_F_v, hidenode * innode);
winit_withiZero((double*)U_F_m, hidenode * hidenode);
winit_withiZero((double*)U_F_v, hidenode * hidenode);
winit_withiZero((double*)B_F_m, hidenode * 1);
winit_withiZero((double*)B_F_v, hidenode * 1);

winit_withiZero((double*)W_O_m, hidenode * innode);
winit_withiZero((double*)W_O_v, hidenode * innode);
winit_withiZero((double*)U_O_m, hidenode * hidenode);
winit_withiZero((double*)U_O_v, hidenode * hidenode);
winit_withiZero((double*)B_O_m, hidenode * 1);
winit_withiZero((double*)B_O_v, hidenode * 1);

winit_withiZero((double*)W_G_m, hidenode * innode);
winit_withiZero((double*)W_G_v, hidenode * innode);
winit_withiZero((double*)U_G_m, hidenode * hidenode);
winit_withiZero((double*)U_G_v, hidenode * hidenode);
winit_withiZero((double*)B_G_m, hidenode * 1);
winit_withiZero((double*)B_G_v, hidenode * 1);

winit_withiZero((double*)W_out_m, hidenode * outnode);
winit_withiZero((double*)W_out_v, hidenode * outnode);

winit_withiZero((double*)B_out_m, outnode * 1);
winit_withiZero((double*)B_out_v, outnode * 1);

}

AdamLSTM::~AdamLSTM()
{
}

double AdamLSTM::dsigmoid(double y)
{
return y * (1.0 - y);
}

double AdamLSTM::sigmoid(double x)
{
return 1.0 / (1.0 + exp(-x));
}

double AdamLSTM::dtanh(double y)
{
y = tanh(y);
return 1.0 - y * y;
}

void AdamLSTM::winit(double w[], int n)
{
for (int i = 0; i<n; i++)
{
w[i] = uniform_plus_minus_one; //uniform random value in [-1, 1)
/* cout << "w" << i << "=" << w[i] << endl;*/
}
}

void AdamLSTM::winit_withiZero(double w[], int n)
{
for (int i = 0; i<n; i++)
{
w[i] = 0.0;
/* cout << "w" << i << "=" << w[i] << endl;*/
}
}

void AdamLSTM::train()
{
//char s[] = “十 四 五 规 划 和 二 零 三 五 年 远 景 目 标 纲 要 明 确 实 施 积 极 应 对 人 口 老 龄 化 国 家 战 略 制 定 人 口 长 期 发 展 战 略 优 化 生 育 政 策”;
//string ss = “十四五规划和二零三五年远景目标纲要明确实施积极应对人口老龄化国家战略制定人口长期发展战略优化生育政策”;
//char s[] = “锄 禾 日 当 午 汗 滴 和 下 土 谁 知 盘 中 餐 粒 粒 皆 辛 苦”;
//string ss = “锄禾日当午汗滴和下土谁知盘中餐粒粒皆辛苦”;

char s[] = "壹 贰 叁 肆 伍 陆 柒 捌 玖 拾";
string ss = "壹贰叁肆伍陆柒捌玖拾";
int epoch = 0, i, j, k, m, p;
//intermediate values

vector<double*> I_vector;      //input gate
vector<double*> F_vector;      //forget gate
vector<double*> O_vector;      //output gate
vector<double*> G_vector;      //new memory (candidate)
vector<double*> S_vector;      //cell state
vector<double*> h_vector;      //hidden output
vector<double*> y_delta_vector;        //partial derivatives of the error with respect to the output layer
vector<double *>predictV;
vector<double*>truth_steps;//ground truth of every time step of the current sample; must be cleared per sample
DataPreProc dpp;
map<string, int> dsMap = dpp.loadDataset(s);
vector<int> curSample;//cleared per sample; stores the input index of each time step of the current sample
vector<int> curTimestep_truth_10;//decimal ground truth of each time step; cleared before every training sample


int offset = 0;

#ifdef CHINESE
int end_offset = (timesteps + 0) * 2;//when samples are picked at a random offset, add 0 here rather than 1, because the random upper bound is exclusive
#else
int end_offset = (timesteps + 1);
#endif // CHINESE
int randomMax = ss.size() - end_offset;//upper bound of the random offset when picking a sample
cout << "randomMax: " << randomMax << endl;

double e = 10000000.0;  //error

						/*for (epoch = 0; epoch < epochs; epoch++) */ //alternative: train for a fixed number of epochs
while (e > trainErrThreshold)
{
	//1. Reset and initialize
	double * x = new double[innode];
	if ((offset + end_offset) >= ss.size())
	{
		offset = 0;
	}
	e = 0.0;

	//the weight deltas of every time step could be accumulated and applied once per sample; that is not considered here
	//fetch the current sample
	curSample.clear();
	curTimestep_truth_10.clear();

#ifdef CHINESE
for (size_t i = 0; i < timesteps * 2; i = i + 2)//fetch the timesteps inputs of this sample; each Chinese character takes 2 bytes, hence the step of 2
{
curSample.push_back(dsMap[ss.substr(offset + i, 2)]);
curTimestep_truth_10.push_back(dsMap[ss.substr(offset + i + 2, 2)]);
//cout << curSample.back() << endl;
//cout << curTimestep_truth_10.back() << endl;
}
#else
for (size_t i = 0; i < timesteps; i = i + 1)//fetch the timesteps inputs of this sample; one position per character here
{
curSample.push_back(dsMap[ss.substr(offset + i, 1)]);
curTimestep_truth_10.push_back(dsMap[ss.substr(offset + i + 1, 1)]);
//cout << curSample.back() << endl;
//cout << curTimestep_truth_10.back() << endl;
}
#endif // CHINESE
//one-hot encode the ground truth
truth_steps.clear();
for (size_t i = 0; i < timesteps; i++)
{
double *LableD = new double[outnode] {0};//note: must be a pointer
for (size_t j = 0; j < dsMap.size(); j++)
{
if ((j + 1) == curTimestep_truth_10[i])
{
LableD[j] = 1.0f;
/*cout << "LableD:" << LableD[j] << endl;*/
}
else
{
LableD[j] = 0.0f;
/*cout << "LableD:" << LableD[j] << endl;*/
}
}
/*cout << "ground truth of this time step:" << endl << curTimestep_truth_10[i] << endl;*/
truth_steps.push_back(LableD);//the array must be newly allocated each step, otherwise every step would reference the same data
/*cout << "truth_steps=" << i << endl << truth_steps.back() << endl;*/
}
//Forward pass: run through all the time steps
for (p = 0; p < timesteps; p++)//p is the index of the current time step
{
x[0] = (float)curSample[p];
/*cout << "x[0]" << p << "=" << x[0] << endl;*/
double *in_gate = new double[hidenode] {0}; //input gate after the sigmoid
double *out_gate = new double[hidenode] {0}; //output gate after the sigmoid
double *forget_gate = new double[hidenode] {0}; //forget gate after the sigmoid
double *g_gate = new double[hidenode] {0}; //new memory (candidate) after the tanh
double *state = new double[hidenode] {0}; //cell state
double *h = new double[hidenode] {0}; //hidden output
double *output = new double[outnode] {0};//raw output of the current time step
double *y_pre = new double[outnode] {0};//prediction of the current time step
double * truthLabel = new double[outnode] {0};//ground truth of the current time step
double * y_delta = new double[outnode] {0};//output-layer delta of the current time step
if (p == 0)
{
//there is no previous hidden layer at t = 0, so initialize one filled with zeros
double *S = new double[hidenode] {0}; //cell state
double *h = new double[hidenode] {0}; //hidden output
for (size_t i = 0; i < hidenode; i++)
{
S[i] = 0.0;
h[i] = 0.0;
}
S_vector.push_back(S);
h_vector.push_back(h);
}
for (size_t j = 0; j < hidenode; j++)
{
double inGateValue = 0;
double forgetGateValue = 0;
double outGateValue = 0;
double gGateValue = 0;
//contribution of the current input
for (size_t k = 0; k < innode; k++)
{
forgetGateValue += x[k] * W_F[k][j];

				inGateValue += x[k] * W_I[k][j];
				gGateValue += x[k] * W_G[k][j];
				outGateValue += x[k] * W_O[k][j];
			}
			//previous state
			double * h_pre = h_vector.back();
			double * state_pre = S_vector.back();
			for (size_t i = 0; i < hidenode; i++)
			{
				forgetGateValue += h_pre[i] * U_F[i][j];
				inGateValue += h_pre[i] * U_I[i][j];
				gGateValue += h_pre[i] * U_G[i][j];
				outGateValue += h_pre[i] * U_O[i][j];
			}

			//bias
			forgetGateValue += B_F[j][0] * 1.0;
			inGateValue += B_I[j][0] * 1.0;
			gGateValue += B_G[j][0] * 1.0;
			outGateValue += B_O[j][0] * 1.0;

			in_gate[j] = sigmoid(inGateValue);
			out_gate[j] = sigmoid(outGateValue);
			forget_gate[j] = sigmoid(forgetGateValue);
			g_gate[j] = tanh(gGateValue);//the candidate memory uses tanh here

			double s_pre = state_pre[j];
			state[j] = forget_gate[j] * s_pre + g_gate[j] * in_gate[j];//cell state
			h[j] = out_gate[j] * tanh(state[j]);//hidden output
												/*cout << "h[j]=" << h[j] << endl;*/
		}
		truthLabel = truth_steps[p];
		/*for (size_t m = 0; m < outnode; m++)
		{
		cout << "truthLabel" << m << "=" << truthLabel[m] << endl;
		}*/
		for (k = 0; k < outnode; k++)//output nodes
		{
			//propagate the hidden layer to the output layer; the output-layer weights and sigmoid were added for this task, since a plain LSTM only exposes the hidden output h
			for (j = 0; j < hidenode; j++)
			{
				double tmp = h[j] * W_out[j][k];
				/*cout << "tmp" <<  "=" << tmp << endl;
				cout << "output" << j << "=" << output[k] << endl;*/
				output[k] += tmp;
				//cout << "h" << j << "=" << h[j] << endl;
				//cout << "W_out" << j << k << "=" << W_out[j][k] << endl;
				//cout << "output" << j << "=" << output[k] << endl;
			}
			output[k] += B_out[k][0] * 1.0;
			y_pre[k] = sigmoid(output[k]);               //output of each output-layer unit
														 /*	cout << "y_pre" << k << "=" << y_pre[k] << endl;*/
		}
		predictV.push_back(y_pre);
		I_vector.push_back(in_gate);
		F_vector.push_back(forget_gate);
		O_vector.push_back(out_gate);
		S_vector.push_back(state);
		G_vector.push_back(g_gate);
		h_vector.push_back(h);

		//save the partial derivative of the error with respect to the output layer

		for (size_t k = 0; k < outnode; k++)
		{
			y_delta[k] = (truthLabel[k] - output[k]) * dsigmoid(y_pre[k]);
			e += fabs(truthLabel[k] - output[k]);          //accumulate the error
														   /*cout << "output" << k << "=" << output[k] << ";truthLabel" << k << "=" << truthLabel[k] << endl;*/
		}
		y_delta_vector.push_back(y_delta);
	}
	//Backpropagate the error
	//The hidden-layer delta is computed from the output-layer error of the current time step
	double h_delta[hidenode]{ 0 };//delta of the current hidden output; it is the sum of two parts: the error coming back from the next time step's hidden layer h(t+1) and the output-layer error y_delta
	double *y_deltaB = new double[outnode] { 0 };
	double *O_delta = new double[hidenode] {0};
	double *I_delta = new double[hidenode] {0};
	double *F_delta = new double[hidenode] {0};
	double *G_delta = new double[hidenode] {0};
	double *state_delta = new double[hidenode] {0};
	//deltas of the next (later) time step's hidden layer
	double *O_future_delta = new double[hidenode] {0};
	double *I_future_delta = new double[hidenode] {0};
	double *F_future_delta = new double[hidenode] {0};
	double *G_future_delta = new double[hidenode] {0};
	double *state_future_delta = new double[hidenode] {0};
	double *forget_gate_future = new double[hidenode] {0};//forget-gate node values of the next time step (values, not deltas)
	//for (j = 0; j<hidenode; j++)//starting the backward pass from the last time step, the gate deltas are all 0
	//{
	//	/*O_future_delta[j] = 0; */
	//	cout <<"what?:"<< O_future_delta[j] << endl;
	//	I_future_delta[j] = 0;
	//	F_future_delta[j] = 0;
	//	G_future_delta[j] = 0;
	//	state_future_delta[j] = 0;
	//	forget_gate_future[j] = 0;
	//}
	//walk backwards from the last time step
	for (p = timesteps - 1; p >= 0; p--)
	{
		x[0] = (float)curSample[p];
		//current hidden layer: take the node values of each gate
		double *in_gate = I_vector[p];     //input gate: all of this gate's neuron values at the current time step
		double *out_gate = O_vector[p];    //output gate
		double *forget_gate = F_vector[p]; //forget gate
		double *g_gate = G_vector[p];      //new memory (candidate)
		double *state = S_vector[p + 1];   //cell state; counting the zero element there are timesteps + 1 entries
		double *h = h_vector[p + 1];       //hidden output; counting the zero element there are timesteps + 1 entries
										   //previous hidden layer
		double *h_pre = h_vector[p];
		double *state_pre = S_vector[p];
		y_deltaB = y_delta_vector[p];
		//for each hidden unit, compute the error terms and update the weights
		double mhat = 0.0, vhat = 0.0;
		double gradient = 0.0;
		for (j = 0; j < hidenode; j++)
		{
			h_delta[j] = 0.0;
			for (k = 0; k<outnode; k++)
			{
				h_delta[j] += y_deltaB[k] * W_out[j][k];
				/*cout << "h_delta" << j << h_delta[j] << endl;*/
			}
			for (k = 0; k<hidenode; k++)//the current h also feeds into the next time step's cell, so the related deltas of the next time step are accumulated as well
			{
				h_delta[j] += I_future_delta[k] * U_I[j][k];
				h_delta[j] += F_future_delta[k] * U_F[j][k];
				h_delta[j] += O_future_delta[k] * U_O[j][k];
				h_delta[j] += G_future_delta[k] * U_G[j][k];
			}
			//error of every neuron of each gate in the hidden layer
			O_delta[j] = h_delta[j] * tanh(state[j]) * dsigmoid(out_gate[j]);
			state_delta[j] = h_delta[j] * out_gate[j] * dtanh(state[j]) +
				state_future_delta[j] * forget_gate_future[j];
			F_delta[j] = state_delta[j] * state_pre[j] * dsigmoid(forget_gate[j]);
			I_delta[j] = state_delta[j] * g_gate[j] * dsigmoid(in_gate[j]);
			G_delta[j] = state_delta[j] * in_gate[j] * dtanh(g_gate[j]);//dtanh is used here???
			//update the weights between the previous time step's hidden layer and the current hidden layer
			for (k = 0; k<hidenode; k++)
			{
				gradient = I_delta[j] * h_pre[k];//gradient g(t) for this weight
				U_I_m[k][j] = beta1 * U_I_m[k][j] + (1.0 - beta1) * gradient;//first moment: m(t) = beta1 * m(t-1) + (1 - beta1) * g(t)
				U_I_v[k][j] = beta2 * U_I_v[k][j] + (1.0 - beta2) * pow(gradient,2);//second moment: v(t) = beta2 * v(t-1) + (1 - beta2) * g(t)^2
				mhat = U_I_m[k][j] * 1.0 / (1.0 - pow(beta1,(epoch + 1)));//mhat(t) = m(t) / (1 - beta1^t)
				vhat = U_I_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));//vhat(t) = v(t) / (1 - beta2^t)
				U_I[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);//x(t) = x(t-1) + alpha * mhat(t) / (sqrt(vhat(t)) + eps); the delta already carries the sign
				//U_I[k][j] += alpha * I_delta[j] * h_pre[k];
				
				gradient = F_delta[j] * h_pre[k];
				U_F_m[k][j] = beta1 * U_F_m[k][j] + (1.0 - beta1) * gradient;
				U_F_v[k][j] = beta2 * U_F_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
				mhat = U_F_m[k][j] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = U_F_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				U_F[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
				//U_F[k][j] += alpha * F_delta[j] * h_pre[k];
				
				gradient = O_delta[j] * h_pre[k];
				U_O_m[k][j] = beta1 * U_O_m[k][j] + (1.0 - beta1) * gradient;
				U_O_v[k][j] = beta2 * U_O_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
				mhat = U_O_m[k][j] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = U_O_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				U_O[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
				/*U_O[k][j] += alpha * O_delta[j] * h_pre[k];*/

				gradient = G_delta[j] * h_pre[k];
				U_G_m[k][j] = beta1 * U_G_m[k][j] + (1.0 - beta1) * gradient;
				U_G_v[k][j] = beta2 * U_G_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
				mhat = U_G_m[k][j] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = U_G_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				U_G[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
				/*U_G[k][j] += alpha * G_delta[j] * h_pre[k];*/
			}
			//update the weights between the current input layer and the current hidden layer
			for (k = 0; k<innode; k++)
			{
				gradient = I_delta[j] * x[k];
				W_I_m[k][j] = beta1 * W_I_m[k][j] + (1.0 - beta1) * gradient;
				W_I_v[k][j] = beta2 * W_I_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
				mhat = W_I_m[k][j] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = W_I_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				W_I[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
				/*W_I[k][j] += alpha * I_delta[j] * x[k];*/

				gradient = F_delta[j] * x[k];
				W_F_m[k][j] = beta1 * W_F_m[k][j] + (1.0 - beta1) * gradient;
				W_F_v[k][j] = beta2 * W_F_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
				mhat = W_F_m[k][j] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = W_F_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				W_F[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
				/*W_F[k][j] += alpha * ;*/

			/*	W_O[k][j] += alpha * ;*/
				gradient = O_delta[j] * x[k];
				W_O_m[k][j] = beta1 * W_O_m[k][j] + (1.0 - beta1) * gradient;
				W_O_v[k][j] = beta2 * W_O_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
				mhat = W_O_m[k][j] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = W_O_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				W_O[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);

				/*W_G[k][j] += alpha * ;*/
				gradient = G_delta[j] * x[k];
				W_G_m[k][j] = beta1 * W_G_m[k][j] + (1.0 - beta1) * gradient;
				W_G_v[k][j] = beta2 * W_G_v[k][j] + (1.0 - beta2) * pow(gradient, 2);
				mhat = W_G_m[k][j] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = W_G_v[k][j] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				W_G[k][j] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
			}
			//update the biases
			/*B_I[j][0] += alpha * I_delta[j];*/
			gradient = I_delta[j];
			B_I_m[j][0] = beta1 * B_I_m[j][0] + (1.0 - beta1) * gradient;
			B_I_v[j][0] = beta2 * B_I_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
			mhat = B_I_m[j][0] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
			vhat = B_I_v[j][0] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
			B_I[j][0] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);

			/*B_O[j][0] += alpha * O_delta[j];*/
			gradient = O_delta[j];
			B_O_m[j][0] = beta1 * B_O_m[j][0] + (1.0 - beta1) * gradient;
			B_O_v[j][0] = beta2 * B_O_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
			mhat = B_O_m[j][0] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
			vhat = B_O_v[j][0] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
			B_O[j][0] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);

			/*B_F[j][0] += alpha * F_delta[j];*/
			gradient = F_delta[j];
			B_F_m[j][0] = beta1 * B_F_m[j][0] + (1.0 - beta1) * gradient;
			B_F_v[j][0] = beta2 * B_F_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
			mhat = B_F_m[j][0] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
			vhat = B_F_v[j][0] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
			B_F[j][0] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);

			//B_G[j][0] += alpha * G_delta[j];
			gradient = G_delta[j];
			B_G_m[j][0] = beta1 * B_G_m[j][0] + (1.0 - beta1) * gradient;
			B_G_v[j][0] = beta2 * B_G_v[j][0] + (1.0 - beta2) * pow(gradient, 2);
			mhat = B_G_m[j][0] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
			vhat = B_G_v[j][0] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
			B_G[j][0] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
		}
		for (k = 0; k<outnode; k++)  //for each output unit of the network, update the weights; a plain LSTM does not have this extra layer, it only exposes h
		{
			//update the weights between the hidden layer and the output layer
			for (j = 0; j < hidenode; j++)
			{
				/*W_out[j][k] += alpha * y_deltaB[k] * h[j];*/
				gradient = y_deltaB[k] * h[j];
				W_out_m[j][k] = beta1 * W_out_m[j][k] + (1.0 - beta1) * gradient;
				W_out_v[j][k] = beta2 * W_out_v[j][k] + (1.0 - beta2) * pow(gradient, 2);
				mhat = W_out_m[j][k] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
				vhat = W_out_v[j][k] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
				W_out[j][k] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);

			}
			/*B_out[k][0] += alpha * y_deltaB[k];*/
			gradient = y_deltaB[k] ;
			B_out_m[k][0] = beta1 * B_out_m[k][0] + (1.0 - beta1) * gradient;
			B_out_v[k][0] = beta2 * B_out_v[k][0] + (1.0 - beta2) * pow(gradient, 2);
			mhat = B_out_m[k][0] * 1.0 / (1.0 - pow(beta1, (epoch + 1)));
			vhat = B_out_v[k][0] * 1.0 / (1.0 - pow(beta2, (epoch + 1)));
			B_out[k][0] += alpha * mhat * 1.0 / (sqrt(vhat) + eps);
		}
		if (p == timesteps - 1)//the memory below should be reclaimed here
		{
			delete  []O_future_delta;
			delete  []F_future_delta;
			delete  []I_future_delta;
			delete  []G_future_delta;
			delete  []state_future_delta;
			delete  []forget_gate_future;
			O_future_delta = NULL;
			F_future_delta = NULL;
			I_future_delta = NULL;
			G_future_delta = NULL;
			state_future_delta = NULL;
			forget_gate_future = NULL;
		}
		O_future_delta = O_delta;
		F_future_delta = F_delta;
		I_future_delta = I_delta;
		G_future_delta = G_delta;
		state_future_delta = state_delta;
		forget_gate_future = forget_gate;

	}
	delete  []O_future_delta;
	delete  []F_future_delta;
	delete  []I_future_delta;
	delete  []G_future_delta;
	delete  []state_future_delta;
	delete []forget_gate_future;
	O_future_delta = NULL;
	F_future_delta = NULL;
	I_future_delta = NULL;
	G_future_delta = NULL;
	state_future_delta = NULL;
	forget_gate_future = NULL;
	delete []y_deltaB;
	y_deltaB = NULL;
	
	if (epoch % 1000 == 0)
	{

		cout << "Epoch " << epoch << ":" << endl;
		cout << "Sample data:" << endl;
		vector<string> vsamp = encode2Sample(dsMap, curSample);
		for (k = 0; k < timesteps; k++)
			cout << " " << vsamp[k];
		cout << endl;
		cout << "error: " << e << endl;
		cout << "pred: ";
		vector<string> vpre = encode2Truth(dsMap, predictV);
		for (k = 0; k < timesteps; k++)
			cout << " " << vpre[k];
		cout << endl;
		vector<string> vtru = encode2Truth(dsMap, truth_steps);
		cout << "true: ";
		for (k = 0; k < timesteps; k++)
			cout << " " << vtru[k];
		cout << endl;
		cout << endl;
	}
	//free the memory allocated with new
	for (i = 0; i < predictV.size(); i++)
	{
		delete []predictV[i];
		predictV[i] = NULL;
	}
	for (i = 0; i<I_vector.size(); i++)
	{
		delete []I_vector[i];
		I_vector[i] = NULL;
	}

	for (i = 0; i < O_vector.size(); i++)
	{
		delete []O_vector[i];
		O_vector[i] = NULL;
	}
	for (i = 0; i < G_vector.size(); i++)
	{
		delete []G_vector[i];
		G_vector[i] = NULL;
	}
	for (i = 0; i < S_vector.size(); i++)
	{
		delete []S_vector[i];
		S_vector[i] = NULL;
	}
	for (i = 0; i < h_vector.size(); i++)
	{
		delete []h_vector[i];
		h_vector[i] = NULL;
	}
	for (i = 0; i < truth_steps.size(); i++)
	{
		delete []truth_steps[i];
		truth_steps[i] = NULL;
	}
	predictV.clear();
	truth_steps.clear();
	I_vector.clear();
	F_vector.clear();
	O_vector.clear();
	G_vector.clear();
	S_vector.clear();
	h_vector.clear();
	y_delta_vector.clear();

	delete []x;
	x = NULL;

	//randomly pick a new sample offset
	offset = random(randomMax);

#ifdef CHINESE
while (offset % 2 != 0)//the offset must be a multiple of 2, otherwise fetching Chinese characters never terminates
{
offset = random(randomMax);
}
#endif // CHINESE

	/*cout << "current offset: " << offset << endl;*/
	epoch++;
}

}

vector<string> AdamLSTM::encode2Truth(map<string, int>& dsMap, vector<double*>& predictedM)
{
vector<string> sRes;
sRes.clear();
int maxInd = 0;
for (size_t i = 0; i < predictedM.size(); i++)
{
double * pre = predictedM[i];
for (size_t j = 0; j < outnode; j++)
{
pre[j] = fabs(pre[j]);
/*cout << "predicted value at node " << j << ": " << pre[j] << endl;*/
}
int ind[outnode] = { 0 };
BubbleSort(pre, outnode, ind);
maxInd = ind[outnode - 1];
//cout << "index of the maximum at time step " << i << ": " << maxInd << endl;
for (map<string, int>::iterator it = dsMap.begin(); it != dsMap.end(); it++)
{
if ((*it).second == (maxInd + 1))
{
sRes.push_back((*it).first);
}
}
}
return sRes;
}

vector<string> AdamLSTM::encode2Sample(map<string, int>& dsMap, vector<int>& sample)
{
vector<string> sRes;
sRes.clear();
for (size_t i = 0; i < sample.size(); i++)
{
for (map<string, int>::iterator it = dsMap.begin(); it != dsMap.end(); it++)
{
if (it->second == sample[i])
{
sRes.push_back(it->first);
}
}
}
return sRes;
}

vector<int> AdamLSTM::sample2Encode(map<string, int>& dsMap, vector<string> sample)
{
vector<int> res;
for (size_t i = 0; i < sample.size(); i++)
{
for (map<string, int>::iterator it = dsMap.begin(); it != dsMap.end(); it++)
{
if (it->first == sample[i])
{
res.push_back(it->second);
}
}
}
return res;
}

void AdamLSTM::BubbleSort(double * p, int length, int * ind_diff)
{
for (int m = 0; m < length; m++)
{
ind_diff[m] = m;
}

for (int i = 0; i < length; i++)
{
	for (int j = 0; j < length - i - 1; j++)
	{
		if (p[j] > p[j + 1])
		{
			double temp = p[j];
			p[j] = p[j + 1];
			p[j + 1] = temp;

			int ind_temp = ind_diff[j];
			ind_diff[j] = ind_diff[j + 1];
			ind_diff[j + 1] = ind_temp;
		}
	}
}

}

void AdamLSTM::predict()
{
DataPreProc ddp;

char dataset[] = "壹 贰 叁 肆 伍 陆 柒 捌 玖 拾";
map<std::string, int> ds = ddp.loadDataset(dataset);
//read the input sample
char s[200] = { 0 };
cout << "Enter 3 Chinese characters from the training dataset, separated by spaces:" << endl;
cin.getline(s, 200);
string ss(s);
vector<string> re = ddp.split(ss, " ");
vector<int> input = sample2Encode(ds, re);
vector<double*> predictV;      //predictions
vector<double*> I_vector;      //input gate
vector<double*> F_vector;      //forget gate
vector<double*> O_vector;      //output gate
vector<double*> G_vector;      //new memory (candidate)
vector<double*> S_vector;      //cell state
vector<double*> h_vector;      //hidden output
double *x = new double[innode];
//there is no previous hidden layer at t = 0, so initialize one filled with zeros
double *S = new double[hidenode] {0};     //cell state
double *h = new double[hidenode] {0};     //hidden output

vector<std::string> result(re.begin(), re.begin() + timesteps);
int cnt = 0;
while (cnt < 5)
{
	/*Forward pass: run through all the time steps*/
	for (int p = 0; p < timesteps; p++)//p is the index of the current time step
	{
		if (p == 0)
		{
			for (size_t i = 0; i < hidenode; i++)
			{
				S[i] = 0.0;
				h[i] = 0.0;
			}
			S_vector.push_back(S);
			h_vector.push_back(h);
		}
		x[0] = (float)input[p];
		//cout << "x[0]" << p << "=" << x[0] << endl;
		double *in_gate = new double[hidenode] {0};     //input gate after the sigmoid
		double *out_gate = new double[hidenode] {0};    //output gate after the sigmoid
		double *forget_gate = new double[hidenode] {0}; //forget gate after the sigmoid
		double *g_gate = new double[hidenode] {0};      //new memory (candidate) after the tanh
		double *state = new double[hidenode] {0};       //cell state
		double *h = new double[hidenode] {0}; //hidden output
		double *output = new double[outnode] {0};//raw output of the current time step
		double *y_pre = new double[outnode] {0};//prediction of the current time step

												//previous state
		double * h_pre = h_vector.back();
		double * state_pre = S_vector.back();
		for (size_t j = 0; j < hidenode; j++)
		{
			double inGateValue = 0;
			double forgetGateValue = 0;
			double outGateValue = 0;
			double gGateValue = 0;
			//contribution of the current input
			for (size_t k = 0; k < innode; k++)
			{
				forgetGateValue += x[k] * W_F[k][j];
				inGateValue += x[k] * W_I[k][j];
				gGateValue += x[k] * W_G[k][j];
				outGateValue += x[k] * W_O[k][j];
			}
			for (size_t i = 0; i < hidenode; i++)
			{
				forgetGateValue += h_pre[i] * U_F[i][j];
				inGateValue += h_pre[i] * U_I[i][j];
				gGateValue += h_pre[i] * U_G[i][j];
				outGateValue += h_pre[i] * U_O[i][j];
			}

			//bias
			forgetGateValue += B_F[j][0] * 1.0;
			inGateValue += B_I[j][0] * 1.0;
			gGateValue += B_G[j][0] * 1.0;
			outGateValue += B_O[j][0] * 1.0;

			in_gate[j] = sigmoid(inGateValue);
			out_gate[j] = sigmoid(outGateValue);
			forget_gate[j] = sigmoid(forgetGateValue);
			g_gate[j] = tanh(gGateValue);//the candidate memory uses tanh here

			double s_pre = state_pre[j];
			state[j] = forget_gate[j] * s_pre + g_gate[j] * in_gate[j];//cell state
			h[j] = out_gate[j] * tanh(state[j]);//hidden output
												/*cout << "h[j]=" << h[j] << endl;*/
		}

		/*for (size_t m = 0; m < outnode; m++)
		{
		cout << "truthLabel" << m << "=" << truthLabel[m] << endl;
		}*/
		for (int k = 0; k < outnode; k++)//output nodes
		{
			//propagate the hidden layer to the output layer; the output-layer weights and sigmoid were added for this task, since a plain LSTM only exposes the hidden output h
			for (int j = 0; j < hidenode; j++)
			{
				double tmp = h[j] * W_out[j][k];
				/*cout << "tmp" <<  "=" << tmp << endl;
				cout << "output" << j << "=" << output[k] << endl;*/
				output[k] += tmp;
				//cout << "h" << j << "=" << h[j] << endl;
				//cout << "W_out" << j << k << "=" << W_out[j][k] << endl;
				//cout << "output" << j << "=" << output[k] << endl;
			}
			output[k] += B_out[k][0] * 1.0;
			y_pre[k] = sigmoid(output[k]);               //output of each output-layer unit
														 /*	cout << "y_pre" << k << "=" << y_pre[k] << endl;*/
		}
		predictV.push_back(y_pre);
		S_vector.push_back(state);
		h_vector.push_back(h);
	}
	vector<string> vpre = encode2Truth(ds, predictV);

	predictV.clear();
	S_vector.clear();
	h_vector.clear();

	//cout << "predicted value: " << vpre[timesteps - 1] << endl;
	result.push_back(vpre[timesteps - 1]);
	cout << "Prediction " << cnt << ": ";
	for (size_t i = 0; i < result.size(); i++)
	{
		cout << result[i];
	}
	cout << endl;
	//update the input with the newly predicted character
	re.erase(re.begin());
	re.push_back(vpre[timesteps - 1]);
	input.clear();
	input = sample2Encode(ds, re);
	//for (size_t i = 0; i < input.size(); i++)
	//{
	//	cout << "input value:" << input[i] << endl;
	//}
	//cout << "re" <<re.back()<< endl;
	//go on to predict the next one
	cnt++;
}
result.clear();

}

3. How to call it
int main()
{

//ActvAndDrt aad;

srand(time(NULL));
AdamLSTM adamLstm;
adamLstm.train();
cout << "Training finished" << endl;
adamLstm.predict();
system("pause");
return 0; 

}
