/*
 * tensor.h - core tensor type and API; include this in every translation
 * unit that works with tensors.
 */
#ifndef TENSOR_H
#define TENSOR_H

#define GPU_EN /* enable GPU / parallel-computing-device support */

/* tensor kinds */
#define STATIC_TENSOR   0 /* tensor owns its data only */
#define VARIABLE_TENSOR 1 /* tensor owns its data and grad */
#define COMPUTE_TENSOR  2 /* tensor's data is derived from parent tensors */

typedef struct Tensor {
    char name[20];
    char type;    /* STATIC_TENSOR / VARIABLE_TENSOR / COMPUTE_TENSOR */
    int dim;      /* number of axes */
    int *shape;   /* always in cpu ram */
    int len;      /* total number of float elements */
    int device;   /* -1: cpu ram, 0..n: gpu index where data lives */
    float *data;  /* in cpu or gpu ram, see device */
    /* static tensors have the attributes above */
    float *grad;  /* same shape and placement as data */
    /* variable tensors have the attributes above */
    int pn;            /* number of parents */
    struct Tensor **p; /* parent pointers (struct tag: typedef name not yet visible here) */
    int para[12];      /* compute parameters */
    int (*gendata)(struct Tensor *my); /* compute my data from parents' data */
    int (*gengrad)(struct Tensor *my); /* accumulate each parent's grad from their data and my grad */
    /* compute tensors have the attributes and methods above */
} Tensor;

void    initptr_t(Tensor *T); /* usage: T = malloc(...); initptr_t(T); */
Tensor* malloc_t(int device, const char *name, int type, int dim, ...);
void    free_t(Tensor *T);
int     reshape_t(Tensor *T, int dim, ...);
int     _reshape_t(Tensor *T, int *shape, int dim, int len);
int     memcpy_t(Tensor *dst, Tensor *src); /* static: data; variable/compute: data and grad */
void    print_t(Tensor *T);
void    printshape(Tensor *T);
void    printinfo(Tensor *T);
int     reset_gpu(int device); /* must be called after all processing has finished */
int     malloc_tp(Tensor *T, int pn, ...); /* allocate parent slots when T is a compute tensor */
Tensor* copy_t(int device, const char *name, Tensor *src); /* copy src onto the given device */
int     memcpy_tdata(Tensor *T, const float *src, int srclen);
int     memcpy_tgrad(Tensor *T, const float *src, int srclen);
void    analyze_t(Tensor *T);  /* print all relationships */
int     forward_t(Tensor *T);  /* generate data */
void    zerograd_t(Tensor *T); /* zero all grads; not fast */
int     backward_t(Tensor *T); /* generate grads */
void    deepfree_t(Tensor *T); /* free the whole relationship graph */

/*
 * Returns a malloc'd array of tensor pointers which the caller must free:
 *     Tensor **LT = all_tensors(T, FALSE, TRUE, FALSE, &len); ... free(LT);
 * Every pointer in LT is copied from the original pointer (no ownership).
 */
Tensor** all_tensors(Tensor *T, int _static, int _variable, int _compute, int *len);

/* rounding helpers; arguments fully parenthesized (FLOOR/CEIL use an epsilon trick
   that is only valid for values comfortably away from .99999 boundaries) */
#define ROUND(X) ((int)((X) + 0.5))
#define CEIL(X)  ((int)((X) + 0.99999))
#define FLOOR(X) ((int)((X) - 0.99999) + 1)

void _shuffle(int *L, int len); /* in-place uniform shuffle of L[0..len-1] */

#endif
tensor.c:
#include "stdafx.h"
#include "tensor.h"
#include "tensor_sq.h"

#ifdef GPU_EN
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
/* int GPU_SETDEVICE(int device) */
#define GPU_SETDEVICE cudaSetDevice
/* int GPU_MALLOC(void **devPtr, size_t size); on success *devPtr = allocated pointer */
#define GPU_MALLOC cudaMalloc
/* void GPU_FREE(void *devPtr) */
#define GPU_FREE cudaFree
/* int GPU_MEMCPY(void *dst, void *src, size_t size, int kind) */
#define GPU_MEMCPY cudaMemcpy
#define GPU_MEMCPY_KIND_D2H cudaMemcpyDeviceToHost
#define GPU_MEMCPY_KIND_H2D cudaMemcpyHostToDevice
#define GPU_MEMCPY_KIND_D2D cudaMemcpyDeviceToDevice
/* int GPU_RESET(void): destroys all allocations on the current device */
#define GPU_RESET cudaDeviceReset
#endif

/* Reset every field of T to its empty/default value; performs no allocation. */
void initptr_t(Tensor *T)
{
    int i;
    for (i = 0; i < 20; i++) T->name[i] = 0;
    T->type = -1;   /* no type assigned yet */
    T->dim = 0;
    T->shape = NULL;
    T->len = 0;
    T->device = -1; /* cpu by default */
    T->data = NULL;
    T->grad = NULL;
    T->pn = 0;
    T->p = NULL;
    for (i = 0; i < 12; i++) T->para[i] = 0;
    T->gendata = NULL;
    T->gengrad = NULL;
}
Tensor* malloc_t(int device, constchar *name, int type, int dim, ...)
{
int i, d, len = 1, ret;
Tensor *T;
va_list vl;
//malloc tensor and shape
T = (Tensor*)malloc(sizeof(Tensor)); initptr_t(T);
T->shape = (int*)malloc(dim * sizeof(int));
//init tensor except data and grad
strcpy_s(T->name, name);
T->type = type;
va_start(vl, dim);
for (i = 0; i < dim; i++)
{
d = va_arg(vl, int);
T->shape[i] = d;
len *= d;
}
va_end(vl);
T->dim = dim;
T->len = len;
T->device = device;
//malloc data and gradif (device < 0) { //cpu
T->data = (float*)malloc(len * sizeof(float));
if (type != STATIC_TENSOR) T->grad = (float*)malloc(len * sizeof(float));
}
else { //gpu#ifdef GPU_EN//choose gpu
ret = GPU_SETDEVICE(device);
if (ret) {
fprintf(stderr, "GPU_SETDEVICE failed!");
free(T->shape);
free(T);
return NULL;
}
//malloc
ret = GPU_MALLOC((void**)&T->data, len * sizeof(float));
if (ret) {
fprintf(stderr, "GPU_MALLOC failed!");
GPU_FREE(T->data);
free(T->shape);
free(T);
return NULL;
}
if (type != STATIC_TENSOR) {
ret = GPU_MALLOC((void**)&T->grad, len * sizeof(float));
if (ret) {
fprintf(stderr, "GPU_MALLOC failed!");
GPU_FREE(T->grad);
GPU_FREE(T->data);
free(T->shape);
free(T);
return NULL;
}
}
#elsefree(T->shape);
free(T);
return NULL;
#endif
}
return T;
}
/*
 * Release a tensor allocated with malloc_t (host fields plus device memory).
 * Safe to call with NULL. Note: the caller's pointer is left dangling —
 * the former `T = NULL;` only cleared the local copy and was removed.
 */
void free_t(Tensor *T)
{
    if (T == NULL) return;
    if (T->device < 0) { /* cpu: everything lives on the host heap */
        free(T->shape);
        free(T->data);
        free(T->grad); /* NULL for static tensors; free(NULL) is a no-op */
        free(T->p);
        free(T);
    }
    else { /* gpu */
#ifdef GPU_EN
        int ret = GPU_SETDEVICE(T->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); return; }
        free(T->shape);
        GPU_FREE(T->data);
        if (T->type != STATIC_TENSOR)
            GPU_FREE(T->grad);
        free(T->p);
        free(T);
#endif
    }
}
/*
 * Change T's shape to the `dim` vararg extents; the total element count
 * must be unchanged. Returns 0 on success, -1 on failure.
 * FIX: the new shape array is allocated (and checked) BEFORE the old one
 * is freed, so T is never left with a dangling shape pointer on OOM.
 */
int reshape_t(Tensor *T, int dim, ...)
{
    int i, len = 1;
    int *newshape;
    va_list vl;
    if (T == NULL || T->shape == NULL) return -1;
    /* first pass: total length of the requested shape */
    va_start(vl, dim);
    for (i = 0; i < dim; i++) len *= va_arg(vl, int);
    va_end(vl);
    if (T->len != len) return -1; /* reshape must preserve element count */
    newshape = (int*)malloc(dim * sizeof(int));
    if (newshape == NULL) return -1;
    /* second pass: record the extents */
    va_start(vl, dim);
    for (i = 0; i < dim; i++) newshape[i] = va_arg(vl, int);
    va_end(vl);
    free(T->shape);
    T->shape = newshape;
    T->dim = dim;
    T->len = len;
    return 0;
}
/*
 * Array-based reshape: install `shape[0..dim-1]` and set len directly.
 * The caller is trusted to pass a consistent len (no product check here).
 * FIX: allocate the replacement shape before freeing the old one, and
 * check the allocation, so T stays valid on OOM. Returns 0 or -1.
 */
int _reshape_t(Tensor *T, int *shape, int dim, int len)
{
    int i;
    int *newshape;
    if (T == NULL || T->shape == NULL || shape == NULL) return -1;
    newshape = (int*)malloc(dim * sizeof(int));
    if (newshape == NULL) return -1;
    for (i = 0; i < dim; i++)
        newshape[i] = shape[i];
    free(T->shape);
    T->shape = newshape;
    T->dim = dim;
    T->len = len;
    return 0;
}
/*
 * Copy src into dst (data, plus grad for non-static tensors). Both tensors
 * must have the same len and type; any cpu/gpu combination is handled.
 * Returns 0 on success, non-zero on failure.
 * FIXES: the original's error paths in the cross-device case executed
 * `return ret; free(buffer);` — the free was dead code, leaking the host
 * staging buffers on every failure. Buffer allocations are now checked and
 * released through a single cleanup label.
 */
int memcpy_t(Tensor *dst, Tensor *src)
{
    int i, ret = 0;
    float *data_buffer = NULL;
    float *grad_buffer = NULL;

    if (dst == NULL || src == NULL || dst->data == NULL || src->data == NULL) return -1;
    if (dst->len != src->len) return -1;   /* lengths must match */
    if (dst->type != src->type) return -1; /* types must match */

    if (dst->device < 0 && src->device < 0) { /* cpu <- cpu */
        for (i = 0; i < dst->len; i++) dst->data[i] = src->data[i];
        if (src->type != STATIC_TENSOR)
            for (i = 0; i < dst->len; i++) dst->grad[i] = src->grad[i];
        return 0;
    }
#ifdef GPU_EN
    if (dst->device < 0 && src->device >= 0) { /* cpu <- gpu */
        ret = GPU_SETDEVICE(src->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); return ret; }
        ret = GPU_MEMCPY(dst->data, src->data, src->len * sizeof(float), GPU_MEMCPY_KIND_D2H);
        if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
        if (src->type != STATIC_TENSOR) {
            ret = GPU_MEMCPY(dst->grad, src->grad, src->len * sizeof(float), GPU_MEMCPY_KIND_D2H);
            if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
        }
    }
    else if (dst->device >= 0 && src->device < 0) { /* gpu <- cpu */
        ret = GPU_SETDEVICE(dst->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); return ret; }
        ret = GPU_MEMCPY(dst->data, src->data, src->len * sizeof(float), GPU_MEMCPY_KIND_H2D);
        if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
        if (src->type != STATIC_TENSOR) {
            ret = GPU_MEMCPY(dst->grad, src->grad, src->len * sizeof(float), GPU_MEMCPY_KIND_H2D);
            if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
        }
    }
    else if (dst->device == src->device) { /* gpu <- gpu, same device */
        ret = GPU_SETDEVICE(src->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); return ret; }
        ret = GPU_MEMCPY(dst->data, src->data, src->len * sizeof(float), GPU_MEMCPY_KIND_D2D);
        if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
        if (src->type != STATIC_TENSOR) {
            ret = GPU_MEMCPY(dst->grad, src->grad, src->len * sizeof(float), GPU_MEMCPY_KIND_D2D);
            if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
        }
    }
    else { /* gpu <- gpu across devices: stage through host memory */
        data_buffer = (float*)malloc(src->len * sizeof(float));
        if (data_buffer == NULL) return -1;
        if (src->type != STATIC_TENSOR) {
            grad_buffer = (float*)malloc(src->len * sizeof(float));
            if (grad_buffer == NULL) { free(data_buffer); return -1; }
        }
        /* device -> host */
        ret = GPU_SETDEVICE(src->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); goto staged_done; }
        ret = GPU_MEMCPY(data_buffer, src->data, src->len * sizeof(float), GPU_MEMCPY_KIND_D2H);
        if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); goto staged_done; }
        if (grad_buffer != NULL) {
            ret = GPU_MEMCPY(grad_buffer, src->grad, src->len * sizeof(float), GPU_MEMCPY_KIND_D2H);
            if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); goto staged_done; }
        }
        /* host -> destination device */
        ret = GPU_SETDEVICE(dst->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); goto staged_done; }
        ret = GPU_MEMCPY(dst->data, data_buffer, src->len * sizeof(float), GPU_MEMCPY_KIND_H2D);
        if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); goto staged_done; }
        if (grad_buffer != NULL) {
            ret = GPU_MEMCPY(dst->grad, grad_buffer, src->len * sizeof(float), GPU_MEMCPY_KIND_H2D);
            if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); }
        }
staged_done:
        free(data_buffer);
        free(grad_buffer);
    }
    return ret;
#else
    return -1; /* any non-cpu copy is impossible without GPU support */
#endif
}
/* Print one row (the last axis) of a host-resident float buffer per line. */
static void print_buf(const float *buf, int len, int rowlen)
{
    int i, ct = 0;
    while (ct < len) {
        for (i = 0; i < rowlen; i++)
            printf("%-11.6f", buf[ct++]);
        printf("\n");
    }
}

/*
 * Print a tensor's info line, its data, and (for non-static tensors) its
 * grad, one row of the last axis per line. GPU tensors are staged through
 * a temporary cpu copy. The four near-identical print loops of the
 * original were factored into print_buf; a guard against dim == 0 /
 * missing shape was added (the original indexed shape[dim - 1] blindly).
 */
void print_t(Tensor *T)
{
    int rowlen;
    Tensor *Tp;
    if (T == NULL || T->data == NULL || T->shape == NULL || T->dim <= 0) return;
    rowlen = T->shape[T->dim - 1]; /* last axis = one printed row */
    printinfo(T);
    printf("\n");
    if (T->device < 0) { /* already in cpu ram */
        printf("data:\n");
        print_buf(T->data, T->len, rowlen);
        if (T->type != STATIC_TENSOR) {
            printf("grad:\n");
            print_buf(T->grad, T->len, rowlen);
        }
    }
    else { /* stage a cpu copy first */
        Tp = copy_t(-1, "", T);
        if (Tp == NULL || memcpy_t(Tp, T)) {
            free_t(Tp); /* free_t(NULL) is a no-op */
            return;
        }
        printf("data:\n");
        print_buf(Tp->data, T->len, rowlen);
        if (T->type != STATIC_TENSOR) {
            printf("grad:\n");
            print_buf(Tp->grad, T->len, rowlen);
        }
        free_t(Tp);
    }
    printf("\n");
}
/* Print T's shape as "(d0,d1,...,dn)" with no trailing newline. */
void printshape(Tensor *T)
{
    int axis;
    if (T == NULL) return;
    printf("(");
    for (axis = 0; axis < T->dim; axis++) {
        /* comma after every extent except the last */
        printf(axis + 1 < T->dim ? "%d," : "%d", T->shape[axis]);
    }
    printf(")");
}
/*
 * Print a one-line summary: name(deviceKIND)(shape), where KIND is
 * S/V/C for static/variable/compute. No trailing newline.
 * FIX: the mangled `elseif` tokens did not compile; rewritten as a switch.
 */
void printinfo(Tensor *T)
{
    if (T == NULL) return;
    printf("%s(%d", T->name, T->device);
    switch (T->type) {
    case STATIC_TENSOR:   printf("S"); break;
    case VARIABLE_TENSOR: printf("V"); break;
    case COMPUTE_TENSOR:  printf("C"); break;
    default: break; /* uninitialized type: print no letter, as before */
    }
    printf(")");
    printshape(T);
}
/*
 * Destroy all allocations on the given GPU. Must be called once after all
 * processing on that device has finished. Returns 0 on success, non-zero
 * on failure, -1 when GPU support is compiled out.
 */
int reset_gpu(int device)
{
#ifdef GPU_EN
    int ret = GPU_SETDEVICE(device);
    if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); return ret; }
    ret = GPU_RESET();
    if (ret) { fprintf(stderr, "GPU_RESET failed!"); return ret; }
    return ret;
#else
    return -1;
#endif
}
/*
 * Attach `pn` parent tensors (varargs of Tensor*) to compute tensor T.
 * Returns 0 on success, -1 on failure.
 * FIXES: sizeof(Tensor**) -> sizeof(Tensor*) (element, not pointer-to-
 * pointer — same size on common ABIs but wrong idiom); malloc checked,
 * and pn is only recorded once the array exists.
 */
int malloc_tp(Tensor *T, int pn, ...)
{
    int i;
    va_list vl;
    if (T == NULL) return -1;
    if (T->type != COMPUTE_TENSOR) return -1; /* only compute tensors have parents */
    if (pn <= 0) return -1;
    T->p = (Tensor**)malloc(pn * sizeof(Tensor*));
    if (T->p == NULL) return -1;
    T->pn = pn;
    va_start(vl, pn);
    for (i = 0; i < pn; i++)
        T->p[i] = va_arg(vl, Tensor*);
    va_end(vl);
    return 0;
}
/*
 * Deep-copy src onto the given device under a new name. Returns the new
 * tensor, or NULL on failure.
 * FIXES: the original never updated T->dim after replacing the shape, so
 * every copy claimed to be 1-D while carrying a src->dim-entry shape
 * array; the memcpy_t result and the shape malloc were unchecked.
 */
Tensor* copy_t(int device, const char *name, Tensor *src)
{
    int i, dim;
    int *newshape;
    Tensor *T;
    if (src == NULL || src->data == NULL) return NULL;
    /* allocate flat (1-D) first, then restore the source shape */
    T = malloc_t(device, name, src->type, 1, src->len);
    if (T == NULL) return NULL;
    if (memcpy_t(T, src)) { free_t(T); return NULL; }
    dim = src->dim;
    newshape = (int*)malloc(dim * sizeof(int));
    if (newshape == NULL) { free_t(T); return NULL; }
    for (i = 0; i < dim; i++)
        newshape[i] = src->shape[i];
    free(T->shape);
    T->shape = newshape;
    T->dim = dim; /* FIX: was left at 1 */
    return T;
}
/*
 * Copy srclen floats from host buffer src into T->data (cpu or gpu).
 * srclen must equal T->len. Returns 0 on success, non-zero on failure.
 * memcpy_s was replaced with standard memcpy: the destination size is
 * already validated by the srclen == T->len check.
 */
int memcpy_tdata(Tensor *T, const float *src, int srclen)
{
    int ret = 0;
    if (T == NULL || T->data == NULL || src == NULL) return -1;
    if (srclen != T->len) return -1;
    if (T->device < 0) { /* cpu */
        memcpy(T->data, src, (size_t)srclen * sizeof(float));
    }
    else { /* gpu */
#ifdef GPU_EN
        ret = GPU_SETDEVICE(T->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); return ret; }
        ret = GPU_MEMCPY(T->data, src, srclen * sizeof(float), GPU_MEMCPY_KIND_H2D);
        if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
#else
        return -1;
#endif
    }
    return ret;
}
/*
 * Copy srclen floats from host buffer src into T->grad (cpu or gpu).
 * srclen must equal T->len. Returns 0 on success, non-zero on failure.
 * memcpy_s was replaced with standard memcpy: the destination size is
 * already validated by the srclen == T->len check.
 */
int memcpy_tgrad(Tensor *T, const float *src, int srclen)
{
    int ret = 0;
    if (T == NULL || T->grad == NULL || src == NULL) return -1;
    if (srclen != T->len) return -1;
    if (T->device < 0) { /* cpu */
        memcpy(T->grad, src, (size_t)srclen * sizeof(float));
    }
    else { /* gpu */
#ifdef GPU_EN
        ret = GPU_SETDEVICE(T->device);
        if (ret) { fprintf(stderr, "GPU_SETDEVICE failed!"); return ret; }
        ret = GPU_MEMCPY(T->grad, src, srclen * sizeof(float), GPU_MEMCPY_KIND_H2D);
        if (ret) { fprintf(stderr, "GPU_MEMCPY failed!"); return ret; }
#else
        return -1;
#endif
    }
    return ret;
}
/*
 * Print every parent relationship reachable from T (breadth-first), then
 * print the visited tensors in reverse visit order as a "topological
 * sequence". Purely informational; the graph is not modified.
 */
void analyze_t(Tensor *T)
{
    int k;
    Tensor *node;
    tensor_sq bfs_q;
    tensor_sq order_s;
    if (T == NULL) return;
    tsq_malloc(&bfs_q);
    tsq_malloc(&order_s);
    tq_init(&bfs_q);
    ts_init(&order_s);
    tq_enqueue(&bfs_q, T);
    while (!tq_empty(&bfs_q)) {
        node = tq_dequeue(&bfs_q);
        ts_push(&order_s, node);
        if (node->p == NULL) continue; /* leaf: nothing to report */
        printinfo(node);
        printf(":\n");
        for (k = 0; k < node->pn; k++) {
            printinfo(node->p[k]);
            if (k != node->pn - 1) printf(", ");
            tq_enqueue(&bfs_q, node->p[k]);
        }
        printf("\n\n");
    }
    printf("topological sequence:\n");
    while (!ts_empty(&order_s)) {
        node = ts_pop(&order_s);
        printinfo(node);
        printf(" -> ");
    }
    printf("\n\n");
    tsq_free(&bfs_q);
    tsq_free(&order_s);
}
/*
 * Evaluate the compute graph rooted at T: collect all ancestors breadth-
 * first, then run each tensor's gendata in reverse visit order so parents
 * are computed before children. Returns 0 on success, -1 on failure.
 * FIX: the guard used a bare `return;` in a non-void function (invalid in
 * C99+, garbage return value before that) — now returns -1.
 */
int forward_t(Tensor *T)
{
    int i, ret = 0;
    Tensor *Tp;
    tensor_sq queue_t;
    tensor_sq stack_t;
    if (T == NULL || T->type != COMPUTE_TENSOR) return -1;
    tsq_malloc(&queue_t);
    tsq_malloc(&stack_t);
    tq_init(&queue_t);
    ts_init(&stack_t);
    tq_enqueue(&queue_t, T);
    /* BFS from T toward the leaves; record visit order on the stack */
    while (!tq_empty(&queue_t)) {
        Tp = tq_dequeue(&queue_t);
        ts_push(&stack_t, Tp);
        if (Tp->p != NULL)
            for (i = 0; i < Tp->pn; i++)
                tq_enqueue(&queue_t, Tp->p[i]);
    }
    /* unwind: leaves first, so every gendata sees up-to-date parents */
    while (!ts_empty(&stack_t)) {
        Tp = ts_pop(&stack_t);
        if (Tp->gendata != NULL) {
            ret = Tp->gendata(Tp);
            if (ret) { tsq_free(&queue_t); tsq_free(&stack_t); return -1; }
        }
    }
    tsq_free(&queue_t);
    tsq_free(&stack_t);
    return ret;
}
/*
 * Zero the grad of T and of every tensor reachable through parent
 * pointers. Grads are cleared by staging a zero buffer through
 * memcpy_tgrad so cpu and gpu tensors are handled uniformly (hence the
 * "not fast" note in the header).
 * FIXES: uses calloc (zero-fills in one call) instead of malloc plus a
 * manual loop, and skips a node instead of dereferencing NULL when the
 * allocation fails.
 */
void zerograd_t(Tensor *T)
{
    int i;
    Tensor *Tp;
    tensor_sq queue_t;
    float *zeros;
    if (T == NULL) return;
    tsq_malloc(&queue_t);
    tq_init(&queue_t);
    tq_enqueue(&queue_t, T);
    while (!tq_empty(&queue_t)) {
        Tp = tq_dequeue(&queue_t);
        if (Tp->p != NULL)
            for (i = 0; i < Tp->pn; i++)
                tq_enqueue(&queue_t, Tp->p[i]);
        if (Tp->grad != NULL) {
            zeros = (float*)calloc(Tp->len, sizeof(float));
            if (zeros != NULL) {
                memcpy_tgrad(Tp, zeros, Tp->len);
                free(zeros);
            }
        }
    }
    tsq_free(&queue_t);
}
/*
 * Backpropagate from T: seed T's grad with ones (dT/dT = 1), then walk
 * the graph breadth-first calling each node's gengrad, which accumulates
 * into its parents' grads. Returns 0 on success, -1 on failure.
 * FIXES: the guard used a bare `return;` in a non-void function; the seed
 * buffer malloc and the memcpy_tgrad result were unchecked.
 */
int backward_t(Tensor *T)
{
    int i, ret = 0;
    float *ones;
    Tensor *Tp;
    tensor_sq queue_t;
    if (T == NULL || T->grad == NULL) return -1;
    /* seed the root gradient with 1.0 */
    ones = (float*)malloc(T->len * sizeof(float));
    if (ones == NULL) return -1;
    for (i = 0; i < T->len; i++) ones[i] = 1.0f;
    ret = memcpy_tgrad(T, ones, T->len);
    free(ones);
    if (ret) return -1;
    /* BFS: children run gengrad before (or interleaved with) ancestors */
    tsq_malloc(&queue_t);
    tq_init(&queue_t);
    tq_enqueue(&queue_t, T);
    while (!tq_empty(&queue_t)) {
        Tp = tq_dequeue(&queue_t);
        if (Tp->p != NULL)
            for (i = 0; i < Tp->pn; i++)
                tq_enqueue(&queue_t, Tp->p[i]);
        if (Tp->gengrad != NULL) {
            ret = Tp->gengrad(Tp);
            if (ret) { tsq_free(&queue_t); return -1; }
        }
    }
    tsq_free(&queue_t);
    return ret;
}
/*
 * Free T and every tensor reachable through parent pointers, walking the
 * graph breadth-first and calling free_t on each dequeued node.
 * NOTE(review): there is no visited set — if two compute tensors share a
 * parent (a diamond in the graph), that parent is enqueued twice and
 * free_t is called on it twice (double free). This looks safe only when
 * the relationship graph is a tree; confirm against how graphs are built.
 */
void deepfree_t(Tensor *T)
{
    int i;
    Tensor *Tp;
    tensor_sq queue_t;
    if (T == NULL) return;
    tsq_malloc(&queue_t);
    tq_init(&queue_t);
    tq_enqueue(&queue_t, T);
    while (!tq_empty(&queue_t)) {
        Tp = tq_dequeue(&queue_t);
        /* enqueue parents before freeing Tp, while Tp->p is still readable */
        if (Tp->p != NULL)
            for (i = 0; i < Tp->pn; i++)
                tq_enqueue(&queue_t, Tp->p[i]);
        free_t(Tp);
    }
    tsq_free(&queue_t);
}
/*
 * Collect pointers to all tensors reachable from T whose type matches the
 * enabled _static/_variable/_compute flags, in reverse breadth-first
 * (topological-ish) order. *len receives the count. The returned array is
 * malloc'd and must be freed by the caller; the pointers themselves are
 * borrowed, not owned.
 * FIXES: mangled `elseif` tokens did not compile; the result malloc and
 * the len out-parameter are now checked.
 */
Tensor** all_tensors(Tensor *T, int _static, int _variable, int _compute, int *len)
{
    int i, ct = 0, keep;
    Tensor *Tp;
    Tensor **LT;
    tensor_sq queue_t;
    tensor_sq stack_t;
    if (T == NULL || len == NULL) return NULL;
    tsq_malloc(&queue_t);
    tsq_malloc(&stack_t);
    tq_init(&queue_t);
    ts_init(&stack_t);
    tq_enqueue(&queue_t, T);
    /* BFS over parents; every visited tensor goes on the stack */
    while (!tq_empty(&queue_t)) {
        Tp = tq_dequeue(&queue_t);
        ts_push(&stack_t, Tp);
        if (Tp->p != NULL)
            for (i = 0; i < Tp->pn; i++)
                tq_enqueue(&queue_t, Tp->p[i]);
    }
    LT = (Tensor**)malloc(ts_getcount(&stack_t) * sizeof(Tensor*));
    if (LT == NULL) {
        tsq_free(&queue_t);
        tsq_free(&stack_t);
        return NULL;
    }
    while (!ts_empty(&stack_t)) {
        Tp = ts_pop(&stack_t);
        keep = ((Tp->type == STATIC_TENSOR) && _static)
            || ((Tp->type == VARIABLE_TENSOR) && _variable)
            || ((Tp->type == COMPUTE_TENSOR) && _compute);
        if (keep) LT[ct++] = Tp;
    }
    *len = ct;
    tsq_free(&queue_t);
    tsq_free(&stack_t);
    return LT;
}
/*
 * Uniformly shuffle L[0..len-1] in place with the Fisher-Yates algorithm.
 * FIX: the original drew `rand() % i`, which excludes i itself — that is
 * Sattolo's algorithm: element i can never remain in place and only
 * cyclic permutations are produced. An unbiased shuffle draws from
 * [0, i] inclusive. (A small modulo bias from rand() remains; acceptable
 * for data shuffling.)
 */
void _shuffle(int *L, int len)
{
    int i, p, t;
    if (L == NULL) return;
    for (i = len - 1; i > 0; i--) {
        p = rand() % (i + 1); /* pick from [0, i], allowing a self-swap */
        t = L[i];
        L[i] = L[p];
        L[p] = t;
    }
}
tensor_sq.h:
#ifndef TENSOR_SQ_H#define TENSOR_SQ_H#include "../tensor.h"#define SQ_BUFFER_BYTES 4096typedefstruct tensor_sq {
unsignedchar *buffer;
int tq_datasize;
int tq_front;
int tq_rear;
int tq_count;
int tq_fastfg;
int ts_datasize;
int ts_top;
int ts_fastfg;
}tensor_sq;
void tsq_malloc(tensor_sq *SQ);
void tsq_free(tensor_sq *SQ);
//queuevoid tq_init(tensor_sq *Q);
int tq_enqueue(tensor_sq *Q, Tensor *T);
Tensor* tq_dequeue(tensor_sq *Q);
int tq_empty(tensor_sq *Q);
int tq_getcount(tensor_sq *Q);
//stackvoid ts_init(tensor_sq *S);
int ts_push(tensor_sq *S, Tensor *T);
Tensor* ts_pop(tensor_sq *S);
Tensor* ts_gettop(tensor_sq *S);
int ts_empty(tensor_sq *S);
int ts_getcount(tensor_sq *S);
#endif
tensor.h:/*This file is important that must be included*/#ifndef TENSOR_H#define TENSOR_H#define GPU_EN //Enable GPU or parallel-computing-device#define STATIC_TENSOR 0 //tensors have the...