NV_NVDLA_cdp.cpp
reset_stats_regs
// Clears every statistics counter kept by the CDP model so that a new run
// starts counting from zero. The counters are reported by update_stats_regs().
void NV_NVDLA_cdp::reset_stats_regs() {
    // NaN / Inf counters.
    nan_input_num  = 0;
    inf_input_num  = 0;
    nan_output_num = 0;

    // Output-converter overflow counter.
    o_cvt_o_flow = 0;

    // LUT counters: underflow, overflow and the three hit counters.
    lut_u_flow     = 0;
    lut_o_flow     = 0;
    lut_le_hit     = 0;
    lut_lo_hit     = 0;
    lut_hybrid_hit = 0;
}
这是 `NV_NVDLA_cdp` 类中 `reset_stats_regs` 函数的定义部分。这个函数用于重置与统计相关的寄存器或变量,以便在统计新的数据之前清除之前的统计信息。以下是函数中执行的重置操作:
- `lut_o_flow`:将 LUT(查找表)的上溢(overflow)计数器重置为 0。
- `lut_u_flow`:将 LUT 的下溢(underflow)计数器重置为 0。
- `lut_le_hit`:将 LE 表命中计数器重置为 0。这里的 LE 指 NVDLA 查找表中的 Linear/Exponential(线性/指数)表,而不是"小于等于"。
- `lut_lo_hit`:将 LO 表命中计数器重置为 0。这里的 LO 指 Linear Only(纯线性)表,而不是"小于"。
- `lut_hybrid_hit`:将 LUT 的混合(hybrid)命中计数器重置为 0。
- `o_cvt_o_flow`:将输出转换器的溢出计数器重置为 0。
- `nan_input_num`:将输入数据中的 NaN(非数字)数量计数器重置为 0。
- `inf_input_num`:将输入数据中的 Inf(无穷大)数量计数器重置为 0。
- `nan_output_num`:将输出数据中的 NaN 数量计数器重置为 0。

这些重置操作可以用于在开始新的数据处理任务之前清除先前任务的统计信息,以确保统计信息的准确性。
update_stats_regs
// Reports the statistics counters: logs them through cslInfo, then writes
// them into the CDP status registers via CdpUpdateStatusRegister, passing
// cdp_consumer_ along with every write.
void NV_NVDLA_cdp::update_stats_regs() {
    // Log the NaN/Inf and LUT counters (o_cvt_o_flow is not logged).
    cslInfo(("NV_NVDLA_cdp, nan_input_num:%d\n", nan_input_num));
    cslInfo(("NV_NVDLA_cdp, inf_input_num:%d\n", inf_input_num));
    cslInfo(("NV_NVDLA_cdp, nan_output_num:%d\n", nan_output_num));
    cslInfo(("NV_NVDLA_cdp, lut_u_flow:%d\n", lut_u_flow));
    cslInfo(("NV_NVDLA_cdp, lut_o_flow:%d\n", lut_o_flow));
    cslInfo(("NV_NVDLA_cdp, lut_le_hit:%d\n", lut_le_hit));
    cslInfo(("NV_NVDLA_cdp, lut_lo_hit:%d\n", lut_lo_hit));
    cslInfo(("NV_NVDLA_cdp, lut_hybrid_hit:%d\n", lut_hybrid_hit));

    // NaN / Inf / output-saturation status registers.
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_NAN_INPUT_NUM_0), cdp_consumer_,
        static_cast<uint32_t>(nan_input_num));
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_INF_INPUT_NUM_0), cdp_consumer_,
        static_cast<uint32_t>(inf_input_num));
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_NAN_OUTPUT_NUM_0), cdp_consumer_,
        static_cast<uint32_t>(nan_output_num));
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_OUT_SATURATION_0), cdp_consumer_,
        static_cast<uint32_t>(o_cvt_o_flow));

    // LUT performance registers. These are written unconditionally: a guard on
    // cdp_lut_en_ == NVDLA_CDP_D_PERF_ENABLE_LUT_EN_ENABLE used to surround
    // them but is disabled.
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_PERF_LUT_UFLOW_0), cdp_consumer_,
        static_cast<uint32_t>(lut_u_flow));
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_PERF_LUT_OFLOW_0), cdp_consumer_,
        static_cast<uint32_t>(lut_o_flow));
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_PERF_LUT_HYBRID_0), cdp_consumer_,
        static_cast<uint32_t>(lut_hybrid_hit));
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_PERF_LUT_LE_HIT_0), cdp_consumer_,
        static_cast<uint32_t>(lut_le_hit));
    cdp_reg_model::CdpUpdateStatusRegister(
        static_cast<uint32_t>(NVDLA_CDP_D_PERF_LUT_LO_HIT_0), cdp_consumer_,
        static_cast<uint32_t>(lut_lo_hit));
}
`update_stats_regs` 函数用于更新统计相关的寄存器的值,以便在数据处理任务完成后记录统计信息。以下是函数执行的操作:
- 通过 `cslInfo` 打印输入数据中的 NaN(非数字)数量、Inf(无穷大)数量、输出数据中的 NaN 数量,以及 LUT 的下溢、上溢、LE 命中、LO 命中和混合命中计数,以便进行调试和记录(`o_cvt_o_flow` 没有被打印)。
- 更新输入数据中的 NaN 数量、Inf 数量、输出数据中的 NaN 数量以及输出饱和数量到相应的寄存器中。这些寄存器包括:
  - `NVDLA_CDP_D_NAN_INPUT_NUM_0`:用于记录输入数据中的 NaN 数量。
  - `NVDLA_CDP_D_INF_INPUT_NUM_0`:用于记录输入数据中的 Inf 数量。
  - `NVDLA_CDP_D_NAN_OUTPUT_NUM_0`:用于记录输出数据中的 NaN 数量。
  - `NVDLA_CDP_D_OUT_SATURATION_0`:用于记录输出数据中的溢出(饱和)数量,其值来自 `o_cvt_o_flow`。
- 更新与 LUT(查找表)相关的统计信息寄存器。注意:代码中 `cdp_lut_en_ == NVDLA_CDP_D_PERF_ENABLE_LUT_EN_ENABLE` 这一判断已经被注释掉,因此无论是否启用 LUT 功能,这些寄存器都会被更新。这些寄存器包括:
  - `NVDLA_CDP_D_PERF_LUT_UFLOW_0`:用于记录 LUT 下溢的数量。
  - `NVDLA_CDP_D_PERF_LUT_OFLOW_0`:用于记录 LUT 上溢的数量。
  - `NVDLA_CDP_D_PERF_LUT_HYBRID_0`:用于记录 LUT 混合命中的数量。
  - `NVDLA_CDP_D_PERF_LUT_LE_HIT_0`:用于记录 LE(Linear/Exponential,线性/指数)表命中的数量。
  - `NVDLA_CDP_D_PERF_LUT_LO_HIT_0`:用于记录 LO(Linear Only,纯线性)表命中的数量。

这些操作用于在数据处理任务完成后记录各种统计信息,以便后续的分析和监视。
CdpDataPathReadphileSequence
// Wraps one data-path run with statistics handling: the counters are cleared
// first, CdpDataPathSequence() does the processing, and the counters are then
// logged and written to the status registers.
void NV_NVDLA_cdp::CdpDataPathReadphileSequence() {
    reset_stats_regs();     // start from zeroed counters
    CdpDataPathSequence();  // process the cube
    update_stats_regs();    // report the counters
}
`CdpDataPathReadphileSequence` 函数用于执行 CDP(Cross-channel Data Processor,跨通道数据处理器)数据路径的读操作序列。具体而言,它依次执行以下操作:
- 调用 `reset_stats_regs` 函数来重置统计相关的寄存器或变量,以准备记录新的统计信息。
- 调用 `CdpDataPathSequence` 函数执行 CDP 数据路径的操作,包括数据处理和计算等。
- 调用 `update_stats_regs` 函数来更新统计信息的寄存器或变量,以记录数据处理任务完成后的统计信息,包括 NaN 数量、Inf 数量、LUT(查找表)相关的统计信息等。

通过这些操作,函数实现了 CDP 数据路径的读操作序列,并记录了相关的统计信息,以供后续的分析和监视使用。
CdpDataPathWritephileSequence & CdpDataPathOrdinarySequence
// The two sequences below have empty bodies in this model. The CTC SKIP /
// ENDSKIP pragmas bracket them for the code-coverage tool; they do not change
// how the functions are compiled or called.
#pragma CTC SKIP
// Intentionally empty.
void NV_NVDLA_cdp::CdpDataPathWritephileSequence() {}

// Intentionally empty.
void NV_NVDLA_cdp::CdpDataPathOrdinarySequence() {}
#pragma CTC ENDSKIP
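`CdpDataPathWritephileSequence` 和 `CdpDataPathOrdinarySequence` 这两个函数的函数体都是空的,并且被 `#pragma CTC SKIP` / `#pragma CTC ENDSKIP` 包围。这对 pragma 是代码覆盖率工具(Testwell CTC++)的指令,作用是让覆盖率工具不对这段代码进行插桩和统计;它并不会让编译器跳过这段代码,也不会阻止函数被调用,因此不是"注释掉代码"的手段。这两个函数目前只是空实现(占位),调用它们不会产生任何效果。如果后续需要这些功能,直接在函数体中补充实现即可,是否保留覆盖率跳过标记可以另行决定。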
CdpDataPathSequence
// Runs the CDP data path over one cube.
//
// Reads the cube dimensions from cdp_fifo_cfg_dp_, then, for every line and
// every stripe, pulls ATOM_CUBE_SIZE-byte atoms from rdma_fifo_ into
// dp_calc_buffer. For each round (one step along the channel direction) and
// each atom it assembles a lookup window in data_to_hls:
//   [0..3]                       4 left-neighbour elements
//   [4..4+parallel_num-1]        parallel_num current elements
//   [4+parallel_num .. +3]       4 right-neighbour elements
// Window positions that fall outside the cube's channel range are filled with
// cdp_datin_offset_. The window is handed to lookup_lut()/lookup_lut_int8()
// and a newly allocated result buffer is written to hls_out_fifo_.
//
// Side effects visible in this function: consumes rdma_atom_num_fifo_ and
// rdma_fifo_ entries, produces hls_atom_num_fifo_ and hls_out_fifo_ entries,
// increments nan_output_num for FP16 NaN results, and forces
// cdp_datin_offset_ to 0 when the input type is FP16.
void NV_NVDLA_cdp::CdpDataPathSequence() {
// Config variables, they have corresponding value in registers
uint32_t cube_width, cube_height, cube_channel;
uint8_t element_per_group_src;
// Control variables
// # Iterators
uint32_t width_iter;
uint32_t surf_iter;
uint32_t surf_num;
uint32_t line_iter;
uint32_t round_iter;
uint32_t atom_num;
uint32_t nxt_atom_num;
uint32_t atom_iter;
uint32_t nxt_atom_iter;
uint32_t processed_atom_num;
uint32_t atom_num_per_line;
uint32_t hls_out_num_;
uint32_t hls_lookup_num_;
// # Evaluated variable
//uint64_t payload_addr;
//uint32_t payload_size;
//uint32_t payload_atom_num;
//uint8_t element_size_src;
// dp_calc_buffer is used as two halves. cur_calc_buffer points at the half
// holding the surface being processed; pre_calc_buffer and nxt_calc_buffer
// always point at the other half (see the pointer swap further down).
int8_t *first_half_calc_buffer;
int8_t *second_half_calc_buffer;
int8_t *cur_calc_buffer;
int8_t *pre_calc_buffer;
int8_t *nxt_calc_buffer;
// Variables for HLS
// Input
// uint16_t data0, data1, data2, data3;
// Lookup window: 12 int16 elements, or 16 int8 elements when accessed through
// data_to_hls_i8 (both views share the same 24 bytes of storage).
int16_t data_to_hls[12];
int8_t *data_to_hls_i8 = (int8_t*)(&data_to_hls[0]);
bool is_int8;
// Start of the current atom inside cur_calc_buffer.
int8_t *byte;
uint32_t cdp_width_cnt, cdp_channel_cnt;
bool cdp_channel_last;
uint32_t round_num;
// Output (unused, kept as commented-out declarations)
//uint32_t cdp_width_out, cdp_channel_cnt_out;
//bool cdp_channel_last_out;
int8_t *rdma_data_ptr;
CdpConfig *cfg;
uint32_t parallel_num;
// Copy from register value to local config variables, similar with RTL connection, begin
// # Cube setting
// The config object read from the FIFO is deleted here once the dimensions
// have been copied. The +1 presumably converts from a (size - 1) register
// encoding — confirm against the register spec.
cfg = cdp_fifo_cfg_dp_->read();
cube_width = cfg->cdp_rdma_width_+1;
cube_height = cfg->cdp_rdma_height_+1;
cube_channel = cfg->cdp_rdma_channel_+1;
delete cfg;
// # Precision setting
// parallel_num is the number of "current" elements handled per round:
// 8 for INT8, 4 for INT16/FP16.
// NOTE(review): element_per_group_src and parallel_num stay uninitialized if
// cdp_input_data_type_ matches none of the cases below (default branch).
is_int8 = false;
switch (cdp_input_data_type_) {
case DATA_FORMAT_IS_INT8: {
element_per_group_src = ELEMENT_PER_GROUP_INT8;
is_int8 = true;
parallel_num = 8;
break;
}
case DATA_FORMAT_IS_INT16: {
element_per_group_src = ELEMENT_PER_GROUP_INT16;
parallel_num = 4;
break;
}
case DATA_FORMAT_IS_FP16: {
element_per_group_src = ELEMENT_PER_GROUP_FP16;
// For FP16 the member cdp_datin_offset_ (also used below as the padding
// value) is overwritten with 0.
cdp_datin_offset_ = 0;
parallel_num = 4;
break;
}
#pragma CTC SKIP
default: break;
#pragma CTC ENDSKIP
}
// Split dp_calc_buffer into two halves of 8*ATOM_CUBE_SIZE bytes each.
first_half_calc_buffer = dp_calc_buffer;
second_half_calc_buffer = dp_calc_buffer + 8*ATOM_CUBE_SIZE;
// Number of surfaces = ceil(cube_channel / element_per_group_src).
surf_num = (cube_channel+element_per_group_src-1) / element_per_group_src;
surf_iter = 0;
width_iter = 0; // width of current stripe
processed_atom_num = 0;
hls_out_num_ = 0;
hls_lookup_num_ = 0;
// Total rounds per stripe: every surface contributes ATOM_CUBE_SIZE/8 rounds.
round_num = surf_num*ATOM_CUBE_SIZE/8;
cslInfo(("%s WxHxC=%dx%dx%d, format:%d\n", __FUNCTION__,
cube_width, cube_height, cube_channel,cdp_input_data_type_));
cslInfo(("\t surf_num:%d, round_num:%d\n", surf_num, round_num));
for (line_iter=0; line_iter < cube_height; line_iter++) {
processed_atom_num = 0;
atom_num_per_line = cube_width * surf_num;
while(processed_atom_num < atom_num_per_line) { // While loop to process the stripes in each line
// Process a stripe
cur_calc_buffer = first_half_calc_buffer; // size of dp_calc_buffer is 8*ATOM_CUBE_SIZE*2
nxt_calc_buffer = pre_calc_buffer = second_half_calc_buffer;
// Copy the first 256B data (may be less than 256B) to cur_calc_buffer
// The atom count of the stripe comes from rdma_atom_num_fifo_ and is forwarded
// unchanged to hls_atom_num_fifo_.
atom_num = rdma_atom_num_fifo_->read();
processed_atom_num += atom_num;
cslDebug((50, "CDP line_iter=%d processed_atom_num=%d\n", line_iter, processed_atom_num));
hls_atom_num_fifo_->write(atom_num);
// Each buffer read from rdma_fifo_ is copied and then freed here.
for (atom_iter = 0; atom_iter < atom_num; atom_iter++) {
rdma_data_ptr = rdma_fifo_->read();
memcpy (&cur_calc_buffer[atom_iter*ATOM_CUBE_SIZE], rdma_data_ptr, ATOM_CUBE_SIZE); // pre_calc_buffer is 8*32*2B
delete [] rdma_data_ptr;
}
// NOTE(review): the round_iter%4 arithmetic and the byte offsets (16, 20, 24,
// 28) below assume 4 rounds per atom, i.e. ATOM_CUBE_SIZE == 32 — confirm.
for (round_iter=0; round_iter<round_num; round_iter++) { // Each round produces 8B result from HLS module (in Channel direction)
for (atom_iter=0; atom_iter<atom_num; atom_iter++) {
byte = &cur_calc_buffer[atom_iter*ATOM_CUBE_SIZE];
if(round_iter==0) { // First surf, padding the first 8B
// Window = 4 padding elements + the leading bytes of the current atom.
if (is_int8) {
// Each data is 1Byte
data_to_hls_i8[0] = data_to_hls_i8[1] =
data_to_hls_i8[2] = data_to_hls_i8[3] = (int8_t)cdp_datin_offset_;
memcpy(&data_to_hls_i8[4], byte, 12);
} else {
// Each data is 2Bytes
data_to_hls[0] = data_to_hls[1] =
data_to_hls[2] = data_to_hls[3] = cdp_datin_offset_;
memcpy(&data_to_hls[4], byte, 16);
}
}
else if(round_iter==(round_num-1)) { // Last round of last surf. Padding the last 8B
// Window = trailing bytes of the current atom + 4 padding elements.
if (is_int8) {
memcpy(data_to_hls_i8, &byte[16+4], 12);
data_to_hls_i8[12] = data_to_hls_i8[13] =
data_to_hls_i8[14] = data_to_hls_i8[15] = (int8_t)cdp_datin_offset_;
} else {
// Each data is 2Bytes
memcpy(data_to_hls, &byte[16], 16);
data_to_hls[8] = data_to_hls[9] =
data_to_hls[10] = data_to_hls[11] = cdp_datin_offset_;
}
}
else if((round_iter%4)==1 || (round_iter%4)==2) {
// Middle rounds: the whole window is taken from the current atom.
if (is_int8) {
memcpy(data_to_hls_i8, &byte[((round_iter%4)-1)*8 + 4], 16);
} else {
memcpy(data_to_hls, &byte[((round_iter%4)-1)*8], 24);
}
}
else if(round_iter%4==0) {
// First round of a later surface: the left neighbours come from the end of
// the same atom position in pre_calc_buffer.
if(cdp_sqsum_bypass_ && atom_iter==0) {
// When cdp_sqsum_bypass_ is set, the atoms are fetched here, once per round
// (only for atom_iter == 0).
// NOTE(review): the original comment said "Copy from rdma_fifo_ to nxt
// buffer", but the memcpy below targets cur_calc_buffer, unlike the
// non-bypass path in the (round_iter%4)==3 branch — confirm this is intended.
nxt_atom_num = rdma_atom_num_fifo_->read();
processed_atom_num += nxt_atom_num;
cslDebug((50, "CDP line_iter=%d processed_atom_num=%d\n", line_iter, processed_atom_num));
#pragma CTC SKIP
if(nxt_atom_num!=atom_num) {
FAIL((("atom_num should be same in same stripe. atom_num=%d, nxt_atom_num=%d"), atom_num, nxt_atom_num));
}
#pragma CTC ENDSKIP
hls_atom_num_fifo_->write(nxt_atom_num);
for (nxt_atom_iter = 0; nxt_atom_iter < nxt_atom_num; nxt_atom_iter++) {
rdma_data_ptr = rdma_fifo_->read();
memcpy (&cur_calc_buffer[nxt_atom_iter*ATOM_CUBE_SIZE], rdma_data_ptr, ATOM_CUBE_SIZE); // pre_calc_buffer is 8*32*2B
delete [] rdma_data_ptr;
}
}
if (is_int8) {
memcpy(data_to_hls_i8, &pre_calc_buffer[atom_iter*ATOM_CUBE_SIZE + 24 + 4], 4);
memcpy(&data_to_hls_i8[4], byte, 12);
} else {
memcpy(data_to_hls, &pre_calc_buffer[atom_iter*ATOM_CUBE_SIZE + 24], 8);
memcpy(&data_to_hls[4], byte, 16);
}
}
else if((round_iter%4)==3) {
// Last round, but not the last round of the last surf
if(!cdp_sqsum_bypass_ && atom_iter==0) {
// Copy from rdma_fifo_ to nxt buffer
// Done once per round (atom_iter == 0) so the right neighbours are available
// for every atom of this round.
nxt_atom_num = rdma_atom_num_fifo_->read();
processed_atom_num += nxt_atom_num;
cslDebug((50, "CDP line_iter=%d processed_atom_num=%d\n", line_iter, processed_atom_num));
#pragma CTC SKIP
if(nxt_atom_num!=atom_num) {
FAIL((("atom_num should be same in same stripe. atom_num=%d, nxt_atom_num=%d"), atom_num, nxt_atom_num));
}
#pragma CTC ENDSKIP
hls_atom_num_fifo_->write(nxt_atom_num);
for (nxt_atom_iter = 0; nxt_atom_iter < nxt_atom_num; nxt_atom_iter++) {
rdma_data_ptr = rdma_fifo_->read();
memcpy (&nxt_calc_buffer[nxt_atom_iter*ATOM_CUBE_SIZE], rdma_data_ptr, ATOM_CUBE_SIZE); // pre_calc_buffer is 8*32*2B
delete [] rdma_data_ptr;
}
}
// Window = trailing bytes of the current atom + the first 4 elements of the
// same atom position in nxt_calc_buffer.
// NOTE(review): when cdp_sqsum_bypass_ is set nothing is prefetched into
// nxt_calc_buffer above, so the right-neighbour part is whatever that half
// currently holds — presumably ignored by the lookup in that mode; confirm.
if (is_int8) {
memcpy(data_to_hls_i8, &byte[16+4], 12);
memcpy(&data_to_hls_i8[12], &nxt_calc_buffer[atom_iter*ATOM_CUBE_SIZE], 4);
} else {
memcpy(data_to_hls, &byte[16], 16);
memcpy(&data_to_hls[8], &nxt_calc_buffer[atom_iter*ATOM_CUBE_SIZE], 8);
}
if(atom_iter==(atom_num-1)) { // The last atom of current round in current stripe
// Shift pointers cur_calc_buffer, pre_calc_buffer, nxt_calc_buffer
// cur and pre each flip to the other half; nxt then aliases pre again.
cur_calc_buffer = (cur_calc_buffer==first_half_calc_buffer)? second_half_calc_buffer: first_half_calc_buffer;
pre_calc_buffer = (pre_calc_buffer==first_half_calc_buffer)? second_half_calc_buffer: first_half_calc_buffer;
nxt_calc_buffer = pre_calc_buffer;
}
}
// The three sections below overwrite window elements whose channel index is
// >= cube_channel with cdp_datin_offset_.
//garbage in left padding
// Left part (indices 0..3): at most 4 elements can be out of range.
if ((round_iter+0)*parallel_num > cube_channel) {
int garbage_element_num = std::min((round_iter+0)*parallel_num - cube_channel, (uint32_t)4);
int valid_element_num = 4 - garbage_element_num;
if (is_int8) {
for(int i = 0; i < garbage_element_num; i++) {
data_to_hls_i8[0 + valid_element_num + i] = (int8_t)cdp_datin_offset_;
}
} else {
for(int i = 0; i < garbage_element_num; i++) {
data_to_hls[0 + valid_element_num + i] = cdp_datin_offset_;
}
}
}
//garbage in current data
// Current part (indices 4..4+parallel_num-1).
if ((round_iter+1)*parallel_num > cube_channel) {
int garbage_element_num = std::min((round_iter+1)*parallel_num - cube_channel, parallel_num);
int valid_element_num = parallel_num - garbage_element_num;
if (is_int8) {
for(int i = 0; i < garbage_element_num; i++) {
data_to_hls_i8[4 + valid_element_num + i] = (int8_t)cdp_datin_offset_;
}
} else {
for(int i = 0; i < garbage_element_num; i++) {
data_to_hls[4 + valid_element_num + i] = cdp_datin_offset_;
}
}
}
//garbage in right padding
// Right part (indices 4+parallel_num .. 4+parallel_num+3). The loop bound
// 4 - valid_element_num is <= 0 when at least 4 elements of the following
// group are valid, in which case nothing is overwritten.
if ((round_iter+2)*parallel_num > cube_channel) {
int garbage_element_num = std::min((round_iter+2)*parallel_num - cube_channel, parallel_num);
int valid_element_num = parallel_num - garbage_element_num;
if (is_int8) {
for(int i = 0; i < 4 - valid_element_num; i++) {
data_to_hls_i8[4 + parallel_num + valid_element_num + i] = (int8_t)cdp_datin_offset_;
}
} else {
for(int i = 0; i < 4 - valid_element_num; i++) {
data_to_hls[4 + parallel_num + valid_element_num + i] = cdp_datin_offset_;
}
}
}
// Position bookkeeping; within this function these three values are only
// passed to the cslDebug call below.
cdp_width_cnt = width_iter + atom_iter;
cdp_channel_cnt = surf_iter*ATOM_CUBE_SIZE/8; // Each channel is 8B. Each surf is 32B.
if(cdp_channel_cnt==(surf_num-1)*ATOM_CUBE_SIZE/8)
cdp_channel_last = true;
else
cdp_channel_last = false;
hls_lookup_num_++;
cslDebug((70, "cdp_width_cnt=%d cdp_channel_cnt=%d cdp_channel_last=%d round_iter=%d data_to_hls:\n",
cdp_width_cnt, cdp_channel_cnt, cdp_channel_last, round_iter));
if(is_int8)
{
for(int i=0;i<16;i++) {
cslDebug((70, " %02x", data_to_hls_i8[i] & 0xff));
}
}
else
{
// Besides dumping the window, this loop also modifies it: with
// cdp_nan_to_zero_ set, NaN elements (exponent 0x1f, non-zero fraction) are
// replaced by 0 before the lookup. The value is logged before the flush.
// NOTE(review): this check uses DATA_FORMAT_FP16 while the rest of the
// function uses DATA_FORMAT_IS_FP16 — confirm both name the same value.
for(int i=0;i<12;i++) {
cslDebug((70, " %04x", data_to_hls[i] & 0xffff));
if(cdp_input_data_type_ == DATA_FORMAT_FP16)
{
uint32_t exp = (data_to_hls[i] >> 10) & 0x1f;
uint32_t frac = data_to_hls[i] & 0x3ff;
if(cdp_nan_to_zero_ && exp == 0x1f && frac != 0) data_to_hls[i] = 0; //nan flush to zero
}
}
}
cslDebug((70, "\n"));
// A fresh result buffer is allocated for every lookup and handed to
// hls_out_fifo_; it is not freed in this function. lookup_lut()/
// lookup_lut_int8() presumably fill normalz_out/normalz_out_int8 — confirm.
// NOTE(review): the %04x debug prints of the results below do not mask the
// signed values, unlike the window dump above, so negative values print
// sign-extended.
if (is_int8) {
normalz_out_int8 = new int8_t[8];
lookup_lut_int8(data_to_hls_i8, 8);
hls_out_fifo_->write((int16_t*)normalz_out_int8); //8B
cslDebug((50, "normalz_out:"));
for(int i=0;i<8;i++) {
cslDebug((50, " %04x", normalz_out_int8[i]));
}
cslDebug((50, "\n"));
} else {
normalz_out = new int16_t[4];
lookup_lut(data_to_hls, 4);
hls_out_fifo_->write(normalz_out); //8B
cslDebug((50, "normalz_out:"));
for(int i=0;i<4;i++) {
if(cdp_input_data_type_ == DATA_FORMAT_IS_FP16)
{
// Count FP16 NaN results: exponent field 0x1f with a non-zero fraction.
uint32_t exp = (normalz_out[i] >> 10) & 0x1f;
uint32_t frac = normalz_out[i] & 0x3ff;
if(exp == 0x1f && frac != 0)
{
nan_output_num++;
}
}
cslDebug((50, " %04x", normalz_out[i]));
}
cslDebug((50, "\n"));
}
hls_out_num_++;
cslDebug((50, "hls_lookup_num_=%d\n", hls_lookup_num_));
cslDebug((50, "hls_out_num_=%d\n", hls_out_num_));
cslDebug((50, "\n"));
} // atom_iter
} // round_iter
// Finished processing one stripe (last surf in stripe)
// Advance surf_iter/width_iter; both wrap back to 0 at their upper bound.
if (surf_iter == (surf_num-1)) {
surf_iter = 0;
if ((width_iter + atom_num) == cube_width) {
width_iter = 0;
}
else
width_iter += atom_num;
}
else
surf_iter++;
} // stripe iter
} // line_iter
cslInfo(("%s finished process of current layer\n", __FUNCTION__));
}
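这段代码实现了 CDP 数据路径的处理序列。下面是代码的主要结构和功能:
- 代码首先从 `cdp_fifo_cfg_dp_` 读取配置,得到立方体的宽度、高度和通道数,并根据输入数据类型 `cdp_input_data_type_`(INT8/INT16/FP16)设置每组元素数和并行度等参数。
- 然后,代码通过迭代处理输入立方体的不同行和条带。外部循环遍历立方体的不同行(`line_iter`),内部的 `while` 循环处理每一行中的各个条带;在每个条带内,再按轮次(`round_iter`,沿通道方向)和原子(`atom_iter`)迭代。这些循环一起确保了整个立方体的处理。
- 在每个条带内部,代码处理一个条带的数据。它首先从 RDMA 模块(`rdma_fifo_`)读取一定数量的原始数据,然后将数据送入 HLS 模块(`lookup_lut` / `lookup_lut_int8`)进行计算。计算结果写入 `hls_out_fifo_` 中。
- 在处理每个原子(atom)时,代码会对数据进行预处理:把当前数据及其在通道方向上左右相邻的数据拼成一个窗口(`data_to_hls`),再传递给 HLS 模块。HLS 模块处理数据后,结果被写入 `hls_out_fifo_` 中。
- 在处理每个原子时,还会执行一些数据的填充和清除操作,以确保数据的正确性:超出通道范围的位置用 `cdp_datin_offset_` 填充;对 FP16 输入,当 `cdp_nan_to_zero_` 有效时把 NaN 清零。
- 在处理过程中,代码还会累加统计数据:对 FP16 的输出逐个检查是否为 NaN,并累加 `nan_output_num`。其他计数器(如 NaN 输入数量)不在本函数中直接更新;这些计数随后由 `update_stats_regs` 写入寄存器。

总之,这段代码实现了 CDP 数据路径的操作,包括数据的预处理、HLS 模块的调用、数据的填充和清除以及统计信息的更新。这些操作用于将输入数据转换为最终的输出数据。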