/*
 * Inner product of u and v using 6x6 loop unrolling: six elements are
 * processed per iteration, each into its own independent accumulator.
 *
 * u, v  - input vectors; the element count is taken from u only.
 *         NOTE(review): assumes v holds at least vec_length(u) elements -
 *         confirm against callers.
 * dest  - receives the sum of udata[i] * vdata[i] over all i in [0, length).
 *
 * data_t, vec_length() and get_vec_start() are declared elsewhere.
 */
void inner6_6(vec_ptr u, vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(u);
    long limit = length - 5;    /* last index at which a full group of 6 fits */
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);

    /* One accumulator per unrolled lane so the additions do not depend on
     * each other. */
    data_t sum0 = (data_t) 0;
    data_t sum1 = (data_t) 0;
    data_t sum2 = (data_t) 0;
    data_t sum3 = (data_t) 0;
    data_t sum4 = (data_t) 0;
    data_t sum5 = (data_t) 0;

    /* Main loop: six products per iteration. */
    for (i = 0; i < limit; i += 6) {
        sum0 = sum0 + udata[i] * vdata[i];
        sum1 = sum1 + udata[i + 1] * vdata[i + 1];
        sum2 = sum2 + udata[i + 2] * vdata[i + 2];
        sum3 = sum3 + udata[i + 3] * vdata[i + 3];
        sum4 = sum4 + udata[i + 4] * vdata[i + 4];
        sum5 = sum5 + udata[i + 5] * vdata[i + 5];
    }

    /* Remaining 0..5 elements: fold into sum0 so they are part of the
     * final result. */
    for (; i < length; i++) {
        sum0 = sum0 + udata[i] * vdata[i];
    }

    *dest = sum0 + sum1 + sum2 + sum3 + sum4 + sum5;
}
循环中有6条关键路径,并行累积6个值。
限制性能达到CPE等于1.00的因素是寄存器的数量。想想浮点数寄存器一共有多少个?上面的每个sum运算要用几个寄存器?