5.13
A.
画图:
关键路径为第三幅图加粗部分
B.
下界为浮点加法的延迟界限,CPE 为 3.00
C.
下界为整数加法的延迟界限,CPE 为 1.00
D.
关键路径上只有浮点加法;浮点乘法不在关键路径上,各次乘法之间没有数据相关,可以流水线化执行,所以 CPE 由浮点加法的延迟(3.00)决定
5.14
/*
 * Inner product of u and v using 6x1 loop unrolling: each pass of the main
 * loop folds six element products into one accumulator.
 *
 * u, v - input vectors. The element count is read from u only, so v is
 *        assumed to hold at least as many elements (TODO confirm with callers).
 * dest - receives the final sum.
 */
void inner5(vec_ptr u, vec_ptr v, data_t *dest) {
    long i;
    long length = vec_length(u);
    long limit = length - 5;
    /* Use the same accessor for both vectors (the u lookup previously
     * called the misspelled name get_ver_start). */
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t)0;

    /* i < limit guarantees that i + 5 is still a valid index. */
    for (i = 0; i < limit; i += 6) {
        sum = sum + udata[i] * vdata[i] +
              udata[i + 1] * vdata[i + 1] +
              udata[i + 2] * vdata[i + 2] +
              udata[i + 3] * vdata[i + 3] +
              udata[i + 4] * vdata[i + 4] +
              udata[i + 5] * vdata[i + 5];
    }

    /* Finish the elements left over by the unrolled loop. */
    for (; i < length; i++) {
        sum = sum + udata[i] * vdata[i];
    }
    *dest = sum;
}
A.
任何标量版本都无法达到比 1.00 更小的 CPE,是因为每计算一个元素需要加载两个值(udata[i] 和 vdata[i]),而处理器只有两个加载单元,每个时钟周期最多只能加载两个值;因此即使流水线是满的,每个周期最多也只能处理一个元素,CPE 的下限为 1.00
B.
无论循环展开多少次,关键路径上还是有 length 个浮点加法,所以 CPE 仍然受浮点加法延迟界限的限制
5.15
/*
 * Inner product of u and v using 6x6 loop unrolling: the main loop handles
 * six elements per pass, each feeding its own accumulator, and the six
 * partial sums are combined at the end.
 *
 * u, v - input vectors. The element count is read from u only, so v is
 *        assumed to hold at least as many elements (TODO confirm with callers).
 * dest - receives the final sum.
 */
void inner6(vec_ptr u, vec_ptr v, data_t *dest) {
    long i;
    long length = vec_length(u);
    long limit = length - 5;
    /* Use the same accessor for both vectors (the u lookup previously
     * called the misspelled name get_ver_start). */
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum0 = (data_t)0;
    data_t sum1 = (data_t)0;
    data_t sum2 = (data_t)0;
    data_t sum3 = (data_t)0;
    data_t sum4 = (data_t)0;
    data_t sum5 = (data_t)0;

    /* i < limit guarantees that i + 5 is still a valid index. The six
     * updates do not depend on one another. */
    for (i = 0; i < limit; i += 6) {
        sum0 = sum0 + udata[i] * vdata[i];
        sum1 = sum1 + udata[i + 1] * vdata[i + 1];
        sum2 = sum2 + udata[i + 2] * vdata[i + 2];
        sum3 = sum3 + udata[i + 3] * vdata[i + 3];
        sum4 = sum4 + udata[i + 4] * vdata[i + 4];
        sum5 = sum5 + udata[i + 5] * vdata[i + 5];
    }

    /* Finish the elements left over by the unrolled loop. */
    for (; i < length; i++) {
        sum0 = sum0 + udata[i] * vdata[i];
    }
    *dest = sum0 + sum1 + sum2 + sum3 + sum4 + sum5;
}
每计算一个元素需要加载两个值,而处理器只有两个加载单元,一个时钟周期最多只能加载两个值,所以 CPE 最低只能到 1.00
5.16
/*
 * Inner product of u and v using 6x1a loop unrolling: six element products
 * per pass are summed right-to-left first, and only the final addition
 * involves the running accumulator.
 *
 * u, v - input vectors. The element count is read from u only, so v is
 *        assumed to hold at least as many elements (TODO confirm with callers).
 * dest - receives the final sum.
 *
 * NOTE(review): the reassociation changes the order of additions, so for
 * floating-point data_t the result may differ in rounding from a strictly
 * left-to-right sum.
 */
void inner7(vec_ptr u, vec_ptr v, data_t *dest) {
    long i;
    long length = vec_length(u);
    long limit = length - 5;
    /* Use the same accessor for both vectors (the u lookup previously
     * called the misspelled name get_ver_start). */
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t)0;

    /* i < limit guarantees that i + 5 is still a valid index. */
    for (i = 0; i < limit; i += 6) {
        sum = sum + (udata[i] * vdata[i] +
                     (udata[i + 1] * vdata[i + 1] +
                      (udata[i + 2] * vdata[i + 2] +
                       (udata[i + 3] * vdata[i + 3] +
                        (udata[i + 4] * vdata[i + 4] +
                         udata[i + 5] * vdata[i + 5])))));
    }

    /* Finish the elements left over by the unrolled loop. */
    for (; i < length; i++) {
        sum = sum + udata[i] * vdata[i];
    }
    *dest = sum;
}
5.17
/*
 * Fill the first n bytes at s with the byte value (unsigned char)c, writing
 * one unsigned long at a time once the destination is word-aligned.
 *
 * s - destination buffer of at least n bytes.
 * c - fill value; only its low-order byte is used.
 * n - number of bytes to fill; 0 writes nothing.
 *
 * Returns s.
 */
void *new_memset(void *s, int c, size_t n) {
    /* Word size for the bulk stores. It must equal sizeof(unsigned long),
     * because the pattern below is built in an unsigned long. */
    const size_t word = sizeof(unsigned long);
    const unsigned char byte = (unsigned char)c;

    /* Build a word whose every byte equals the fill value. */
    unsigned long pattern;
    unsigned char *fill = (unsigned char *)&pattern;
    for (size_t k = 0; k < word; k++) {
        fill[k] = byte;
    }

    unsigned char *out = s;
    size_t done = 0;

    /* Head: single bytes until the destination is word-aligned, never
     * writing more than n bytes in total. */
    while (done < n && (size_t)out % word != 0) {
        *out++ = byte;
        done++;
    }

    /* Body: whole words while at least one full word remains. done <= n
     * holds here, so n - done cannot wrap around. */
    while (n - done >= word) {
        /* NOTE(review): storing through unsigned long * relies on the
         * caller's buffer tolerating word-typed access. */
        *(unsigned long *)out = pattern;
        out += word;
        done += word;
    }

    /* Tail: remaining bytes that do not fill a whole word. */
    while (done < n) {
        *out++ = byte;
        done++;
    }
    return s;
}
5.18
6×3a 循环展开
/*
 * Evaluate the polynomial a[0] + a[1]*x + ... + a[degree]*x^degree.
 *
 * The main loop consumes six coefficients per pass, split into three pairs.
 * Each pair has its own accumulator and its own running power of x, and all
 * three powers advance by x^6 per pass. A scalar loop handles the
 * coefficients that remain.
 *
 * a      - coefficient array with at least degree + 1 elements.
 * x      - evaluation point.
 * degree - highest exponent.
 *
 * Returns the sum of the three accumulators.
 */
double poly_6_3a(double a[], double x, long degree) {
    const double x3 = x * x * x;
    const double x5 = x3 * x * x;
    const double x6 = x5 * x;

    double acc0 = a[0];
    double acc1 = 0;
    double acc2 = 0;

    /* Powers of x matching a[k], a[k + 2] and a[k + 4] respectively. */
    double pw0 = x;
    double pw1 = x3;
    double pw2 = x5;

    long k = 1;
    while (k < degree - 5) {
        acc0 = acc0 + (a[k] * pw0 + a[k + 1] * pw0 * x);
        acc1 = acc1 + (a[k + 2] * pw1 + a[k + 3] * pw1 * x);
        acc2 = acc2 + (a[k + 4] * pw2 + a[k + 5] * pw2 * x);
        pw0 *= x6;
        pw1 *= x6;
        pw2 *= x6;
        k += 6;
    }

    /* pw0 equals x^k here; finish the remaining terms one at a time. */
    while (k <= degree) {
        acc0 = acc0 + a[k] * pw0;
        pw0 *= x;
        k++;
    }

    return acc0 + acc1 + acc2;
}
5.19
/*
 * Prefix sum: p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 *
 * The main loop handles four elements per pass. It computes the four
 * partial sums as a chain from the running total, then stores them. A
 * scalar loop handles the final n % 4 elements.
 *
 * a - input array of at least n elements.
 * p - output array of at least n elements.
 * n - element count; n <= 0 writes nothing.
 */
void psum_4_1a(float a[], float p[], long n) {
    float run = 0;
    long k = 0;

    while (k < n - 3) {
        /* All four inputs are read before any output is written. */
        const float s0 = run + a[k];
        const float s1 = s0 + a[k + 1];
        const float s2 = s1 + a[k + 2];
        const float s3 = s2 + a[k + 3];

        p[k] = s0;
        p[k + 1] = s1;
        p[k + 2] = s2;
        p[k + 3] = s3;

        run = s3;
        k += 4;
    }

    while (k < n) {
        run += a[k];
        p[k] = run;
        k++;
    }
}