5.13
A
关键路径是对%xmm0的加法(add)操作。
B
浮点数加法的延迟界限。
C
整数加法的延迟界限。
D
由于乘法并不在循环迭代间的关键路径上,下一次迭代的乘法开始时不需要等待上一次迭代的乘法结束,所以乘法操作可以一直处于流水线中并提前进行,乘法完成后将结果累加到当前迭代的sum即可;且由于乘法操作的吞吐量大于该程序的运行速度,所以该程序的瓶颈并非乘法。
5.14
/*
 * Inner product of u and v, six-way unrolled with a single accumulator
 * (exercise 5.14).
 *
 * Processes vec_length(u) elements and stores the sum of the element-wise
 * products in *dest. v is assumed to hold at least as many elements as u.
 */
void inner4(vec_ptr u, vec_ptr v, data_t* dest) {
    long i;
    long len = vec_length(u);
    data_t* udata = get_vec_start(u);
    data_t* vdata = get_vec_start(v);
    data_t sum = 0;

    /* Consume six elements per iteration while a full group of six remains. */
    for (i = 0; i < len - 5; i += 6) {
        sum += udata[i] * vdata[i] +
               udata[i + 1] * vdata[i + 1] +
               udata[i + 2] * vdata[i + 2] +
               udata[i + 3] * vdata[i + 3] +
               udata[i + 4] * vdata[i + 4] +
               udata[i + 5] * vdata[i + 5];
    }

    /* Finish the remaining (at most five) elements one at a time. */
    for (; i < len; i++) {
        sum += udata[i] * vdata[i];
    }

    *dest = sum;
}
A
每个时钟周期只能从内存中加载两个值,而每计算一个元素需要加载两个值(udata[i]和vdata[i]),所以CPE无法低于1.00。
B
浮点数加法限制了速度。
5.15
/*
 * Inner product of u and v, six-way unrolled with six independent
 * accumulators (exercise 5.15).
 *
 * Processes vec_length(u) elements and stores the sum of the element-wise
 * products in *dest. v is assumed to hold at least as many elements as u.
 */
void inner4(vec_ptr u, vec_ptr v, data_t* dest) {
    long i;
    long len = vec_length(u);
    data_t* udata = get_vec_start(u);
    data_t* vdata = get_vec_start(v);
    data_t sum0 = 0;
    data_t sum1 = 0;
    data_t sum2 = 0;
    data_t sum3 = 0;
    data_t sum4 = 0;
    data_t sum5 = 0;

    /*
     * Each accumulator takes every sixth element, so the six additions in
     * one iteration do not depend on each other.
     */
    for (i = 0; i < len - 5; i += 6) {
        sum0 += udata[i] * vdata[i];
        sum1 += udata[i + 1] * vdata[i + 1];
        sum2 += udata[i + 2] * vdata[i + 2];
        sum3 += udata[i + 3] * vdata[i + 3];
        sum4 += udata[i + 4] * vdata[i + 4];
        sum5 += udata[i + 5] * vdata[i + 5];
    }

    /* Finish the remaining (at most five) elements one at a time. */
    for (; i < len; i++) {
        sum0 += udata[i] * vdata[i];
    }

    /* Combine the partial sums into the final result. */
    sum0 += sum1 + sum2 + sum3 + sum4 + sum5;
    *dest = sum0;
}
1.00只是理论的下界,有多种其他因素限制了实际速度。
5.16
/*
 * Inner product of u and v, six-way unrolled with the six products summed
 * together before being added to the accumulator (exercise 5.16).
 *
 * Processes vec_length(u) elements and stores the sum of the element-wise
 * products in *dest. v is assumed to hold at least as many elements as u.
 */
void inner4(vec_ptr u, vec_ptr v, data_t* dest) {
    long i;
    long len = vec_length(u);
    data_t* udata = get_vec_start(u);
    data_t* vdata = get_vec_start(v);
    data_t sum = 0;

    /*
     * The parenthesised group is evaluated first, so only one addition per
     * iteration depends on the previous value of sum.
     */
    for (i = 0; i < len - 5; i += 6) {
        sum = sum +
              (udata[i] * vdata[i] +
               udata[i + 1] * vdata[i + 1] +
               udata[i + 2] * vdata[i + 2] +
               udata[i + 3] * vdata[i + 3] +
               udata[i + 4] * vdata[i + 4] +
               udata[i + 5] * vdata[i + 5]);
    }

    /* Finish the remaining (at most five) elements one at a time. */
    for (; i < len; i++) {
        sum += udata[i] * vdata[i];
    }

    *dest = sum;
}
5.17
/*
 * Fill the first n bytes at s with the low-order byte of c, writing one
 * unsigned long at a time wherever the destination is suitably aligned
 * (exercise 5.17).
 *
 * Returns s.
 */
void* basic_memset(void* s, int c, size_t n) {
    const size_t k = sizeof(unsigned long);
    const unsigned char byte = (unsigned char)c;
    unsigned char* schar = (unsigned char*)s;
    unsigned long* sqword;
    unsigned long qword = byte;
    size_t cnt = 0;
    size_t shift;

    /*
     * Replicate the byte into every byte of the word by doubling the filled
     * width each step. The shift count always stays below the bit width of
     * unsigned long, whatever its size.
     */
    for (shift = 8; shift < k * 8; shift <<= 1) {
        qword |= qword << shift;
    }

    /* Byte stores until the write pointer reaches a k-byte boundary. */
    while (cnt < n && (size_t)schar % k != 0) {
        *schar++ = byte;
        cnt++;
    }

    /* Word stores while at least one whole word remains (cnt <= n here). */
    sqword = (unsigned long*)schar;
    while (n - cnt >= k) {
        *sqword++ = qword;
        cnt += k;
    }

    /* Byte stores for the tail of fewer than k bytes. */
    schar = (unsigned char*)sqword;
    while (cnt < n) {
        *schar++ = byte;
        cnt++;
    }

    return s;
}
5.18
/*
 * Evaluate the polynomial a[0] + a[1]*x + ... + a[N-1]*x^(N-1), processing
 * ten coefficients per iteration with five accumulators (exercise 5.18).
 *
 * x, a and N are not declared in this function; they are taken from the
 * enclosing scope.
 *
 * Accumulator result_k collects the coefficients a[i + k] and a[i + 5 + k]
 * weighted by x^i and x^(i + 5), i.e. with an exponent that is k too small.
 * The missing factor x^k is applied once after the loops.
 */
double mul() {
    double result0 = 0;
    double result1 = 0;
    double result2 = 0;
    double result3 = 0;
    double result4 = 0;
    double xmul = x * x * x * x * x; /* x^5 */
    double xpwr = 1;                 /* x^i at the top of each iteration */
    double xpwr1 = xmul;             /* x^(i + 5) */
    int limit = N - 9;
    int i;

    for (i = 0; i < limit; i += 10) {
        result0 = result0 + (xpwr * a[i] + xpwr1 * a[i + 5]);
        result1 = result1 + (xpwr * a[i + 1] + xpwr1 * a[i + 6]);
        result2 = result2 + (xpwr * a[i + 2] + xpwr1 * a[i + 7]);
        result3 = result3 + (xpwr * a[i + 3] + xpwr1 * a[i + 8]);
        result4 = result4 + (xpwr * a[i + 4] + xpwr1 * a[i + 9]);
        /* i advances by 10, so both powers must advance by x^10. */
        xpwr = xpwr1 * xmul;
        xpwr1 = xpwr * xmul;
    }

    /* Remaining coefficients; xpwr equals x^i on entry. */
    for (; i < N; i++) {
        result0 += xpwr * a[i];
        xpwr *= x;
    }

    /* Apply the deferred x^k factor to each accumulator and combine. */
    result0 += x * result1 +
               x * x * result2 +
               x * x * x * result3 +
               x * x * x * x * result4;
    return result0;
}
5.19
两次展开:
/*
 * Prefix sum with two-way loop unrolling (exercise 5.19):
 * p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 *
 * Does nothing when n <= 0.
 */
void psum(float a[], float p[], long n) {
    long i;
    long limit = n - 1;
    float last_val, val0, val1;

    /* Nothing to compute, and a[0] / p[0] may not exist. */
    if (n <= 0) {
        return;
    }

    last_val = p[0] = a[0];

    /* Two outputs per iteration while both i and i + 1 are in range. */
    for (i = 1; i < limit; i += 2) {
        val0 = last_val + a[i];
        val1 = last_val + a[i] + a[i + 1];
        p[i] = val0;
        p[i + 1] = val1;
        last_val = val1;
    }

    /* At most one element is left; store it and keep the running sum. */
    for (; i < n; i++) {
        last_val = last_val + a[i];
        p[i] = last_val;
    }
}