CSAPP 第三版 第五章 家庭作业
自己做的 仅供参考 可能出现错误
注:5.18 5.19 mark一下
5.13
A. 略
B. 浮点加法的延迟界限,CPE 3.00
C. 整数加法的延迟界限,CPE 1.00(整数加法的延迟只有1个时钟周期)
D. 浮点数乘法与关键路径上的浮点数加法并行,浮点数乘法功能单元容量>1,所以浮点数乘法不会成为关键路径的阻碍。而关键路径上只有浮点加法,浮点数加法的延迟界限为3。
5.14
程序如下:
/*
 * Inner product with 6 x 1 loop unrolling (CS:APP 5.14).
 *
 * Multiplies u and v element by element and stores the sum of the products
 * in *dest. The main loop consumes six elements per iteration and folds
 * them into a single accumulator; a scalar loop handles the leftovers.
 *
 * The element count is taken from u only, and v is indexed over that same
 * range.
 */
void inner5(vec_ptr u, vec_ptr v, data_t *dest) {
    long i;
    long length = vec_length(u);
    long limit = length - 5; /* i < limit guarantees i + 5 < length */
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t)0;

    /*
     * The expression is left-associative, so every addition depends on the
     * previous one: six sequential additions per iteration.
     */
    for (i = 0; i < limit; i += 6) {
        sum = sum + udata[i] * vdata[i] +
              udata[i + 1] * vdata[i + 1] +
              udata[i + 2] * vdata[i + 2] +
              udata[i + 3] * vdata[i + 3] +
              udata[i + 4] * vdata[i + 4] +
              udata[i + 5] * vdata[i + 5];
    }

    /* Finish the elements that did not fill a group of six. */
    for (; i < length; i++) {
        sum = sum + udata[i] * vdata[i];
    }
    *dest = sum;
}
A. 当加法和乘法的功能单元都处于满流水状态时CPE最低,即合并运算达到吞吐量下界。对于整数运算,加法的吞吐量下界为0.5,乘法的吞吐量下界为1.0,故CPE=max{0.5,1.0}=1.0;对于浮点数运算,加法的吞吐量下界是1.0,乘法的吞吐量下界是0.5,故CPE=max{1.0,0.5}=1.0。此外,每个元素需要两次加载(udata[i] 和 vdata[i]),而加载单元只有2个,这同样把CPE限制在1.0。综上,CPE的下界是1.0。
B. 即使进行了 6 x 1 循环展开,每次迭代仍然要在关键路径上依次执行6次浮点加法:每次加法都依赖前一次加法的结果,无法在流水线中并行。循环展开只减少了循环开销,没有缩短关键路径,所以平均每个元素仍需要一次延迟为3的浮点加法,CPE仍为3.00。
5.15
/*
 * Inner product with 6 x 6 loop unrolling (CS:APP 5.15).
 *
 * Multiplies u and v element by element and stores the sum of the products
 * in *dest. The main loop consumes six elements per iteration, each feeding
 * its own accumulator, so the six additions of one iteration do not depend
 * on each other. A scalar loop handles the leftovers and the accumulators
 * are combined at the end.
 *
 * The element count is taken from u only, and v is indexed over that same
 * range.
 */
void inner6(vec_ptr u, vec_ptr v, data_t *dest) {
    long i;
    long length = vec_length(u);
    long limit = length - 5; /* i < limit guarantees i + 5 < length */
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum0 = (data_t)0;
    data_t sum1 = (data_t)0;
    data_t sum2 = (data_t)0;
    data_t sum3 = (data_t)0;
    data_t sum4 = (data_t)0;
    data_t sum5 = (data_t)0;

    for (i = 0; i < limit; i += 6) {
        sum0 = sum0 + udata[i] * vdata[i];
        sum1 = sum1 + udata[i + 1] * vdata[i + 1];
        sum2 = sum2 + udata[i + 2] * vdata[i + 2];
        sum3 = sum3 + udata[i + 3] * vdata[i + 3];
        sum4 = sum4 + udata[i + 4] * vdata[i + 4];
        sum5 = sum5 + udata[i + 5] * vdata[i + 5];
    }

    /* Finish the elements that did not fill a group of six. */
    for (; i < length; i++) {
        sum0 = sum0 + udata[i] * vdata[i];
    }
    *dest = sum0 + sum1 + sum2 + sum3 + sum4 + sum5;
}
加载单元只有2个,而每处理一个元素就需要两次加载(udata[i] 和 vdata[i]),因此每个时钟周期最多只能处理一个元素,CPE无法低于1.00。
5.16
/*
 * Inner product with 6 x 1a loop unrolling (CS:APP 5.16).
 *
 * Multiplies u and v element by element and stores the sum of the products
 * in *dest. The main loop consumes six elements per iteration; the six
 * products are summed among themselves first (right-nested parentheses) and
 * only the final addition involves the running accumulator. A scalar loop
 * handles the leftovers.
 *
 * The element count is taken from u only, and v is indexed over that same
 * range.
 */
void inner7(vec_ptr u, vec_ptr v, data_t *dest) {
    long i;
    long length = vec_length(u);
    long limit = length - 5; /* i < limit guarantees i + 5 < length */
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t)0;

    for (i = 0; i < limit; i += 6) {
        /* Only the outermost addition depends on the previous iteration. */
        sum = sum + (udata[i] * vdata[i] +
                     (udata[i + 1] * vdata[i + 1] +
                      (udata[i + 2] * vdata[i + 2] +
                       (udata[i + 3] * vdata[i + 3] +
                        (udata[i + 4] * vdata[i + 4] +
                         udata[i + 5] * vdata[i + 5])))));
    }

    /* Finish the elements that did not fill a group of six. */
    for (; i < length; i++) {
        sum = sum + udata[i] * vdata[i];
    }
    *dest = sum;
}
5.17
/*
 * Word-at-a-time memset (CS:APP 5.17).
 *
 * Fills the n bytes starting at s with the low-order byte of c and
 * returns s. Work is done in three phases:
 *   1. single-byte stores until the destination is word aligned,
 *   2. unsigned long stores while at least one full word remains,
 *   3. single-byte stores for the tail.
 * No phase writes outside [s, s + n), including for n == 0 and for n
 * smaller than one word.
 *
 * The word size is sizeof(unsigned long); the function does not rely on
 * an externally defined constant.
 *
 * NOTE(review): phase 2 stores through an unsigned long lvalue, so the
 * destination must be memory that may legally be accessed that way
 * (see the C effective-type rules) - confirm for each caller.
 */
void *new_memset(void *s, int c, size_t n) {
    const size_t word_size = sizeof(unsigned long);
    const unsigned char fill = (unsigned char)c;

    /* Build a word whose every byte equals the fill byte. */
    unsigned long w;
    unsigned char *wbytes = (unsigned char *)&w;
    for (size_t k = 0; k < word_size; k++) {
        wbytes[k] = fill;
    }

    unsigned char *schar = s;
    size_t i = 0;

    /* Phase 1: byte stores up to the next word boundary, never past n. */
    while (i < n && (size_t)schar % word_size != 0) {
        *schar++ = fill;
        i++;
    }

    /*
     * Phase 2: whole-word stores. i <= n always holds here, so n - i
     * cannot wrap around.
     */
    while (n - i >= word_size) {
        *(unsigned long *)schar = w;
        schar += word_size;
        i += word_size;
    }

    /* Phase 3: remaining bytes (fewer than one word). */
    while (i < n) {
        *schar++ = fill;
        i++;
    }
    return s;
}
5.18
/*
 * Polynomial evaluation with 6 x 3a unrolling (CS:APP 5.18).
 *
 * Returns a[0] + a[1]*x + a[2]*x^2 + ... + a[degree]*x^degree.
 * a must hold at least degree + 1 coefficients.
 *
 * The main loop consumes six coefficients per iteration, split into three
 * pairs that are summed into three separate accumulators. Each pair has
 * its own running power of x, and all three powers advance by x^6 per
 * iteration. A scalar loop handles the remaining coefficients.
 */
double poly_6_3a(double a[], double x, long degree) {
    long i = 1;
    double result0 = a[0];
    double result1 = 0;
    double result2 = 0;
    double xpwr0 = x;                 /* x^i       at the top of each iteration */
    double xpwr1 = x * x * x;         /* x^(i + 2) at the top of each iteration */
    double xpwr2 = x * x * x * x * x; /* x^(i + 4) at the top of each iteration */
    double xpwr_step = x * x * x * x * x * x;

    /* Runs while a full group a[i] .. a[i + 5] is available. */
    for (; i + 5 <= degree; i += 6) {
        result0 = result0 + (a[i] * xpwr0 + a[i + 1] * xpwr0 * x);
        result1 = result1 + (a[i + 2] * xpwr1 + a[i + 3] * xpwr1 * x);
        result2 = result2 + (a[i + 4] * xpwr2 + a[i + 5] * xpwr2 * x);
        xpwr0 *= xpwr_step;
        xpwr1 *= xpwr_step;
        xpwr2 *= xpwr_step;
    }

    /* xpwr0 equals x^i here; finish the remaining terms one at a time. */
    for (; i <= degree; i++) {
        result0 = result0 + a[i] * xpwr0;
        xpwr0 *= x;
    }
    return result0 + result1 + result2;
}
5.19
/*
 * Prefix sum with 4-way loop unrolling (CS:APP 5.19).
 *
 * Writes p[k] = a[0] + a[1] + ... + a[k] for every k in [0, n).
 * The main loop produces four outputs per iteration, each built from the
 * previous one; a scalar loop handles the last n % 4 elements. Nothing is
 * written when n <= 0.
 */
void psum_4_1a(float a[], float p[], long n) {
    float running = 0; /* sum of every element consumed so far */
    long unrolled_end = n - 3;
    long idx = 0;

    while (idx < unrolled_end) {
        /* Compute all four partial sums before storing any of them. */
        float s0 = running + a[idx];
        float s1 = s0 + a[idx + 1];
        float s2 = s1 + a[idx + 2];
        float s3 = s2 + a[idx + 3];

        p[idx] = s0;
        p[idx + 1] = s1;
        p[idx + 2] = s2;
        p[idx + 3] = s3;

        running = s3;
        idx += 4;
    }

    while (idx < n) {
        running += a[idx];
        p[idx] = running;
        idx++;
    }
}