- 5.13
A:略
B:未从xmm1使用上一次的数据,所以xmm1肯定不是关键路径上的寄存器。xmm0位于关键路径上,浮点数加法的延迟为3,所以CPE下界为3。
C:整数数加法的延迟为1,所以CPE下界为1。
D:首先,浮点数乘法可以与关键路径并行进行,且有多个浮点数乘法功能单元,多个加载器,所以只要配置够强大,浮点数乘法不能成为关键路径的阻碍。而关键路径浮点数加法的延迟为3。 - 5.14
A:因为只有两个加载器,而每个元素的循环就需要加载两个数值。
B:即使进行了6x1循环展开,但是还是要依次经行6次浮点加法,算下来单个元素还是需要3个时钟周期。 - 5.15
只有两个加载器,而每个元素的循环就需要加载两个数值。 - 5.16
- 5.17
include <limits.h>
#define K sizeof(unsigned long)
void *word_memset(void *s, int c, size_t n)
{
if (n < K)
{
size_t cnt = 0;
unsigned char *schar = s;
while (cnt < n)
{
*schar++ = (unsigned char)c;
cnt++;
}
}
else
{
unsigned long word = 0;
for (int i = 0; i < K; ++i)
{
word <<= K*CHAR_BIT;
word += (unsigned char)c;
}
size_t cnt = 0;
unsigned long *slong = s;
while (cnt < n)
{
*slong++ = word;
cnt += K;
}
unsigned char *schar = slong;
while (cnt < n)
{
*schar++ = (unsigned char)c;
cnt++;
}
}
return s;
}
- 5.18
double faster_poly(double a[], double x, long degree)
{
long i;
double result1 = a[0];
double result2 = 0;
double result3 = 0;
double result4 = 0;
double result5 = 0;
double result6 = 0;
double result7 = 0;
double result8 = 0;
double result9 = 0;
double result10 = 0;
double xpwr1 = x;
double xpwr2 = xpwr1 * x;
double xpwr3 = xpwr2 * x;
double xpwr4 = xpwr3 * x;
double xpwr5 = xpwr4 * x;
double xpwr6 = xpwr5 * x;
double xpwr7 = xpwr6 * x;
double xpwr8 = xpwr7 * x;
double xpwr9 = xpwr8 * x;
double xpwr10 = xpwr9 * x;
double x10 = xpwr10;
for (i = 1; (i+9) <= degree; i += 10)
{
result1 += a[i] * xpwr1;
result2 += a[i+1] * xpwr2;
result3 += a[i+2] * xpwr3;
result4 += a[i+3] * xpwr4;
result5 += a[i+4] * xpwr5;
result6 += a[i+5] * xpwr6;
result7 += a[i+6] * xpwr7;
result8 += a[i+7] * xpwr8;
result9 += a[i+8] * xpwr9;
result10 += a[i+9] * xpwr10;
xpwr1 *= x10;
xpwr2 *= x10;
xpwr3 *= x10;
xpwr4 *= x10;
xpwr5 *= x10;
xpwr6 *= x10;
xpwr7 *= x10;
xpwr8 *= x10;
xpwr9 *= x10;
xpwr10 *= x10;
}
for (; i <= degree; ++i)
{
result1 += a[i] * xpwr1;
xpwr1 *= x;
}
result1 += result2;
result1 += result3;
result1 += result4;
result1 += result5;
result1 += result6;
result1 += result7;
result1 += result8;
result1 += result9;
result1 += result10;
return result1;
}
如果是整型,编码器能够自行将程序有化成这样的代码
5.19(我暂时还没搞明白)
void faster_psum1a(float a[], float p[], long n)
{
long i;
float val = 0;
for (i = 0; (i+2) < n; i += 3)
{
float tmp1 = a[i];
float tmp2 = tmp1 + a[i+1];
float tmp3 = tmp2 + a[i+2];
p[i] = var + tmp1;
p[i+1] = var + tmp2;
p[i+2] = var = var + tmp3;
}
for (; i < n; ++i)
{
var += a[i];
p[i] = var;
}
代码都引用自https://www.cnblogs.com/liqiuhao/p/7989634.html