/*
***5.15
*/
A. 画图略
B. 3
C. 1
D. 乘法不在关键路径上,故乘法可以按流水线执行
/*
***5.16
*/
A. 每次要加载两个数据,故至少需要两个周期
B. 循环展开并没有改变关键路径长度
/*
***5.17
*/
A. 加载数据的时间
B. IA32没有足够的寄存器来保存临时变量
/*
***5.18
*/
/*
 * Compute the inner product of vectors x and y and store it in *dest.
 *
 * The main loop is unrolled three ways: the three products of one
 * iteration are summed first (parenthesised) and only then added to the
 * running total, so the accumulator is updated once per three elements.
 * A second loop handles the one or two elements left over.
 *
 * Only the length of x is queried; y is presumably at least as long --
 * verify against callers.
 */
void inner4(vec_ptr x, vec_ptr y, data_t *dest)
{
    long i;
    long length = vec_length(x);
    data_t *xdata = get_vec_start(x);
    data_t *ydata = get_vec_start(y);
    data_t sum = (data_t)0;

    /* Three elements per iteration; stop while a full triple remains. */
    for (i = 0; i < length - 2; i += 3) {
        sum = sum + (xdata[i] * ydata[i]
                     + xdata[i + 1] * ydata[i + 1]
                     + xdata[i + 2] * ydata[i + 2]);
    }

    /* Finish any remaining elements one at a time. */
    for (; i < length; i++) {
        sum = sum + xdata[i] * ydata[i];
    }

    *dest = sum;
}
/*
***5.19
*/
/*
 * Fill the first n bytes of the memory at s with the low-order byte of c,
 * using word-sized (unsigned long) stores for the aligned middle part.
 *
 * Returns s.
 *
 * Every phase is bounded by the number of bytes still to be written, so
 * requests shorter than a word, or shorter than the distance to the next
 * word boundary, never write outside [s, s + n).
 */
void *word_memeset(void *s, int c, size_t n)
{
    const size_t k = sizeof(unsigned long);
    const unsigned char fill = (unsigned char)c;
    unsigned char *schar = (unsigned char *)s;
    unsigned long word;
    unsigned char *wbytes = (unsigned char *)&word;
    size_t i;

    /* Replicate the fill byte into every byte of one word. */
    for (i = 0; i < k; i++) {
        wbytes[i] = fill;
    }

    /* Byte stores until the address is a multiple of k (or n runs out). */
    while (n > 0 && (size_t)schar % k != 0) {
        *schar++ = fill;
        n--;
    }

    /*
     * Word stores while at least one full word remains.  Testing n >= k
     * avoids the unsigned wrap-around of an expression like n - k + 1.
     * NOTE(review): the store goes through an unsigned long lvalue, as
     * word-at-a-time memset implementations conventionally do; the
     * address is word-aligned at this point.
     */
    while (n >= k) {
        *(unsigned long *)schar = word;
        schar += k;
        n -= k;
    }

    /* Byte stores for the tail that is shorter than a word. */
    while (n > 0) {
        *schar++ = fill;
        n--;
    }

    return s;
}
/*
***5.20
*/
/*
 * Evaluate a[0] + a[1]*x + ... + a[degree]*x^degree by direct summation,
 * using five independent accumulators.
 *
 * Lane j (j = 0..4) keeps its own running power of x and its own partial
 * sum; each pass of the main loop consumes five consecutive coefficients
 * and advances every lane's power by x^5.  Coefficients left over after
 * the last full group of five are added one at a time.
 */
double poly(double a[], double x, int degree)
{
    double lane_sum[5] = {0, 0, 0, 0, 0};
    double lane_pow[5];
    double total = a[0];
    long idx = 1;
    int lane;

    /* lane_pow[j] starts at x^(j+1). */
    lane_pow[0] = x;
    for (lane = 1; lane < 5; lane++) {
        lane_pow[lane] = x * lane_pow[lane - 1];
    }

    /* Multiplier that moves each lane's power forward by five terms. */
    const double stride = lane_pow[4];

    while (idx <= degree - 4) {
        for (lane = 0; lane < 5; lane++) {
            lane_sum[lane] += a[idx + lane] * lane_pow[lane];
            lane_pow[lane] *= stride;
        }
        idx += 5;
    }

    /* lane_pow[0] now holds x^idx, the power of the first leftover term. */
    for (; idx <= degree; idx++) {
        total += a[idx] * lane_pow[0];
        lane_pow[0] *= x;
    }

    return total + lane_sum[0] + lane_sum[1] + lane_sum[2]
           + lane_sum[3] + lane_sum[4];
}
/*
 * Evaluate a[0] + a[1]*x + ... + a[degree]*x^degree with a five-way
 * parallel form of Horner's method.
 *
 * The polynomial is split by coefficient index modulo 5:
 *
 *     p(x) = P0(y) + x*P1(y) + x^2*P2(y) + x^3*P3(y) + x^4*P4(y),  y = x^5
 *
 * where Pr collects the coefficients a[r], a[r+5], a[r+10], ...  Each Pr
 * is evaluated by Horner's rule in y with its own accumulator, so the
 * five multiply-add chains are independent of each other.
 *
 * When degree + 1 is not a multiple of five, the highest leftover
 * coefficients are first reduced by ordinary Horner steps in x; that
 * value is exactly the extra leading coefficient of P0.
 *
 * Returns 0.0 for a negative degree (no coefficients).
 */
double polyh(double a[], double x, int degree)
{
    long i;

    if (degree < 0) {
        return 0.0;
    }

    /* Too short for a full group of five: plain Horner. */
    if (degree < 5) {
        double result = a[degree];
        for (i = (long)degree - 1; i >= 0; i--) {
            result = a[i] + x * result;
        }
        return result;
    }

    const double x2 = x * x;
    const double step = x2 * x2 * x;            /* x^5 */

    /* Number of high coefficients that do not fill a group of five. */
    const long extra = ((long)degree + 1) % 5;
    /* Index of the first coefficient of the highest complete group. */
    const long base = (long)degree - extra - 4;

    double acc0 = a[base];
    double acc1 = a[base + 1];
    double acc2 = a[base + 2];
    double acc3 = a[base + 3];
    double acc4 = a[base + 4];

    if (extra > 0) {
        /* Horner in x over a[base + 5 .. degree], then fold into lane 0. */
        double head = a[degree];
        for (i = (long)degree - 1; i > base + 4; i--) {
            head = a[i] + x * head;
        }
        acc0 += head * step;
    }

    /* Five independent Horner chains in x^5. */
    for (i = base - 5; i >= 0; i -= 5) {
        acc0 = a[i] + acc0 * step;
        acc1 = a[i + 1] + acc1 * step;
        acc2 = a[i + 2] + acc2 * step;
        acc3 = a[i + 3] + acc3 * step;
        acc4 = a[i + 4] + acc4 * step;
    }

    /* Recombine: acc0 + x*acc1 + x^2*acc2 + x^3*acc3 + x^4*acc4. */
    return acc0 + x * (acc1 + x * (acc2 + x * (acc3 + x * acc4)));
}
/*
***5.21
*/
/*
 * Prefix sum: p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 *
 * The main loop produces three outputs per iteration.  Each output is
 * formed as last_val plus a sum of the new inputs, so the sums of the
 * new inputs do not depend on last_val and only one addition per output
 * sits on the loop-carried dependency.  A final loop handles the one or
 * two elements left over.
 *
 * Does nothing when n <= 0.
 */
void psum1(float a[], float p[], long int n)
{
    long int i;
    /* Running values must be float: integer temporaries would truncate
     * every partial sum. */
    float last_val, val1, val2, val3;

    if (n <= 0) {
        return;
    }

    p[0] = last_val = a[0];

    for (i = 1; i < n - 2; i += 3) {
        val1 = last_val + a[i];
        val2 = last_val + (a[i] + a[i + 1]);
        val3 = last_val + ((a[i] + a[i + 1]) + a[i + 2]);
        p[i] = val1;
        p[i + 1] = val2;
        p[i + 2] = val3;
        last_val = val3;
    }

    for (; i < n; i++) {
        val1 = last_val + a[i];
        p[i] = val1;
        last_val = val1;
    }
}
/*
***5.22
*/
代入公式 S = 1/((1-α) + α/k) 可得:
方案1加速比为 1.25
方案2加速比为 1.2
故第一种方案比较好