第五章
5.13
A.如图:
关键路径:(粗线部分)更新%xmm0的加法
B. CPE的下界是浮点加法的延迟界限3.0
C. CPE的下界是整数加法的延迟界限1.0
D.因为浮点乘法不在关键路径上(各次乘法之间没有数据相关,可以流水线化执行),关键路径上只有浮点数加法,所以CPE由浮点加法的延迟决定
5.14
6*1循环展开
/*
 * Inner product of vectors u and v, written with 6x1 loop unrolling:
 * six element products are folded into a single accumulator per pass.
 *
 * u, v  - input vectors (vec_length / get_vec_start are declared elsewhere;
 *         presumably they yield the element count and the data pointer)
 * dest  - receives the accumulated result
 */
void inner4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long n = vec_length(u);
    data_t *a = get_vec_start(u);
    data_t *b = get_vec_start(v);
    data_t acc = (data_t) 0;
    long k = 0;

    /* Main loop: six elements per pass, all feeding the same accumulator. */
    while (k < n - 5) {
        acc += a[k] * b[k];
        acc += a[k + 1] * b[k + 1];
        acc += a[k + 2] * b[k + 2];
        acc += a[k + 3] * b[k + 3];
        acc += a[k + 4] * b[k + 4];
        acc += a[k + 5] * b[k + 5];
        k += 6;
    }

    /* Leftover elements (fewer than six), one at a time. */
    while (k < n) {
        acc += a[k] * b[k];
        k++;
    }

    *dest = acc;
}
A.使用了6*1循环展开后,关键路径上仍然是n个加法操作;整数加法的延迟为1个周期,因此CPE不可能低于1.00。
B.循环展开之前,CPE已经达到了延迟界限设下的限制,循环展开也不能突破这种限制。
5.15
6*6循环展开
/*
 * Inner product of vectors u and v, written with 6x6 loop unrolling:
 * six elements are processed per pass, each feeding its own accumulator.
 *
 * u, v  - input vectors (vec_length / get_vec_start are declared elsewhere;
 *         presumably they yield the element count and the data pointer)
 * dest  - receives the combined result of all six accumulators
 */
void inner4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(u);
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t) 0;
    data_t sum1 = (data_t) 0;
    data_t sum2 = (data_t) 0;
    data_t sum3 = (data_t) 0;
    data_t sum4 = (data_t) 0;
    data_t sum5 = (data_t) 0;

    /*
     * Each accumulator must be updated from ITSELF. Updating sum1..sum5
     * from `sum` would overwrite them every pass (losing earlier products)
     * and would also chain every addition through `sum`.
     */
    for (i = 0; i < length - 5; i += 6) {
        sum  = sum  + udata[i]     * vdata[i];
        sum1 = sum1 + udata[i + 1] * vdata[i + 1];
        sum2 = sum2 + udata[i + 2] * vdata[i + 2];
        sum3 = sum3 + udata[i + 3] * vdata[i + 3];
        sum4 = sum4 + udata[i + 4] * vdata[i + 4];
        sum5 = sum5 + udata[i + 5] * vdata[i + 5];
    }

    /* Leftover elements (fewer than six) go into the first accumulator. */
    for (; i < length; i++) {
        sum = sum + udata[i] * vdata[i];
    }

    *dest = sum + sum1 + sum2 + sum3 + sum4 + sum5;
}
浮点数加法的吞吐量界限为1.00,受到功能单元吞吐量的限制,因此CPE最低只能为1.00
5.16
6*1a循环展开
/*
 * Inner product of vectors u and v, written with 6x1a loop unrolling:
 * the six products of a pass are added together first, and only that
 * partial result is then added to the single accumulator.
 *
 * u, v  - input vectors (vec_length / get_vec_start are declared elsewhere;
 *         presumably they yield the element count and the data pointer)
 * dest  - receives the accumulated result
 */
void inner4(vec_ptr u, vec_ptr v, data_t *dest)
{
    long n = vec_length(u);
    data_t *a = get_vec_start(u);
    data_t *b = get_vec_start(v);
    data_t acc = (data_t)0;
    long k = 0;

    /* Main loop: one accumulator update per six elements. */
    while (k < n - 5) {
        acc += (a[k] * b[k] +
                a[k + 1] * b[k + 1] +
                a[k + 2] * b[k + 2] +
                a[k + 3] * b[k + 3] +
                a[k + 4] * b[k + 4] +
                a[k + 5] * b[k + 5]);
        k += 6;
    }

    /* Leftover elements (fewer than six), one at a time. */
    while (k < n) {
        acc += a[k] * b[k];
        k++;
    }

    *dest = acc;
}
5.17
/*
 * Byte-at-a-time fill: stores the low byte of c into each of the first
 * n bytes starting at s.
 *
 * Returns s.
 */
void* basic_memset(void *s, int c, size_t n) {
    unsigned char *bytes = s;
    const unsigned char fill = (unsigned char) c;

    for (size_t k = 0; k < n; k++) {
        bytes[k] = fill;
    }
    return s;
}
/*
 * Word-at-a-time fill: sets the first n bytes starting at s to the low
 * byte of cs, storing sizeof(unsigned long) bytes per step once the
 * destination is word-aligned.
 *
 * s   - start of the region to fill
 * cs  - fill value; only its low byte is used. It is replicated into every
 *       byte of the word internally, so passing either a plain byte value
 *       (e.g. 0x41) or an already replicated pattern (0x4141...41) gives
 *       the same result.
 * n   - number of bytes to fill
 *
 * Returns s.
 */
void* effective_memset(void *s, unsigned long cs, size_t n) {
    const size_t K = sizeof(unsigned long);
    const unsigned char byte = (unsigned char)cs;
    /* ~0UL / 0xFF is 0x0101...01; multiplying copies `byte` into each byte. */
    const unsigned long word = (unsigned long)byte * (~0UL / 0xFF);
    size_t cnt = 0;
    unsigned char *schar = s;

    /* Head: single bytes until the address is a multiple of K (or n runs out). */
    while (cnt < n && (size_t)schar % K != 0) {
        *schar++ = byte;
        cnt++;
    }

    /* Body: whole words. */
    unsigned long *slong = (unsigned long *)schar;
    size_t rest = n - cnt;
    size_t loop = rest / K;
    size_t tail = rest % K;
    for (size_t i = 0; i < loop; i++) {
        *slong++ = word;
    }

    /* Tail: the remaining (fewer than K) bytes. */
    schar = (unsigned char *)slong;
    for (size_t i = 0; i < tail; i++) {
        *schar++ = byte;
    }
    return s;
}
5.18
/*
 * Direct polynomial evaluation:
 *   a[0] + a[1]*x + a[2]*x^2 + ... + a[degree]*x^degree
 * keeping a running power of x alongside the running sum.
 */
double poly(double a[], double x, long degree) {
    double total = a[0];
    double power = x;   /* x^k for the term about to be added */
    long k = 1;

    while (k <= degree) {
        total += a[k] * power;
        power *= x;
        k++;
    }
    return total;
}
/*
 * Polynomial evaluation with the main loop unrolled by six terms and
 * spread over three accumulators (two terms each per pass).
 * Computes a[0] + a[1]*x + ... + a[degree]*x^degree.
 */
double poly_6_3a(double a[], double x, long degree) {
    /* Powers of x, built by successive multiplication. */
    const double x2 = x * x;
    const double x3 = x2 * x;
    const double x4 = x3 * x;
    const double x5 = x4 * x;
    const double x6 = x5 * x;

    double acc_a = a[0];
    double acc_b = 0;
    double acc_c = 0;

    /* At the top of a pass starting at k: x^k, x^(k+2), x^(k+4). */
    double pw_a = x;
    double pw_b = x3;
    double pw_c = x5;

    long k = 1;

    while (k <= degree - 6) {
        acc_a = acc_a + (a[k] * pw_a + a[k + 1] * pw_a * x);
        acc_b = acc_b + (a[k + 2] * pw_b + a[k + 3] * pw_b * x);
        acc_c = acc_c + (a[k + 4] * pw_c + a[k + 5] * pw_c * x);
        pw_a *= x6;
        pw_b *= x6;
        pw_c *= x6;
        k += 6;
    }

    /* Remaining terms one by one; pw_a holds x^k at this point. */
    while (k <= degree) {
        acc_a = acc_a + a[k] * pw_a;
        pw_a *= x;
        k++;
    }

    return acc_a + acc_b + acc_c;
}
/*
 * Polynomial evaluation by Horner's method:
 *   a[0] + x*(a[1] + x*(a[2] + ... + x*a[degree]))
 * working from the highest coefficient down to a[0].
 */
double polyh(double a[], double x, long degree) {
    double acc = a[degree];
    long k = degree;

    while (k > 0) {
        k--;
        acc = a[k] + x * acc;
    }
    return acc;
}
5.19
/*
 * Prefix sum: p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 * The running total is kept in a local so each step reads only a[i].
 * Element 0 is read and written unconditionally.
 */
void psum1a(float a[], float p[], long n) {
    float running = a[0];

    p[0] = running;
    for (long k = 1; k < n; k++) {
        running = running + a[k];
        p[k] = running;
    }
}
/*
 * Prefix sum with the main loop unrolled by four:
 *   p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 *
 * Per pass, the four outputs are built as a chain from last_val, while
 * last_val itself is advanced with a separately grouped sum of the four
 * inputs, so its update does not wait on the tmp chain.
 *
 * a  - input array of n floats
 * p  - output array of n floats (may be the same array as a)
 * n  - element count; nothing is read or written when n <= 0
 */
void psum_4_1a(float a[], float p[], long n) {
    long i;
    float last_val;
    float tmp, tmp1, tmp2, tmp3;

    if (n <= 0) {
        return;
    }

    last_val = p[0] = a[0];

    /*
     * Step by 4: each pass consumes a[i..i+3] and advances last_val by all
     * four, so the index must advance by four as well. The bound keeps
     * i + 3 inside the array.
     */
    for (i = 1; i < n - 3; i += 4) {
        /* Load inputs before any store so in-place use (a == p) is safe. */
        float x0 = a[i];
        float x1 = a[i + 1];
        float x2 = a[i + 2];
        float x3 = a[i + 3];

        tmp = last_val + x0;
        tmp1 = tmp + x1;
        tmp2 = tmp1 + x2;
        tmp3 = tmp2 + x3;

        p[i] = tmp;
        p[i + 1] = tmp1;
        p[i + 2] = tmp2;
        p[i + 3] = tmp3;

        last_val = last_val + (x0 + x1 + x2 + x3);
    }

    /* Leftover elements (fewer than four), one at a time. */
    for (; i < n; i++) {
        last_val += a[i];
        p[i] = last_val;
    }
}