# Optimizing Program Performance

Implementation of vector abstract data type

/* Element type stored in a vector. */
typedef int data_t;

/*
 * Vector record: an element count together with a pointer to the
 * element array. vec_rec names the record, vec_ptr a pointer to one.
 */
typedef struct {
    long len;       /* number of elements */
    data_t *data;   /* element array */
} vec_rec, *vec_ptr;

/*
 * Select the combining operation at compile time.
 * With COMBIN_SUM defined: identity 0, operator + (sum).
 * Otherwise:               identity 1, operator * (product).
 *
 * The replacement text must not end in ';' because the macros are
 * spliced into the middle of expressions such as `acc OP data[i]`.
 */
#ifdef COMBIN_SUM
#define IDENT 0
#define OP +
#else
#define IDENT 1
#define OP *
#endif

/*Create vector of specified length*/
vec_ptr new_vec(long int len)
{
vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
if(!result)
return NULL; /*Couldn;t allocate storage*/
result->len = len;
/*Allocate array*/
if(len > 0) {
data_t *data = (data_t*)calloc(sizeof(data_t)*len);
if(!data){
free((void*)result);
return NULL;
}
result->data = data;
}else
result->data = NULL;
return result;
}

/*
 * Copy element `index` of v into *dest.
 * Returns 1 on success, or 0 when index is out of bounds
 * (in which case *dest is not written).
 */
int get_vec_element(vec_ptr v, long int index, data_t *dest)
{
    int in_range = (index >= 0) && (index < v->len);

    if (in_range) {
        *dest = v->data[index];
    }
    return in_range;
}

/* Report the number of elements recorded in v (its len field). */
long int vec_length(vec_ptr v)
{
    long int count = (*v).len;
    return count;
}

/*
 * Implementation with maximum use of data abstraction.
 *
 * Combines every element of v with OP, starting from IDENT, and keeps
 * the running result in *dest. vec_length and get_vec_element are both
 * called on every iteration.
 */
void combine1(vec_ptr v, data_t *dest)
{
    long int i;

    *dest = IDENT;
    for (i = 0; i < vec_length(v); i++) {
        data_t val;
        get_vec_element(v, i, &val);
        *dest = *dest OP val;
    }
}

### Eliminating Loop Inefficiencies

for(i=0; i<vec_length(v); i++)

/*Move call to vec_length out of loop*/
void combine2（vec_ptr v, data_t *dest)
{
long int i;
long int length = vec_length(v);

*dest = IDENT;
for(i=0; i<length; i++){
data_t val;
get_vec_element(v,i,&val);
*dest = *dest OP　val;
}
}

### Reducing Procedure Calls

/* Expose the vector's underlying element array (the data field of v). */
data_t *get_vec_start(vec_ptr v)
{
    data_t *first = v->data;
    return first;
}

/*
 * Direct access to vector data.
 *
 * Fetches the element array once via get_vec_start and indexes it
 * directly, so the loop body makes no function calls. The running
 * result is still read from and written to *dest on every iteration.
 */
void combine3(vec_ptr v, data_t *dest)
{
    long int i;
    long int length = vec_length(v);
    data_t *data = get_vec_start(v);

    *dest = IDENT;
    for (i = 0; i < length; i++) {
        *dest = *dest OP data[i];
    }
}

### Eliminating Unneeded Memory References

combine3: data_t= float,OP = *
i in %rdx, data in %rax,  dest in %rbp
.L498:                          Loop:
movss (%rbp), %xmm0         Read product from dest
mulss (%rax,%rdx,4), %xmm0  Multiply product by data[i]
movss %xmm0, (%rbp)         Store product at dest
addq  $1, %rdx              Increment i
cmpq  %rdx, %r12            Compare i : limit
jg    .L498                 if > , goto Loop

从上述的汇编代码中我们可以看到，在第i次迭代的过程中，程序读取位于dest位置的数据，拿它乘以data[i]，然后把乘积存回dest。这里的读写操作是多余的，因为在每次迭代中，读取进来的值和上一次迭代结束后存回去的值是同一个值。所以我们可以拿掉这个不必要的读写操作，得到combine4：

/*Accumulate result in local variable*/
void combine4(vec_ptr v, data_t *dest)
{
long int i;
long int length = vec_length(v);
data_t *data = get_vec_start(v);
data_t acc = IDENT;

for(i=0; i<length; i++){
acc = acc OP data[i];
}
*dest = acc;
}

与combine3相比，在循环的每一次迭代中，我们把内存操作从两个读一个写操作降到了一个读操作：

combine4: data_t = float, OP = *
i in %rdx, data in %rax, limit in %rbp, acc in %xmm0
.L488:                          Loop:
mulss (%rax,%rdx,4), %xmm0  Multiply acc by data[i]
addq  $1, %rdx              Increment i
cmpq  %rdx, %rbp            Compare limit :i
jg    .L488                 if > , goto Loop

### Loop Unrolling

/*
 * Unroll loop by 2.
 *
 * Combines two elements per iteration into a single accumulator, then
 * finishes any leftover element with a second loop. The result is
 * stored to *dest once, after both loops.
 */
void combine5(vec_ptr v, data_t *dest)
{
    long int i;
    long int length = vec_length(v);
    long int limit = length - 1;   /* last index at which data[i+1] is still in range */
    data_t *data = get_vec_start(v);
    data_t acc = IDENT;

    /* Combine 2 elements at a time */
    for (i = 0; i < limit; i += 2) {
        acc = (acc OP data[i]) OP data[i + 1];
    }

    /* Finish any remaining elements */
    for (; i < length; i++) {
        acc = acc OP data[i];
    }
    *dest = acc;
}

### Enhancing Parallelism

/*
 * Unroll loop by 2, 2-way parallelism.
 *
 * Two separate accumulators are kept: one folds in the elements at
 * even indices, the other the elements at odd indices. Any leftover
 * element goes into the first accumulator, and the two are combined
 * with OP into *dest at the end.
 */
void combine6(vec_ptr v, data_t *dest)
{
    long int n = vec_length(v);
    long int pair_end = n - 1;
    data_t *elems = get_vec_start(v);
    data_t even_acc = IDENT;
    data_t odd_acc = IDENT;
    long int k = 0;

    /* Consume elements pairwise */
    while (k < pair_end) {
        even_acc = even_acc OP elems[k];
        odd_acc = odd_acc OP elems[k + 1];
        k += 2;
    }

    /* Pick up whatever is left over */
    while (k < n) {
        even_acc = even_acc OP elems[k];
        k++;
    }

    *dest = even_acc OP odd_acc;
}

• 本文已收录于以下专栏：

举报原因： 您举报文章：程序优化方法——CSAPP 读书笔记 色情 政治 抄袭 广告 招聘 骂人 其他 (最多只允许输入30个字)