优化小实验

一直以来都在DM642平台下写程序,而自己所做的大部分工作也和优化相关。为了更加清楚地理解TI CCS编译器的优化规则,做了以下试验:

 

分别写了8个相同功能的函数,操作相同,但实现的方式不一样,自然运行的时间也不一样:

----------------------debug mode----------------------------------

combine1(v, &dest)    Time elapse: 0.028967 ms.
combine2(v, &dest)    Time elapse: 0.024751 ms.
combine3(v, &dest)    Time elapse: 0.023541 ms.
combine4(v, &dest)    Time elapse: 0.012635 ms.
combine4p(v, &dest)  Time elapse: 0.012090 ms.
combine5p(v, &dest)  Time elapse: 0.007475 ms.
combine6(v, &dest)    Time elapse: 0.011119 ms.
combine6p(v, &dest)  Time elapse: 0.006823 ms.

 

combine1 > combine2 > combine3 > combine4 > combine4p > combine6 > combine5p > combine6p

 

由于没有开优化,在debug模式下运行,这个结果和自己预期的比较一致。从combine1到combine6p,依次减少了函数的调用和对存储器的访问,以及做了循环的展开等等,自然其耗时越来越短。然而当我把-o3优化打开时,结果令我费解:

------------------release mode---------------------------------------

combine1(v, &dest)   Time elapse: 0.011491 ms.
combine2(v, &dest)   Time elapse: 0.009616 ms.
combine3(v, &dest)   Time elapse: 0.009603 ms.
combine4(v, &dest)   Time elapse: 0.003884 ms.
combine4p(v, &dest) Time elapse: 0.004096 ms.
combine5p(v, &dest) Time elapse: 0.005573 ms.
combine6(v, &dest)   Time elapse: 0.005120 ms.
combine6p(v, &dest) Time elapse: 0.004987 ms.
最值得注意的变化是combine4和combine4p:它们耗时下降的幅度最大,并且成为了耗时最短的函数。

仔细看一下combine4中的核心循环:

 for (i = 0; i < len; i++)
 {
    x = x OPER data[i];
 }

这里并没有任何人为的循环展开。而正是因为如此编译器对它所进行的优化程度甚至超过了人为循环展开的函数。

这使得它一下子成为了最快的函数。而且combine4与combine4p的唯一区别在于一个使用数组,一个用指针。

而在这里也证明了一点:编译器对数组的优化程度大于指针。尽管差别不是太大。

打开profile查看:

0:0x800202c0-0x80020374,combine4,125-143:test.c,  function,  1,1580,1580,1430,1430,7,7,7,7,

0:0x800207b4-0x80020980,combine6,217-239:test.c,  function,  1,2117,2117,2021,2021,12,12,9,9,

0:0x80020344-0x80020354,combine4,138-141:test.c, loop,    124,1102,1102,978,978,5,5,4,4, (循环124次,cache hit 5/(5+4) = 0.56)
0:0x80020880-0x80020898,combine6,227-231:test.c, loop,       60,1041,1041,981,981,6,6,4,4, (循环60次,cache hit 4/10 = 0.4)

 

看来编译器做循环展开比我们人为地做要好。
另外,combine6p做了很大的循环展开,但并没有像想象的那样有几何级数的提速,原因是循环展开到一定程度,内存的访问等待时间成为瓶颈。

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <math.h>
  4 #include <csl.h>
  5 #include <csl_cache.h>
  6 #include <time.h>
  7 #include "myMath.h"
  8 #include "HKY_testTime.h"
  9
/*
 * Reduction performed by the combineN() family: the accumulator starts at
 * IDENT and every element is folded in with OPER. With the values below the
 * functions compute the sum of the vector.
 */
#define IDENT 0
#define OPER +
#define VEC_LEN 1024

/* Element type stored in a vector. */
typedef int data_t;

/* Vector record: an element count plus a pointer to the element storage. */
typedef struct {
    int len;        /* number of elements */
    data_t *data;   /* element storage */
} vec_rec, *vec_ptr;

/* Vector helpers. */
vec_ptr new_vec(int len);
int get_vec_element(vec_ptr v, int index, data_t *dest);
int vec_length(vec_ptr v);

/*
 * Reduction variants under test. All share one signature: fold the elements
 * of v and store the result in *dest.
 */
void combine1(vec_ptr v, data_t *dest);
void combine2(vec_ptr v, data_t *dest);
void combine3(vec_ptr v, data_t *dest);
void combine4(vec_ptr v, data_t *dest);
void combine4p(vec_ptr v, data_t *dest);
void combine5(vec_ptr v, data_t *dest);
void combine5p(vec_ptr v, data_t *dest);
void combine6(vec_ptr v, data_t *dest);
void combine6p(vec_ptr v, data_t *dest);
 26
 27 vec_ptr new_vec(int len)
 28 {
 29     vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec));
 30     if (!result)
 31         {
 32             return NULL;
 33         }
 34     
 35     result->len = len;
 36     
 37     if (len > 0)
 38         {
 39             int i;
 40             data_t *data = (data_t *)malloc(len * sizeof(data_t));
 41             if (!data)
 42                 {
 43                     free((void *)result);
 44                     return NULL;
 45                 }
 46             result->data = data;
 47             
 48             for (i = 0; i < len; i++)
 49             {
 50                 data[i] = i;
 51             }
 52         }
 53     else
 54         {
 55             result->data = NULL;
 56         }
 57     return result;
 58 }
 59
 60 int get_vec_element(vec_ptr v, int index, data_t *dest)
 61 {
 62     if (index < 0 || index >= v->len)
 63         {
 64             return 0;
 65         }
 66     *dest = v->data[index];
 67     return 1;
 68 }
 69
 70 int vec_length(vec_ptr v)
 71 {
 72     return v->len;
 73 }
 74
 75 void combine1(vec_ptr v, data_t *dest)
 76 {
 77     int i;
 78     
 79     *dest = IDENT;
 80
 81     for (i = 0; i < vec_length(v); i++)
 82     {
 83         data_t val;
 84         get_vec_element(v, i, &val);
 85         *dest = *dest OPER val;
 86     }
 87 }
 88
 89 void combine2(vec_ptr v, data_t *dest)
 90 {
 91     int i;
 92     int len;
 93     
 94     *dest = IDENT;
 95     len = vec_length(v);
 96     
 97     for (i = 0; i < len; i++)
 98     {
 99         data_t val;
100         get_vec_element(v, i, &val);
101         *dest = *dest OPER val;
102     }
103 }
104
105 void combine3(vec_ptr v, data_t *dest)
106 {
107     int i;
108     int len;
109     data_t x;
110     
111     *dest = IDENT;
112     x = IDENT;
113     
114     len = vec_length(v);
115     
116     for (i = 0; i < len; i++)
117     {
118         data_t val;
119         get_vec_element(v, i, &val);
120         x = x OPER val;
121     }
122     *dest = x;
123 }
124
125 void combine4(vec_ptr v, data_t *dest)
126 {
127     int i;
128     int len;
129     data_t x;
130     data_t *data;
131     
132     *dest = IDENT;
133     x = IDENT;
134     
135     len = vec_length(v);
136     data = v->data;
137
138     for (i = 0; i < len; i++)
139     {
140         x = x OPER data[i];
141     }
142     *dest = x;
143 }
144
145 void combine4p(vec_ptr v, data_t *dest)
146 {
147     int i;
148     int len;
149     data_t x;
150     data_t *data;
151     data_t *dend;
152     
153     *dest = IDENT;
154     x = IDENT;
155     
156     len = vec_length(v);
157     data = v->data;
158     dend = data + len;
159
160     for (; data < dend; data++)
161     {
162         x = x OPER (*data);
163     }
164     *dest = x;
165 }
166 void combine5(vec_ptr v, data_t *dest)
167 {
168     int i;
169     int len;
170     data_t x;
171     data_t *data;
172     
173     *dest = IDENT;
174     x = IDENT;
175     
176     len = vec_length(v);
177     data = v->data;
178
179     for (i = 0; i < len; i += 2)
180     {
181         x = x OPER data[i];
182         x = x OPER data[i+1];
183     }
184     *dest = x;
185 }
186 void combine5p(vec_ptr v, data_t *dest)
187 {
188     int i;
189     int len;
190     data_t x;
191     data_t *data;
192     data_t *dend;
193     data_t *dlimit;
194     
195     *dest = IDENT;
196     x = IDENT;
197         
198     len = vec_length(v);
199     data = v->data;
200     dend = data + len;
201     dlimit = dend - 7;
202
203     for (; data < dlimit; data += 8)
204     {
205         x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3]
206               OPER data[4] OPER data[5] OPER data[6] OPER data[7];
207     }
208     
209     for (; data < dend; data++)
210     {
211         x = x OPER data[0];
212     }
213
214     *dest = x;
215 }
216
217 void combine6(vec_ptr v, data_t *dest)
218 {
219     int length = vec_length(v);
220     int limit = length - 1;
221     int i;
222     
223     data_t *data = v->data;
224     data_t x0 = IDENT;
225     data_t x1 = IDENT;
226     
227     for (i = 0; i < limit; i += 2)
228     {
229         x0 = x0 OPER data[i];
230         x1 = x1 OPER data[i+1];
231     }
232     
233     for (; i < length; i++)
234     {
235         x0 = x0 OPER data[i];
236     }
237     
238     *dest = x0 OPER x1;
239 }
240         
241 void combine6p(vec_ptr v, data_t *dest)
242 {
243     int i;
244     int len;
245     data_t x;
246     data_t *data;
247     data_t *dend;
248     data_t *dlimit;
249     
250     *dest = IDENT;
251     x = IDENT;
252         
253     len = vec_length(v);
254     data = v->data;
255     dend = data + len;
256     dlimit = dend - 15;
257
258     for (; data < dlimit; data += 16)
259     {
260         x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3] OPER
261                    data[4] OPER data[5] OPER data[6] OPER data[7] OPER
262                    data[8] OPER data[9] OPER data[10] OPER data[11] OPER
263                    data[12] OPER data[13] OPER data[14] OPER data[15];
264     }
265     
266     for (; data < dend; data++)
267     {
268         x = x OPER data[0];
269     }
270
271     *dest = x;
272 }
273
/*
 * CLIP(X, AMIN, AMAX): expression form; yields X limited to [AMIN, AMAX].
 * Arguments may be evaluated more than once, so avoid side effects in them.
 */
#define CLIP(X, AMIN, AMAX) \
    (((X) < (AMIN)) ? (AMIN) : (((X) > (AMAX)) ? (AMAX) : (X)))

/*
 * CLIP1(Y, X, AMIN, AMAX): statement form; assigns X limited to
 * [AMIN, AMAX] to Y. Wrapped in do { } while (0) so it behaves as a single
 * statement (e.g. as the body of an unbraced if/else); terminate the
 * invocation with a semicolon. Arguments may be evaluated more than once.
 */
#define CLIP1(Y, X, AMIN, AMAX)         \
    do {                                \
        if ((X) < (AMIN)) {             \
            (Y) = (AMIN);               \
        } else if ((X) > (AMAX)) {      \
            (Y) = (AMAX);               \
        } else {                        \
            (Y) = (X);                  \
        }                               \
    } while (0)
279
/*
 * Print a fixed-point lookup table: for i = 0 .. 360*8 - 1 the angle is
 * i/8 degrees (converted to radians), and the printed value is
 * sin(angle) * 65536 rounded to an integer, one "temp = <value>" line each.
 *
 * NOTE(review): the function is named genCosTable but the values come from
 * sin(); confirm which one is intended.
 *
 * PI and ROUND are defined here and, being preprocessor macros, remain
 * defined for the rest of the translation unit.
 */
void genCosTable()
{
#define PI (3.1415926)
/*
 * Round to an integer: go up when the fractional part exceeds 0.5, otherwise
 * go down. Built on floor() so that negative inputs (half of the sine
 * values) round correctly; casting a negative value to int truncates toward
 * zero and would shift the result by one.
 */
#define ROUND(x) ((int)floor(x) + ((((x) - floor(x)) > 0.5) ? 1 : 0))
    double temp;
    int temp_d;
    int i;
    double theta;

    for (i = 0; i < 360 * 8; i++)
    {
        theta = (double)(i) * PI / (8 * 180);
        temp = sin(theta) * 65536;
        temp_d = ROUND(temp);
        printf("temp = %d \n", temp_d);
    }

}
298
299 int test_combine()
300 {
301     vec_ptr v = new_vec(128);
302     data_t dest;
303     HKY_CSL_INIT();
304
305     CALL_FUN_TIME(combine1(v, &dest));
306     CALL_FUN_TIME(combine2(v, &dest));
307     CALL_FUN_TIME(combine3(v, &dest));
308     CALL_FUN_TIME(combine4(v, &dest));
309     CALL_FUN_TIME(combine4p(v, &dest));
310     CALL_FUN_TIME(combine5(v ,&dest));
311     CALL_FUN_TIME(combine5p(v, &dest));
312     CALL_FUN_TIME(combine6(v, &dest));
313     CALL_FUN_TIME(combine6p(v, &dest));
314     return 0;
315 }
316
317
#define N 128
/*
 * N x N matrix multiply-accumulate on row-major 16-bit matrices, with the
 * loops nested in i-j-k order: for every (i, j) the dot product of row i of
 * `a` and column j of `b` is accumulated in a 16-bit temporary and then
 * added to c[i][j]. The previous contents of `c` are therefore part of the
 * result.
 *
 * c: output/accumulator matrix, N*N shorts.
 * b: right-hand operand, N*N shorts.
 * a: left-hand operand, N*N shorts.
 */
void mm_ijk(short *c, short *b, short *a)
{
    int row, col, k;

    for (row = 0; row < N; row++) {
        const short *a_row = a + row * N;
        short *c_row = c + row * N;

        for (col = 0; col < N; col++) {
            short acc = 0;

            for (k = 0; k < N; k++) {
                acc += a_row[k] * b[k * N + col];
            }
            c_row[col] += acc;
        }
    }
}
342
/* Program entry point: runs the combine benchmark; always exits with 0. */
int main()
{
    (void)test_combine();   /* the benchmark's status code is not used */

    return 0;
}

--------------------header files-------------------------------------

#ifndef _HKY_TEST_TIME_H_
#define _HKY_TEST_TIME_H_

#include <csl.h>          
#include <csl_timer.h>  
#include <csl_cache.h>
#include <time.h>

//#define USE_CLOCK_FUNC    //if you want to use clock(), open this

/*
 * Timer register values handed to TIMER_config() by HKY_CSL_INIT().
 * NOTE(review): the meaning of the individual ctl bits is not visible here;
 * check the CSL TIMER documentation before changing 0x00000200.
 *
 * NOTE(review): these objects are defined (not merely declared) in a header,
 * so including this header from more than one source file will produce
 * duplicate definitions at link time.
 */
TIMER_Config MyConfig = {
  0x00000200, /* ctl */
  0xFFFFFFFF, /* prd */
  0x00000000  /* cnt */
};
/* Timer handle; assigned from TIMER_open() inside HKY_CSL_INIT(). */
TIMER_Handle myhTimer;
/* Scratch values written by CALL_FUN_TIME(): raw start/end readings and the scaled elapsed time. */
double start_time, end_time, cur_time;

/*
 * One-time setup for the timing macros: initialises the CSL, selects the
 * CACHE_256KCACHE L2 mode, enables caching for the CACHE_EMIFA_CE00 and
 * CACHE_EMIFA_CE01 regions, then opens timer device 0, applies MyConfig,
 * zeroes its counter and starts it. The handle is stored in myhTimer.
 *
 * Wrapped in do { } while (0) so the sequence behaves as one statement;
 * invoke as `HKY_CSL_INIT();`.
 */
#define HKY_CSL_INIT()                              \
    do {                                            \
        CSL_init();                                 \
        CACHE_setL2Mode(CACHE_256KCACHE);           \
        CACHE_enableCaching(CACHE_EMIFA_CE00);      \
        CACHE_enableCaching(CACHE_EMIFA_CE01);      \
        myhTimer = TIMER_open(TIMER_DEV0, 0);       \
        TIMER_config(myhTimer, &MyConfig);          \
        TIMER_setCount(myhTimer, 0);                \
        TIMER_start(myhTimer);                      \
    } while (0)
 

/*
 * CALL_FUN_TIME(fun_arg): evaluate the expression fun_arg once and print
 * "<fun_arg text>\tTime elapse: <t> ms." where <t> is the measured duration.
 *
 * Before timing, CACHE_clean(CACHE_L2ALL, ...) is called so every
 * measurement starts from the same cache state. The raw readings and the
 * scaled result are left in the globals start_time, end_time and cur_time.
 *
 * Two implementations are selectable at compile time:
 *   - default: the hardware timer opened by HKY_CSL_INIT() (counter reset to
 *     0, then read before and after the call);
 *   - USE_CLOCK_FUNC defined: the standard clock() function.
 *
 * NOTE(review): the scale factors 1.33e-5 and 1.67e-6 convert ticks to
 * milliseconds; presumably they are derived from the board's timer and CPU
 * clock rates -- confirm before reusing on other hardware.
 *
 * The expansion is wrapped in do { } while (0) so it behaves as a single
 * statement, and the call text is printed through "%s" so that a '%' in
 * the stringised expression cannot be misread as a conversion.
 */
#ifndef USE_CLOCK_FUNC
#define CALL_FUN_TIME(fun_arg)                                          \
    do {                                                                \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0);                         \
        TIMER_setCount(myhTimer, 0);                                    \
        start_time = TIMER_getCount(myhTimer);                          \
        fun_arg;                                                        \
        end_time = TIMER_getCount(myhTimer);                            \
        cur_time = (end_time - start_time) * 1.33e-5;                   \
        printf("%s\tTime elapse: %f ms.\n", #fun_arg, cur_time);        \
    } while (0)
#else
#define CALL_FUN_TIME(fun_arg)                                          \
    do {                                                                \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0);                         \
        start_time = clock();                                           \
        fun_arg;                                                        \
        end_time = clock();                                             \
        cur_time = (end_time - start_time) * 1.67e-6;                   \
        printf("%s\tTime elapse: %f ms.\n", #fun_arg, cur_time);        \
    } while (0)

#endif //USE_CLOCK_FUNC


#endif //_HKY_TEST_TIME_H_

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值