CS61C (su20) lab09

cs61c_lab9

Exercise 1 - Familiarize Yourself with the SIMD Functions

  1. __m128 _mm_div_ps (__m128 a, __m128 b)

  2. __m128i _mm_max_epi8 (__m128i a, __m128i b)

  3. __m128i _mm_sra_epi16 (__m128i a, __m128i count)

Exercise 2 - Writing SIMD Code

代码如下:

/*
 * Sums every element of vals that is >= 128 using SSE2 intrinsics, repeats
 * that pass OUTER_ITERATIONS times, prints the time measured with clock(),
 * and returns the total accumulated over all passes.
 *
 * vals: array of NUM_ELEMS unsigned values; it is only read.
 * Returns: the accumulated sum as a long long.
 *
 * The vector path is written to agree with the scalar tail for every
 * unsigned input: the threshold test is an unsigned comparison and the
 * partial sums are kept in 64-bit lanes so they cannot wrap at 2^32.
 */
long long int sum_simd(unsigned int vals[NUM_ELEMS]) {
	clock_t start = clock();
	__m128i _127 = _mm_set1_epi32(127);		// This is a vector with 127s in it... Why might you need this?
	long long int result = 0;				   // This is where you should put your final result!
	/* DO NOT DO NOT DO NOT DO NOT WRITE ANYTHING ABOVE THIS LINE. */

	const __m128i zero = _mm_setzero_si128();
	/* 0x80000000 in every lane, built without a signed-overflowing constant. */
	const __m128i sign_bit = _mm_slli_epi32(_mm_set1_epi32(1), 31);
	/*
	 * _mm_cmpgt_epi32 compares SIGNED 32-bit lanes. Flipping the top bit of
	 * both operands maps unsigned order onto signed order, so the compare
	 * below behaves like the unsigned test "v > 127".
	 */
	const __m128i biased_127 = _mm_xor_si128(_127, sign_bit);
	/* Index of the first element that does not fit in a full 4-wide vector. */
	const unsigned int vec_end = NUM_ELEMS / 4 * 4;

	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
		/* Two 64-bit partial sums for this pass. */
		__m128i acc = _mm_setzero_si128();
		for (unsigned int i = 0; i < vec_end; i += 4) {
			__m128i v = _mm_loadu_si128((const __m128i *) &vals[i]);
			/* All-ones in lanes whose value is > 127, zero elsewhere. */
			__m128i keep = _mm_cmpgt_epi32(_mm_xor_si128(v, sign_bit), biased_127);
			/* Lanes that fail the test become 0 and add nothing. */
			__m128i kept = _mm_and_si128(v, keep);
			/* Interleaving with zero widens each 32-bit lane to 64 bits. */
			acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(kept, zero));
			acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(kept, zero));
		}
		unsigned long long lanes[2] = {0, 0};
		_mm_storeu_si128((__m128i *)lanes, acc);
		result += (long long)(lanes[0] + lanes[1]);

		/* Tail case: the last NUM_ELEMS % 4 elements, handled one at a time. */
		for (unsigned int i = vec_end; i < NUM_ELEMS; i++) {
			if (vals[i] >= 128) {
				result += vals[i];
			}
		}
	}
	clock_t end = clock();
	printf("Time taken: %Lf s\n", (long double)(end - start) / CLOCKS_PER_SEC);
	return result;
}

Exercise 3 - Loop Unrolling

代码如下:

/*
 * Helper for sum_simd_unrolled: loads the four unsigned values at p, keeps
 * only those that are >= 128, and adds them to acc, which holds two 64-bit
 * partial sums. Returns the updated accumulator.
 *
 * sign_bit must hold 0x80000000 in every 32-bit lane and biased_127 must be
 * (127 XOR sign_bit) in every lane. _mm_cmpgt_epi32 is a SIGNED compare;
 * flipping the top bit of both operands makes it order values as unsigned.
 */
static inline __m128i sum_simd_unrolled_step(__m128i acc, const unsigned int *p,
                                             __m128i sign_bit, __m128i biased_127) {
	const __m128i zero = _mm_setzero_si128();
	__m128i v = _mm_loadu_si128((const __m128i *) p);
	/* All-ones in lanes whose value is > 127, zero elsewhere. */
	__m128i keep = _mm_cmpgt_epi32(_mm_xor_si128(v, sign_bit), biased_127);
	__m128i kept = _mm_and_si128(v, keep);
	/* Interleaving with zero widens each 32-bit lane to 64 bits. */
	acc = _mm_add_epi64(acc, _mm_unpacklo_epi32(kept, zero));
	acc = _mm_add_epi64(acc, _mm_unpackhi_epi32(kept, zero));
	return acc;
}

/*
 * Same contract as a plain "sum all elements >= 128" loop repeated
 * OUTER_ITERATIONS times, with the SSE2 loop body unrolled four times
 * (16 elements per iteration). Prints the time measured with clock() and
 * returns the total accumulated over all passes.
 *
 * vals: array of NUM_ELEMS unsigned values; it is only read.
 * Returns: the accumulated sum as a long long.
 *
 * The threshold test is an unsigned comparison and partial sums are kept in
 * 64-bit lanes, so the vector path agrees with the scalar tail for every
 * unsigned input.
 */
long long int sum_simd_unrolled(unsigned int vals[NUM_ELEMS]) {
	clock_t start = clock();
	__m128i _127 = _mm_set1_epi32(127);
	long long int result = 0;

	/* 0x80000000 in every lane, built without a signed-overflowing constant. */
	const __m128i sign_bit = _mm_slli_epi32(_mm_set1_epi32(1), 31);
	const __m128i biased_127 = _mm_xor_si128(_127, sign_bit);
	/* End of the region covered by whole 16-element unrolled iterations. */
	const unsigned int unrolled_end = NUM_ELEMS / 16 * 16;
	/* End of the region covered by whole 4-element vectors. */
	const unsigned int vec_end = NUM_ELEMS / 4 * 4;

	for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
		/* Two 64-bit partial sums for this pass. */
		__m128i acc = _mm_setzero_si128();

		/* Main loop: four vectors (16 elements) per iteration. */
		for (unsigned int i = 0; i < unrolled_end; i += 16) {
			acc = sum_simd_unrolled_step(acc, &vals[i], sign_bit, biased_127);
			acc = sum_simd_unrolled_step(acc, &vals[i + 4], sign_bit, biased_127);
			acc = sum_simd_unrolled_step(acc, &vals[i + 8], sign_bit, biased_127);
			acc = sum_simd_unrolled_step(acc, &vals[i + 12], sign_bit, biased_127);
		}

		/* First tail case: up to three remaining whole vectors. */
		for (unsigned int i = unrolled_end; i < vec_end; i += 4) {
			acc = sum_simd_unrolled_step(acc, &vals[i], sign_bit, biased_127);
		}

		unsigned long long lanes[2] = {0, 0};
		_mm_storeu_si128((__m128i *)lanes, acc);
		result += (long long)(lanes[0] + lanes[1]);

		/* Second tail case: the last NUM_ELEMS % 4 elements, one at a time. */
		for (unsigned int i = vec_end; i < NUM_ELEMS; i++) {
			if (vals[i] >= 128) {
				result += vals[i];
			}
		}
	}
	clock_t end = clock();
	printf("Time taken: %Lf s\n", (long double)(end - start) / CLOCKS_PER_SEC);
	return result;
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值