使用OpenMP编写并行化的计数排序

最新推荐文章于 2023-11-29 04:34:06 发布

EricGeorge

最新推荐文章于 2023-11-29 04:34:06 发布

阅读量1.7k

点赞数 5

分类专栏：并行计算

本文链接：https://blog.csdn.net/yaoyaoyao2000/article/details/107135530

版权

并行计算专栏收录该内容

2 篇文章 0 订阅

订阅专栏

背景

计数排序（Count Sort）基本思想是对于列表a中的每个元素a[i]，计算小于a[i]的元素个数，将a[i]插入到由count决定的列表下标位置中，算法结束后，用临时列表覆盖原始列表。

问题

如果我们试图并行化外层循环，哪些变量为private，哪些变量为shared？

答：a, n, temp为shared；i, j, count为private。

#  pragma omp parallel for num_threads(thread_count) \
    default(none) shared(a, n, temp) private(i, j, count)\
    schedule(static, 2)

是否存在循环携带的数据依赖性？为什么？

答：不存在。由于计数排序只是计算小于a[i]的元素个数，外层循环的每次迭代只读取数组a，并把a[i]写入temp中各不相同的位置，迭代之间互不影响，所以不存在循环携带的数据依赖性。

编写并行化的Count_sort。

全部源代码请见github

答：只需要在串行版本的Count_sort的外层for循环之前加上一句OpenMP的parallel for预处理指令即可。
并行化的Count_sort函数的源代码如下：

/*-----------------------------------------------------------------
 * Function:     Count_sort_parallel
 * Purpose:      Sort list into nondecreasing order using Count sort,
 *               with the iterations of the outer loop divided among
 *               OpenMP threads
 * In args:      n:  number of elements in a
 * In/out args:  a:  the list; overwritten with the sorted result
 * Globals in:   thread_count:  number of threads requested
 * Errors:       Prints a message to stderr and exits if the
 *               temporary list can't be allocated
 * Note:         A list with fewer than two elements is already
 *               sorted and is returned unchanged.
 */
void Count_sort_parallel(int a[], int n) {
	int i,j,count;
	int *temp;

	if (n < 2)
		return;

	temp = malloc((size_t)n * sizeof *temp);
	if (temp == NULL) {
		fprintf(stderr, "Count_sort_parallel: can't allocate temporary list\n");
		exit(EXIT_FAILURE);
	}

	/* Each iteration only reads a and stores into one element of temp
	 * that no other iteration stores into (equal values are ordered by
	 * index), so the iterations need no synchronization. */
#  pragma omp parallel for num_threads(thread_count) \
   default(none) shared(a, n, temp) private(i, j, count)\
   schedule(static, 2)
	for(i=0; i<n; i++) {
		count = 0;
		for(j=0; j<n; j++) {
			/* a[j] goes before a[i] if it is smaller, or if it is
			 * equal and has a lower index */
			if(a[j]<a[i] || (a[j] == a[i] && j<i))
				count++;
		}
		temp[count] = a[i];
	}

	memcpy(a, temp, (size_t)n * sizeof *temp);
	free(temp);
}/* Count_sort_parallel */

程序的main函数的源代码如下:

/*-----------------------------------------------------------------
 * Function:  main
 * Purpose:   Sort three copies of the same list with the parallel
 *            count sort, the serial count sort and qsort, and print
 *            the elapsed time of each sort
 * Note:      All three sorts are timed with omp_get_wtime so that
 *            the printed times are measured the same way.
 */
int main(int argc, char* argv[]) {
	int  n;
	char g_i;
	int* a;
	int* b;
	int* c;
	double start, finish;

	Get_args(argc, argv, &n, &g_i);
	a = malloc((size_t)n * sizeof *a);
	b = malloc((size_t)n * sizeof *b);
	c = malloc((size_t)n * sizeof *c);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "%s: can't allocate three lists of %d ints\n",
				argv[0], n);
		free(a);
		free(b);
		free(c);
		return EXIT_FAILURE;
	}

	if (g_i == 'g') {
		Generate_list(a, n);
#     ifdef DEBUG
		Print_list(a, n, "Before sort");
#     endif
	} else {
		Read_list(a, n);
	}

	/* Give every sort its own copy of the unsorted input */
	memcpy(b, a, (size_t)n * sizeof *b);
	memcpy(c, a, (size_t)n * sizeof *c);

	start = omp_get_wtime();
	Count_sort_parallel(a, n);
	finish = omp_get_wtime();
	printf("Parallel count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(a, n, "After sort");
#  endif

	start = omp_get_wtime();
	Count_sort_serial(b, n);
	finish = omp_get_wtime();
	printf("Serial count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(b, n, "After sort");
#  endif

	start = omp_get_wtime();
	qsort(c, n, sizeof(int), cmp);
	finish = omp_get_wtime();
	printf("qsort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(c, n, "After sort");
#  endif

	free(a);
	free(b);
	free(c);
	return 0;
}  /* main */

并行化的Count_sort与串行化的Count_sort相比，性能如何？

答：./omp后面的第一个数字是线程数、第二个数字是待排序的数组大小、第三个字母是数组的输入方式。如图1所示，当所排序的数组大小很小的时候，并行排序的时间反而比串行排序的时间长，但是当所排序的数组大小达到10^3数量级的时候，只要线程数大于1，并行的Count_sort比串行的Count_sort要快3倍以上。
两者的排序速度受待排序列表的数量级的影响，主要是因为并行排序存在着fork和join的时间开销。当待排序列表很小的时候，这种开销甚至会增加程序的运行时间，但是当待排序列表很大的时候，这种开销相比于整体排序时间就微不足道了，所以串行的Count_sort比并行的Count_sort慢3倍以上。
在这里插入图片描述

并行化的Count_sort与串行化的qsort库函数相比，性能如何？

答：如图1所示，由于Count_sort的时间复杂度是O(n²)，而qsort的时间复杂度是O(nlgn)，所以无论线程数或者是待排序的数组的大小怎么变化，qsort的运行时间总是要比并行化的Count_sort的运行时间要低。当待排序的数组的数据量小于10³数量级时，qsort的排序时间要比并行化的Count_sort的排序时间要低10到100倍；当待排序的数组的数据量大于10³数量级时，qsort的排序时间要比并行化的Count_sort的排序时间要低10³数量级。

程序运行方法

运行环境：Ubuntu16.04
在控制台中输入以下命令即可编译：

gcc -g -Wall -fopenmp -I. -o omp omp.c

输入以下命令即可运行：

./omp <thread count> <n> <g|i>

其中：

n是待排序列表的元素个数
g是通过随机数生成器生成的待排序列表
i是用户输入的列表

源代码附后

/* File:    omp.c
 *
 * Purpose: Compare parallel count sort, serial count sort, qsort.
 *
 * Compile: gcc -g -Wall -fopenmp -I. -o omp omp.c
 * Usage:   ./omp <thread count> <n> <g|i>
 *             n:   number of elements in list
 *            'g':  generate list using a random number generator
 *            'i':  user input list
 *
 * Input:   list (optional)
 * Output:  elapsed time for parallel count sort, serial count sort,
 *			qsort.
 *
 * Note:
 * 1.  DEBUG flag prints the contents of the list
 * 2.  This version forks and joins the threads only once.
 * 3.  Uses the OpenMP library function omp_get_wtime for timing.
 *     This function returns the number of seconds since some time
 *     in the past.
 *
 * IPP:  Section 5.6.2 (pp. 235 and ff.)
 */
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <string.h>
#include <time.h>

/* Generated list elements are random() % RMAX (see Generate_list), so
 * they lie in the range 0..RMAX-1.  The bound is much smaller when
 * DEBUG is defined, which is also when the lists are printed. */
#ifdef DEBUG
const int RMAX = 100;
#else
const int RMAX = 10000000;
#endif

/* Number of threads requested on the command line; set by Get_args and
 * passed to the num_threads clause in Count_sort_parallel */
int thread_count;

/* Command line handling */
void Usage(char* prog_name);
void Get_args(int argc, char* argv[], int* n_p, char* g_i_p);
/* List creation and output */
void Generate_list(int a[], int n);
void Print_list(int a[], int n, char* title);
void Read_list(int a[], int n);
/* The sorts being compared, and the comparison function given to qsort */
void Count_sort_parallel(int a[], int n);
void Count_sort_serial(int a[], int n);
int cmp(const void * a, const void *b);

/*-----------------------------------------------------------------
 * Function:  main
 * Purpose:   Sort three copies of the same list with the parallel
 *            count sort, the serial count sort and qsort, and print
 *            the elapsed time of each sort
 * Note:      All three sorts are timed with omp_get_wtime so that
 *            the printed times are measured the same way.
 */
int main(int argc, char* argv[]) {
	int  n;
	char g_i;
	int* a;
	int* b;
	int* c;
	double start, finish;

	Get_args(argc, argv, &n, &g_i);
	a = malloc((size_t)n * sizeof *a);
	b = malloc((size_t)n * sizeof *b);
	c = malloc((size_t)n * sizeof *c);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "%s: can't allocate three lists of %d ints\n",
				argv[0], n);
		free(a);
		free(b);
		free(c);
		return EXIT_FAILURE;
	}

	if (g_i == 'g') {
		Generate_list(a, n);
#     ifdef DEBUG
		Print_list(a, n, "Before sort");
#     endif
	} else {
		Read_list(a, n);
	}

	/* Give every sort its own copy of the unsorted input */
	memcpy(b, a, (size_t)n * sizeof *b);
	memcpy(c, a, (size_t)n * sizeof *c);

	start = omp_get_wtime();
	Count_sort_parallel(a, n);
	finish = omp_get_wtime();
	printf("Parallel count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(a, n, "After sort");
#  endif

	start = omp_get_wtime();
	Count_sort_serial(b, n);
	finish = omp_get_wtime();
	printf("Serial count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(b, n, "After sort");
#  endif

	start = omp_get_wtime();
	qsort(c, n, sizeof(int), cmp);
	finish = omp_get_wtime();
	printf("qsort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(c, n, "After sort");
#  endif

	free(a);
	free(b);
	free(c);
	return 0;
}  /* main */


/*-----------------------------------------------------------------
 * Function:  Usage
 * Purpose:   Print a summary of the command line to stderr
 * In arg:    prog_name:  name the program was started with
 */
void Usage(char* prog_name) {
	fprintf(stderr, "usage:   %s <thread count> <n> <g|i>\n", prog_name);
	/* The remaining lines have nothing to format */
	fputs("   n:   number of elements in list\n", stderr);
	fputs("  'g':  generate list using a random number generator\n", stderr);
	fputs("  'i':  user input list\n", stderr);
}  /* Usage */


/*-----------------------------------------------------------------
 * Function:    Get_args
 * Purpose:     Get and check command line arguments
 * In args:     argc, argv
 * Out args:    n_p:    number of elements in the list
 *              g_i_p:  'g' (generate the list) or 'i' (read the list)
 * Globals out: thread_count:  number of threads to use
 * Errors:      If there aren't exactly three arguments, if the
 *              thread count or n isn't positive, or if the third
 *              argument doesn't start with 'g' or 'i', the usage
 *              summary is printed and the program exits.
 */
void Get_args(int argc, char* argv[], int* n_p, char* g_i_p) {
	if (argc != 4 ) {
		Usage(argv[0]);
		exit(0);
	}
	thread_count = strtol(argv[1], NULL, 10);
	*n_p = strtol(argv[2], NULL, 10);
	*g_i_p = argv[3][0];

	/* strtol returns 0 for text that isn't a number, so the <= 0 tests
	 * also reject non-numeric arguments.  The thread count has to be
	 * positive because it is passed to a num_threads clause. */
	if (thread_count <= 0 || *n_p <= 0
			|| (*g_i_p != 'g' && *g_i_p != 'i') ) {
		Usage(argv[0]);
		exit(0);
	}
}  /* Get_args */


/*-----------------------------------------------------------------
 * Function:  Generate_list
 * Purpose:   Fill the list with pseudo-random values, each taken
 *            modulo RMAX
 * In args:   n:  number of elements to generate
 * Out args:  a:  the generated list
 * Note:      The generator is always seeded with 1.
 */
void Generate_list(int a[], int n) {
	int filled = 0;

	srandom(1);
	while (filled < n) {
		a[filled] = random() % RMAX;
		filled++;
	}
}  /* Generate_list */


/*-----------------------------------------------------------------
 * Function:  Print_list
 * Purpose:   Print a title line followed by the elements of the
 *            list on one line, then a blank line
 * In args:   a:      the list
 *            n:      number of elements in a
 *            title:  text printed before the elements
 */
void Print_list(int a[], int n, char* title) {
	int pos = 0;

	printf("%s:\n", title);
	while (pos < n) {
		printf("%d ", a[pos]);
		pos++;
	}
	printf("\n\n");
}  /* Print_list */


/*-----------------------------------------------------------------
 * Function:  Read_list
 * Purpose:   Read elements of list from stdin
 * In args:   n:  number of elements to read
 * Out args:  a:  the list that was read
 * Errors:    If fewer than n integers can be read (end of input or
 *            text that isn't an integer), prints a message to stderr
 *            and exits, so that an incompletely filled list is never
 *            sorted.
 */
void Read_list(int a[], int n) {
	int i;

	printf("Please enter the elements of the list\n");
	for (i = 0; i < n; i++) {
		/* scanf returns the number of items it stored: 1 on success */
		if (scanf("%d", &a[i]) != 1) {
			fprintf(stderr,
					"Read_list: expected %d integers but could only read %d\n",
					n, i);
			exit(EXIT_FAILURE);
		}
	}
}  /* Read_list */


/*-----------------------------------------------------------------
 * Function:     Count_sort_parallel
 * Purpose:      Sort list into nondecreasing order using Count sort,
 *               with the iterations of the outer loop divided among
 *               OpenMP threads
 * In args:      n:  number of elements in a
 * In/out args:  a:  the list; overwritten with the sorted result
 * Globals in:   thread_count:  number of threads requested
 * Errors:       Prints a message to stderr and exits if the
 *               temporary list can't be allocated
 * Note:         A list with fewer than two elements is already
 *               sorted and is returned unchanged.
 */
void Count_sort_parallel(int a[], int n) {
	int i,j,count;
	int *temp;

	if (n < 2)
		return;

	temp = malloc((size_t)n * sizeof *temp);
	if (temp == NULL) {
		fprintf(stderr, "Count_sort_parallel: can't allocate temporary list\n");
		exit(EXIT_FAILURE);
	}

	/* Each iteration only reads a and stores into one element of temp
	 * that no other iteration stores into (equal values are ordered by
	 * index), so the iterations need no synchronization. */
#  pragma omp parallel for num_threads(thread_count) \
   default(none) shared(a, n, temp) private(i, j, count)\
   schedule(static, 2)
	for(i=0; i<n; i++) {
		count = 0;
		for(j=0; j<n; j++) {
			/* a[j] goes before a[i] if it is smaller, or if it is
			 * equal and has a lower index */
			if(a[j]<a[i] || (a[j] == a[i] && j<i))
				count++;
		}
		temp[count] = a[i];
	}

	memcpy(a, temp, (size_t)n * sizeof *temp);
	free(temp);
}/* Count_sort_parallel */


/*-----------------------------------------------------------------
 * Function:     Count_sort_serial
 * Purpose:      Sort list into nondecreasing order using Count sort
 * In args:      n:  number of elements in a
 * In/out args:  a:  the list; overwritten with the sorted result
 * Errors:       Prints a message to stderr and exits if the
 *               temporary list can't be allocated
 * Note:         A list with fewer than two elements is already
 *               sorted and is returned unchanged.
 */
void Count_sort_serial(int a[], int n) {
	int i,j,count;
	int *temp;

	if (n < 2)
		return;

	temp = malloc((size_t)n * sizeof *temp);
	if (temp == NULL) {
		fprintf(stderr, "Count_sort_serial: can't allocate temporary list\n");
		exit(EXIT_FAILURE);
	}

	for(i=0; i<n; i++) {
		/* count = number of elements that go before a[i]: the smaller
		 * ones, plus the equal ones with a lower index.  Breaking ties
		 * by index gives every element its own position in temp. */
		count = 0;
		for(j=0; j<n; j++) {
			if(a[j]<a[i] || (a[j] == a[i] && j<i))
				count++;
		}
		temp[count] = a[i];
	}

	memcpy(a, temp, (size_t)n * sizeof *temp);
	free(temp);
}/* Count_sort_serial */

/*-----------------------------------------------------------------
 * Function:     cmp
 * Purpose:      Comparison function for qsort: order two ints from
 *               small to big
 * In args:      a, b:  pointers to the two ints being compared
 * Return val:   -1 if *a < *b, 0 if *a == *b, 1 if *a > *b
 * Note:         The values are compared instead of subtracted,
 *               because the difference of two ints can overflow
 *               (for example INT_MAX - (-1)).
 */
int cmp(const void * a, const void *b) {
	const int x = *(const int *)a;
	const int y = *(const int *)b;

	return (x > y) - (x < y);
}  /* cmp */