Thrust is a powerful and easy-to-use library that uses NVIDIA GPUs to improve the performance of programs that process large amounts of memory and computation.
Thrust is largely self-contained: its function APIs usually expect Thrust's own device pointer (device_ptr) or a device_vector as input arguments.
It is therefore sometimes painful to use Thrust with native CUDA raw pointers allocated by cudaMalloc.
My previous blog post has a short discussion of the conversion between CUDA raw pointers and Thrust's device pointers:
http://blog.csdn.net/niuqingpeng/article/details/12489807
This blog post gives a short, self-contained example of how that conversion helps; it is also an example of how to use Thrust's reduce_by_key API.
#include<cuda.h>
#include<stdio.h>
#include<thrust/device_ptr.h>
#include<thrust/reduce.h>
// Runs thrust::reduce_by_key on raw device pointers.
//
// The raw pointers are wrapped in thrust::device_ptr so that Thrust treats
// them as device iterators.
//
// Parameters:
//   d_key, d_key_last  raw pointers delimiting the input key range
//                      [d_key, d_key_last)
//   d_value            raw pointer to the first input value
//   d_okey             raw pointer receiving the output keys
//   d_ovalue           raw pointer receiving the output values
//
// Returns the number of keys written to d_okey, computed as the distance
// between the returned end-of-keys iterator and the start of d_okey.
//
// NOTE: Thrust documents that the input ranges must not overlap either output
// range, so d_okey / d_ovalue should not alias the inputs.
template <typename Key, typename Value>
int reduce_by_key_with_raw_pointers(Key* d_key, Key* d_key_last, Value* d_value,
                                    Key* d_okey, Value* d_ovalue) {
  typedef thrust::device_ptr<Key> KeyPtr;
  typedef thrust::device_ptr<Value> ValuePtr;

  // Wrap every raw pointer; constructing a device_ptr does not copy any data.
  const KeyPtr keys_begin(d_key);
  const KeyPtr keys_end(d_key_last);
  const ValuePtr values_begin(d_value);
  const KeyPtr out_keys(d_okey);
  const ValuePtr out_values(d_ovalue);

  // reduce_by_key returns the ends of the key and value ranges it wrote.
  const thrust::pair<KeyPtr, ValuePtr> out_ends = thrust::reduce_by_key(
      keys_begin, keys_end, values_begin, out_keys, out_values);

  return static_cast<int>(out_ends.first - out_keys);
}
// Prints `msg` on its own line, then the first `count` ints stored at the raw
// device pointer `d_array`, each followed by a tab, then a newline.
//
// The raw pointer is wrapped in a thrust::device_ptr so the elements can be
// read from host code; each read goes through Thrust's device reference.
void output_device_array_with_raw_pointer(int* d_array, int count, const char* msg) {
  printf("%s\n", msg);

  const thrust::device_ptr<int> elements(d_array);
  int idx = 0;
  while (idx < count) {
    // Converting the device reference to int fetches the element's value.
    const int value = *(elements + idx);
    printf("%d\t", value);
    ++idx;
  }

  printf("\n");
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_call_ok(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Demo driver: uploads a small key/value data set with cudaMalloc/cudaMemcpy,
// prints it, reduces the values by key through the raw-pointer wrapper, and
// prints the compacted result.
//
// Returns 0 on success and 1 if any CUDA runtime call fails.
int main() {
  const int N = 7;
  const size_t bytes = N * sizeof(int);
  int A[N] = {1, 3, 3, 3, 2, 2, 1};  // input keys
  int B[N] = {9, 8, 7, 6, 5, 4, 3};  // input values

  int* d_A = NULL;       // device keys
  int* d_B = NULL;       // device values
  int* d_okey = NULL;    // scratch output keys
  int* d_ovalue = NULL;  // scratch output values

  // The && chain stops at the first failing call, so later calls never run
  // on a pointer whose allocation failed.
  bool ok =
      cuda_call_ok(cudaMalloc((void**)&d_A, bytes), "cudaMalloc(d_A)") &&
      cuda_call_ok(cudaMalloc((void**)&d_B, bytes), "cudaMalloc(d_B)") &&
      cuda_call_ok(cudaMalloc((void**)&d_okey, bytes), "cudaMalloc(d_okey)") &&
      cuda_call_ok(cudaMalloc((void**)&d_ovalue, bytes), "cudaMalloc(d_ovalue)") &&
      cuda_call_ok(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_A <- A)") &&
      cuda_call_ok(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(d_B <- B)");

  if (ok) {
    output_device_array_with_raw_pointer(d_A, N, "d_A");
    output_device_array_with_raw_pointer(d_B, N, "d_B");

    // thrust::reduce_by_key requires that the input ranges do not overlap the
    // output ranges, so reduce into scratch buffers instead of in place.
    const int n = reduce_by_key_with_raw_pointers<int, int>(
        d_A, d_A + N, d_B, d_okey, d_ovalue);

    // Copy the compacted result back so d_A / d_B hold the reduced data.
    ok = cuda_call_ok(cudaMemcpy(d_A, d_okey, n * sizeof(int),
                                 cudaMemcpyDeviceToDevice),
                      "cudaMemcpy(d_A <- d_okey)") &&
         cuda_call_ok(cudaMemcpy(d_B, d_ovalue, n * sizeof(int),
                                 cudaMemcpyDeviceToDevice),
                      "cudaMemcpy(d_B <- d_ovalue)");

    if (ok) {
      output_device_array_with_raw_pointer(d_A, n, "d_A");
      output_device_array_with_raw_pointer(d_B, n, "d_B");
      // The first four keys in d_A are now {1, 3, 2, 1} and n is 4.
      // The first four values in d_B are now {9, 21, 9, 3}.
    }
  }

  // Release device memory on every path; cudaFree accepts a NULL pointer.
  cudaFree(d_ovalue);
  cudaFree(d_okey);
  cudaFree(d_B);
  cudaFree(d_A);

  return ok ? 0 : 1;
}
The output will be:
d_A
1 3 3 3 2 2 1
d_B
9 8 7 6 5 4 3
d_A
1 3 2 1
d_B
9 21 9 3