// Load images with OpenCV and pass the image data to CUDA for processing
#include<iostream>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<opencv2/imgproc/imgproc.hpp>
#include<stdio.h>
using namespace std;
using namespace cv;
#define NUM_BLOCK 300 // Number of thread blocks
#define NUM_THREAD 64 // Number of threads per block
// Saturating byte-wise addition: for every i in [0, bins),
//     a[i] = min(a[i] + b[i], 255)
//
// Parameters:
//   a        - first operand, overwritten in place with the result
//   b        - second operand, read only
//   bins     - number of bytes to process in a and b
//   nthreads - threads per block used for the launch
//   nblocks  - number of blocks used for the launch
//
// Each thread starts at its flat 1D index and advances by nthreads*nblocks.
// For every element to be handled exactly once, the caller is expected to pass
// the same nthreads/nblocks values it used in the <<<...>>> launch.
__global__ void hello(uchar *a, uchar *b,int bins,int nthreads, int nblocks)
{
    const int idx = blockIdx.x*blockDim.x+threadIdx.x; // Sequential thread index across the blocks
    const int stride = nthreads*nblocks;

    for (int i = idx; i < bins; i += stride) {
        // Index with the loop variable (not idx) so elements beyond the first
        // `stride` bytes are processed and no element is accumulated twice.
        // Add in int: a uchar sum would wrap before it could be clamped.
        const int sum = a[i] + b[i];
        // The sum of two unsigned bytes is never negative, so only the upper
        // bound needs clamping.
        a[i] = (uchar)(sum > 255 ? 255 : sum);
    }
}
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when err is cudaSuccess, false otherwise.
static bool imageCudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Loads test1.jpg and test2.jpg as grayscale images, adds them on the GPU with
// the `hello` kernel and displays the result.
// Returns 0 on success and 1 when loading, validation or any CUDA call fails.
int main()
{
    IplImage* img1 = cvLoadImage("test1.jpg",0);
    IplImage* img2 = cvLoadImage("test2.jpg",0);
    if (img1 == NULL || img2 == NULL) {
        fprintf(stderr, "Failed to load test1.jpg and/or test2.jpg\n");
        if (img1 != NULL) cvReleaseImage(&img1);
        if (img2 != NULL) cvReleaseImage(&img2);
        return 1;
    }

    // Both buffers are processed with the byte count of img1, so the two
    // images must share the same layout; otherwise the copy from img2 would
    // read past the end of its pixel buffer.
    if (img1->width != img2->width || img1->height != img2->height ||
        img1->widthStep != img2->widthStep) {
        fprintf(stderr, "Input images must have identical dimensions\n");
        cvReleaseImage(&img1);
        cvReleaseImage(&img2);
        return 1;
    }

    uchar* a = (uchar*)img1->imageData;
    uchar* b = (uchar*)img2->imageData;
    const int N = img1->height*img1->widthStep; // total bytes, row padding included
    const size_t bytes = (size_t)N*sizeof(uchar);

    uchar *ad = NULL;
    uchar *bd = NULL;
    bool ok = imageCudaOk(cudaMalloc((void**)&ad, bytes), "cudaMalloc for image 1")
           && imageCudaOk(cudaMalloc((void**)&bd, bytes), "cudaMalloc for image 2")
           && imageCudaOk(cudaMemcpy(ad, a, bytes, cudaMemcpyHostToDevice), "upload of image 1")
           && imageCudaOk(cudaMemcpy(bd, b, bytes, cudaMemcpyHostToDevice), "upload of image 2");

    if (ok) {
        dim3 dimGrid(NUM_BLOCK,1,1);   // Grid dimensions
        dim3 dimBlock(NUM_THREAD,1,1); // Block dimensions
        hello<<<dimGrid, dimBlock>>>(ad, bd, N, NUM_THREAD, NUM_BLOCK);
        // A launch does not return an error itself; query it explicitly.
        // The result is copied into img2's pixel buffer, which is why img2 is
        // the image shown below.
        ok = imageCudaOk(cudaGetLastError(), "kernel launch")
          && imageCudaOk(cudaMemcpy(b, ad, bytes, cudaMemcpyDeviceToHost), "download of result");
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(ad);
    cudaFree(bd);

    if (ok) {
        cvNamedWindow("图像显示",CV_WINDOW_AUTOSIZE);
        cvShowImage("图像显示",img2);
        cvWaitKey(0);
        cvDestroyWindow("图像显示");
    }

    cvReleaseImage(&img1);
    cvReleaseImage(&img2);
    return ok ? 0 : 1;
}
// Reference code: computing pi
#include <stdio.h>
#include<windows.h>
#include <cuda.h>
#define NBIN 1000000000 // Number of bins (main passes it to cal_pi and uses 1/NBIN as the step)
#define NUM_BLOCK 300 // Number of thread blocks
#define NUM_THREAD 64 // Number of threads per block
int tid; // Loop index main uses when walking the per-thread partial sums
float pi = 0; // Estimate of pi; written by main and printed there
// Device kernel: adds this thread's share of the terms 4/(1+x^2) to sum[idx].
//
//   sum      - one slot per thread; the kernel only adds to it, so the caller
//              must initialize it
//   nbin     - total number of bins
//   step     - width of one bin
//   nthreads - threads per block, used together with nblocks as the loop stride
//   nblocks  - number of blocks
//
// The thread with flat index idx handles bins idx, idx+stride, idx+2*stride, ...
// below nbin, evaluating the term at each bin's midpoint x = (bin+0.5)*step.
// The 0.5, 4.0 and 1.0 literals are double, so those expressions are evaluated
// in double precision and narrowed to float when stored.
__global__ void cal_pi(float *sum, int nbin, float step, int nthreads, int nblocks) {
    const int idx = blockIdx.x*blockDim.x+threadIdx.x; // Flat thread index across the blocks
    const int stride = nthreads*nblocks;

    int bin = idx;
    while (bin < nbin) {
        const float mid = (bin+0.5)*step;
        sum[idx] = sum[idx] + 4.0/(1.0+mid*mid);
        bin += stride;
    }
}
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when err is cudaSuccess, false otherwise.
static bool piCudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Main routine that executes on the host.
// Launches cal_pi to fill one partial sum per thread, reduces the partial sums
// on the host into the global `pi`, and prints the estimate followed by the
// elapsed wall-clock time in milliseconds.
// Returns 0 on success and 1 when an allocation or CUDA call fails.
int main(void) {
    LARGE_INTEGER frec;
    LARGE_INTEGER strt;
    LARGE_INTEGER ed;
    QueryPerformanceFrequency(&frec);
    QueryPerformanceCounter(&strt);

    dim3 dimGrid(NUM_BLOCK,1,1);   // Grid dimensions
    dim3 dimBlock(NUM_THREAD,1,1); // Block dimensions
    float *sumHost = NULL;         // Host copy of the per-thread partial sums
    float *sumDev = NULL;          // Device array of per-thread partial sums
    float step = 1.0/NBIN;         // Step size
    size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); // Array memory size

    sumHost = (float *)malloc(size); // Allocate array on host
    if (sumHost == NULL) {
        fprintf(stderr, "Failed to allocate %lu bytes on the host\n", (unsigned long)size);
        return 1;
    }

    // Allocate the device array and zero it; cal_pi only adds to its slots.
    bool ok = piCudaOk(cudaMalloc((void **) &sumDev, size), "cudaMalloc")
           && piCudaOk(cudaMemset(sumDev, 0, size), "cudaMemset");

    if (ok) {
        // Do calculation on device
        cal_pi <<<dimGrid, dimBlock>>> (sumDev, NBIN, step, NUM_THREAD, NUM_BLOCK);
        // A launch does not return an error itself; query it explicitly, then
        // retrieve the result from the device into the host array.
        ok = piCudaOk(cudaGetLastError(), "cal_pi launch")
          && piCudaOk(cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost), "download of partial sums");
    }

    if (ok) {
        // Reduce in double: adding thousands of large partial sums into a
        // float accumulator discards low-order digits at every step.
        double total = 0.0;
        for(tid=0; tid<NUM_THREAD*NUM_BLOCK; tid++)
            total += sumHost[tid];
        pi = (float)(total * step);
        // Print results
        printf("PI = %f\n",pi);
    }

    // Cleanup (cudaFree accepts a null pointer)
    free(sumHost);
    cudaFree(sumDev);

    QueryPerformanceCounter(&ed);
    // %e requires a double argument; convert the 64-bit tick counts explicitly.
    printf("%e\n",(double)(ed.QuadPart-strt.QuadPart)*1000.0/(double)frec.QuadPart);
    return ok ? 0 : 1;
}