生成图的效果如下
本次主要是参考 CUDA 的 Sample,将示例中的 OpenGL ES 130 换成我常用的 OpenGL ES 300,并且在 Qt 中实现。
主要包括 CUDA 在 Windows 平台的 Qt 环境中的一些配置,以及 CUDA 访问 OpenGL 纹理在 GPU 中的地址,实现在 GPU 上生成图像并直接拷贝到 OpenGL 的纹理进行显示,不再需要从 CPU 传入,可以减少 CPU 到显卡之间的数据传递。
工程结构如下
根据自己的配置,配置好cuda编译。我的如下
# CUDA translation unit that is compiled by nvcc through a qmake extra compiler.
CUDA_SOURCE = $$PWD/process.cu

# nvcc options shared by every configuration: reuse the host-compiler environment
# that is already set up for the build, and emit code for each listed GPU architecture.
NVCC_OPTIONS = --use-local-env --cl-version 2015 -gencode=arch=compute_35,code=sm_35 \
               -gencode=arch=compute_37,code=sm_37 \
               -gencode=arch=compute_50,code=sm_50 \
               -gencode=arch=compute_52,code=sm_52 \
               -gencode=arch=compute_60,code=sm_60

win32 {
    INCLUDEPATH += $$(CUDA_PATH)\include

    # The 32-bit and 64-bit builds differ only in the CUDA library directory and
    # in the value handed to nvcc's --machine switch.
    contains(QT_ARCH, i386) {
        QMAKE_LIBDIR += $$(CUDA_PATH)\lib\Win32
        CUDA_MACHINE = 32
    } else {
        QMAKE_LIBDIR += $$(CUDA_PATH)\lib\x64
        CUDA_MACHINE = 64
    }

    CUDA_LIBS = cuda.lib cudart.lib
    LIBS += $$CUDA_LIBS

    # Host-compiler flags forwarded by nvcc; the C runtime (/MDd vs /MD) must match
    # the one the rest of the application is linked against.
    CONFIG(debug, debug|release) {
        NVCC_OPTIONS += -Xcompiler \"/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MDd\"
    } else {
        NVCC_OPTIONS += -Xcompiler \"/EHsc /W3 /nologo /Od /FS /Zi /RTC1 /MD\"
    }

    # Extra compiler: one object file per entry of CUDA_SOURCE.
    # NVCC_OPTIONS already carries its own -Xcompiler switch, so it is appended
    # directly; an additional -Xcompiler in front of it would hand its first token
    # (--use-local-env) to the host compiler instead of to nvcc.
    # NOTE(review): the nvcc path is not quoted, so this presumably needs a
    # CUDA_PATH without spaces -- confirm on the build machine.
    cuda_d.input = CUDA_SOURCE
    cuda_d.output = cuda/${QMAKE_FILE_BASE}.o
    cuda_d.commands = $$(CUDA_PATH)\bin\nvcc $$join(INCLUDEPATH,'" -I"','-I"','"') --machine $$CUDA_MACHINE $$NVCC_OPTIONS \
                      -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda_d.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda_d
}
适用于windows 32bit、64bit,debug和release模式。渲染器实现如下
#ifndef CUDAGLRENDER_H
#define CUDAGLRENDER_H
#include <QOpenGLShaderProgram>
#include <QOpenGLBuffer>
#include <QOpenGLTexture>
#include <QOpenGLExtraFunctions>
#include <cuda_gl_interop.h>
/**
 * Draws a textured quad whose texture content is produced on the GPU by CUDA.
 *
 * initsize() builds the shader program, creates the OpenGL texture, registers it
 * with CUDA, allocates the CUDA staging buffer and uploads the quad vertices.
 * render() launches the CUDA kernel, copies its output into the texture and
 * draws the quad with the supplied matrices.
 */
class CudaGLRender
{
public:
    CudaGLRender();

    /// One-time setup of shaders, texture, CUDA resources and vertex data.
    /// @param f OpenGL function table used for the GL calls.
    void initsize(QOpenGLExtraFunctions *f);

    /// Regenerates the image with CUDA and draws it.
    /// @param f        OpenGL function table used for the GL calls.
    /// @param pMatrix  value uploaded to the "uPMatrix" uniform.
    /// @param vMatrix  value uploaded to the "uVMatrix" uniform.
    /// @param mMatrix  value uploaded to the "uMMatrix" uniform.
    void render(QOpenGLExtraFunctions *f, QMatrix4x4 pMatrix,QMatrix4x4 vMatrix,QMatrix4x4 mMatrix);

private:
    QOpenGLShaderProgram program_;   // shader program built from vertex.vsh / fragment.fsh
    QOpenGLBuffer vbo_;              // quad positions followed by texture coordinates
    int image_width = 512,image_height = 512;   // size in pixels of the generated image
    // Texture that receives the CUDA output. Zero-initialised so that render()
    // never binds an indeterminate name when initsize() returned early.
    GLuint tex_cudaResult{0};
    unsigned int *cuda_dest_resource{nullptr};                // CUDA device buffer the kernel writes into
    cudaGraphicsResource *cuda_tex_result_resource{nullptr};  // CUDA handle of the registered GL texture
};
#endif // CUDAGLRENDER_H
#include <QDebug>
#include <cuda_runtime.h>
#include "cudaglrender.h"
// Implemented in the CUDA source: checks that a usable CUDA device exists and
// selects it; returns true on success.
extern "C" bool gpuInit(); // initialization
// Implemented in the CUDA source: host-side wrapper that launches the
// image-generation kernel with the given grid/block/shared-memory configuration,
// writing one unsigned int per pixel into g_odata (row length imgw).
extern "C" void
launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
unsigned int *g_odata,
int imgw); // wraps the CUDA kernel launch
// Nothing to do at construction time: all members carry in-class initialisers or
// default-construct, and GPU resources are only created later by initsize().
CudaGLRender::CudaGLRender() = default;
/**
 * One-time setup of everything render() needs.
 *
 * Builds the shader program, initialises CUDA via gpuInit(), creates the RGBA
 * texture and registers it with CUDA, allocates the CUDA staging buffer and
 * uploads the quad's vertex data. Failures are logged with qDebug(); when
 * gpuInit() fails the function returns before any texture, CUDA resource or
 * vertex buffer is created.
 *
 * @param f OpenGL function table used for the GL calls.
 */
void CudaGLRender::initsize(QOpenGLExtraFunctions *f)
{
    // Run every stage (as before), but remember whether any of them failed so the
    // problem is reported instead of silently producing an empty window.
    bool shadersOk = program_.addCacheableShaderFromSourceFile(QOpenGLShader::Vertex,"vertex.vsh");
    shadersOk = program_.addCacheableShaderFromSourceFile(QOpenGLShader::Fragment,"fragment.fsh") && shadersOk;
    shadersOk = program_.link() && shadersOk;
    if(!shadersOk){
        qDebug() << __FILE__ << __LINE__ << "shader program:" << program_.log();
    }

    if(!gpuInit()){
        return;
    }

    // Create the texture that will receive the CUDA output.
    f->glGenTextures(1, &tex_cudaResult);
    f->glBindTexture(GL_TEXTURE_2D,tex_cudaResult);
    f->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    f->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    f->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    f->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    // A NULL data pointer allocates the texture storage without uploading pixels.
    f->glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, image_width, image_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);

    // Register the texture with CUDA. The flags argument is a register flag;
    // cudaGraphicsRegisterFlagsWriteDiscard has the same numeric value as the
    // map flag that was passed here before, but is the matching enum.
    cudaError res = cudaGraphicsGLRegisterImage(&cuda_tex_result_resource,tex_cudaResult,GL_TEXTURE_2D,cudaGraphicsRegisterFlagsWriteDiscard);
    if(res != cudaSuccess){
        qDebug() << __FILE__ << __LINE__ << "cudaGraphicsGLRegisterImage:" << res << cudaGetErrorString(res);
        cuda_tex_result_resource = nullptr;   // keep the handle in a known state
    }
    f->glBindTexture(GL_TEXTURE_2D,0);

    // Staging buffer the kernel writes into: 4 bytes (RGBA) per pixel.
    const size_t imageBytes = size_t(image_width) * size_t(image_height) * 4 * sizeof(GLubyte);
    res = cudaMalloc((void**)&cuda_dest_resource,imageBytes);
    if(res != cudaSuccess){
        qDebug() << __FILE__ << __LINE__ << "cudaMalloc:" << res << cudaGetErrorString(res);
        cuda_dest_resource = nullptr;         // keep the pointer in a known state
    }

    // Four quad corners (x, y, z) followed by their four texture coordinates (u, v).
    GLfloat points[]{
        -1, 1,0,
        -1,-1,0,
         1,-1,0,
         1, 1,0,
        0,0,
        0,1,
        1,1,
        1,0
    };
    if(!vbo_.create()){
        qDebug() << __FILE__ << __LINE__ << "QOpenGLBuffer::create failed";
        return;
    }
    vbo_.bind();
    vbo_.allocate(points,sizeof(points));   // upload the vertex data into the buffer object
    vbo_.release();                         // render() binds the buffer again when drawing
}
/**
 * Regenerates the image with CUDA, copies it into the OpenGL texture and draws
 * the textured quad.
 *
 * Does nothing when the CUDA resources were never created (initsize() returns
 * early if gpuInit() fails). CUDA failures are logged with qDebug(); the copy is
 * skipped when the texture could not be mapped.
 *
 * @param f        OpenGL function table used for the GL calls.
 * @param pMatrix  value uploaded to the "uPMatrix" uniform.
 * @param vMatrix  value uploaded to the "uVMatrix" uniform.
 * @param mMatrix  value uploaded to the "uMMatrix" uniform.
 */
void CudaGLRender::render(QOpenGLExtraFunctions *f, QMatrix4x4 pMatrix, QMatrix4x4 vMatrix, QMatrix4x4 mMatrix)
{
    // Without a registered texture and a staging buffer there is nothing to
    // generate or copy; calling into CUDA with null handles would only fail.
    if(cuda_tex_result_resource == nullptr || cuda_dest_resource == nullptr){
        return;
    }

    f->glDisable(GL_DEPTH_TEST);
    f->glEnable(GL_CULL_FACE);
    program_.bind();
    vbo_.bind();
    program_.setUniformValue("uPMatrix",pMatrix);
    program_.setUniformValue("uVMatrix",vMatrix);
    program_.setUniformValue("uMMatrix",mMatrix);
    program_.setUniformValue("sTexture",0);
    f->glActiveTexture(GL_TEXTURE0);   // the sampler uniform above refers to unit 0
    f->glBindTexture(GL_TEXTURE_2D,tex_cudaResult);

    // Generate the image into the staging buffer, one thread per pixel.
    // The grid is sized by integer division, so the image size is expected to be
    // a multiple of the block size.
    dim3 block(16,16,1);
    dim3 grid(image_width / block.x,image_height / block.y,1);
    launch_cudaProcess(grid, block, 0, cuda_dest_resource, image_width);
    cudaError res = cudaGetLastError();   // a kernel launch reports its errors only this way
    if(res != cudaSuccess){
        qDebug() << __FILE__ << __LINE__ << "launch_cudaProcess:" << res << cudaGetErrorString(res);
    }

    // Map the texture, copy the staging buffer into it, unmap again. No GL state
    // of the texture is touched while it is mapped; its sampling parameters are
    // set once in initsize().
    res = cudaGraphicsMapResources(1,&cuda_tex_result_resource,0);
    if(res != cudaSuccess){
        qDebug() << __FILE__ << __LINE__ << "cudaGraphicsMapResources:" << res;
    }else{
        cudaArray *texture_ptr = nullptr;
        res = cudaGraphicsSubResourceGetMappedArray(&texture_ptr,cuda_tex_result_resource,0,0);
        if(res != cudaSuccess){
            qDebug() << __FILE__ << __LINE__ << "cudaGraphicsSubResourceGetMappedArray:" << res;
        }else{
            // Rows are tightly packed, so source pitch and copy width are both one
            // row of RGBA bytes. cudaMemcpy2DToArray replaces the deprecated
            // cudaMemcpyToArray.
            const size_t rowBytes = size_t(image_width) * 4 * sizeof(GLubyte);
            res = cudaMemcpy2DToArray(texture_ptr, 0, 0, cuda_dest_resource, rowBytes, rowBytes, size_t(image_height), cudaMemcpyDeviceToDevice);
            if(res != cudaSuccess){
                qDebug() << __FILE__ << __LINE__ << "cudaMemcpy2DToArray:" << res << cudaGetErrorString(res);
            }
        }
        res = cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0);
        if(res != cudaSuccess){
            qDebug() << __FILE__ << __LINE__ << "cudaGraphicsUnmapResources:" << res;
        }
    }

    // Attribute 0: positions (3 floats each) at the start of the buffer.
    // Attribute 1: texture coordinates (2 floats each) after the 4 positions.
    program_.enableAttributeArray(0);
    program_.enableAttributeArray(1);
    program_.setAttributeBuffer(0,GL_FLOAT,0,3,3*sizeof(GLfloat));
    program_.setAttributeBuffer(1,GL_FLOAT,3 * 4 * sizeof(GLfloat),2,2*sizeof(GLfloat));
    f->glDrawArrays(GL_TRIANGLE_FAN,0,4);
    program_.disableAttributeArray(0);
    program_.disableAttributeArray(1);
    vbo_.release();
    program_.release();
    f->glDisable(GL_CULL_FACE);
}
cuda函数实现如下,此实现为sample里面的原样实现
#include <Windows.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <cuda_gl_interop.h>
// Limits x to the closed interval [a, b] (float overload): the value is first
// capped at b, then raised to at least a.
__device__ float clamp(float x, float a, float b)
{
    const float cappedAbove = min(b, x);
    return max(a, cappedAbove);
}
// Limits x to the closed interval [a, b] (int overload): the value is first
// capped at b, then raised to at least a.
__device__ int clamp(int x, int a, int b)
{
    const int cappedAbove = min(b, x);
    return max(a, cappedAbove);
}
// Packs a floating-point RGB colour into one integer. Each channel is clamped to
// [0, 255] and truncated to an integer; red occupies bits 0-7, green bits 8-15
// and blue bits 16-23. The top eight bits are left at zero.
__device__ int rgbToInt(float r, float g, float b)
{
    const int red   = int(clamp(r, 0.0f, 255.0f));
    const int green = int(clamp(g, 0.0f, 255.0f));
    const int blue  = int(clamp(b, 0.0f, 255.0f));
    return red | (green << 8) | (blue << 16);
}
// Fills g_odata with a checker-style test pattern, one thread per pixel.
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) writes pixel (x, y) of an
// image whose rows are imgw pixels long. No shared memory is used.
// Preconditions: g_odata points to device memory holding at least
// imgw * (gridDim.y * blockDim.y) elements. Columns beyond imgw are skipped, but
// rows are NOT bounds-checked because the kernel is not told the image height,
// so the launch must not cover more rows than the buffer has.
__global__ void
cudaProcess(unsigned int *g_odata, int imgw)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads to the right of the image would otherwise spill into the next row.
    if (x >= imgw)
        return;

    // Bit 5 of each coordinate toggles every 32 pixels, giving 32x32 tiles.
    const uchar4 c4 = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0);

    // Index in size_t so large images cannot overflow int arithmetic.
    g_odata[size_t(y) * size_t(imgw) + size_t(x)] = rgbToInt(c4.z, c4.y, c4.x);
}
// Host-side wrapper that launches cudaProcess on the default stream.
//
// grid / block : launch configuration, one thread per pixel.
// sbytes       : dynamic shared-memory size in bytes passed to the launch.
// g_odata      : device buffer receiving one unsigned int per pixel.
// imgw         : row length of the image in pixels.
//
// The launch is skipped when g_odata is null or imgw is not positive. A launch
// error is reported on stdout but left pending (cudaPeekAtLastError does not
// clear it), so the caller can still query it with cudaGetLastError().
extern "C" void
launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
                   unsigned int *g_odata,
                   int imgw)
{
    if (g_odata == NULL || imgw <= 0) {
        std::cout << __FILE__ << ":" << __LINE__
                  << " launch_cudaProcess: invalid output buffer or width" << std::endl;
        return;
    }

    cudaProcess<<< grid, block, sbytes >>>(g_odata, imgw);

    // A kernel launch returns nothing; configuration errors only show up here.
    const cudaError_t launchStatus = cudaPeekAtLastError();
    if (launchStatus != cudaSuccess) {
        std::cout << __FILE__ << ":" << __LINE__ << " cudaProcess launch: "
                  << cudaGetErrorString(launchStatus) << std::endl;
    }
}
// Checks that CUDA device 0 is usable and makes it the current device.
//
// Returns true when the device was selected, false otherwise. Every failure is
// reported on stdout: no device present, device properties unavailable, device
// in "compute mode prohibited", compute capability major version below 1, or
// the device could not be selected.
extern "C" bool gpuInit()
{
    int deviceCount = 0;
    cudaError_t res = cudaGetDeviceCount(&deviceCount);
    if (res != cudaSuccess) {
        std::cout << __FILE__ << ":" << __LINE__ << " " << res
                  << " (" << cudaGetErrorString(res) << ")" << std::endl;
        return false;
    }
    if (deviceCount == 0) {
        std::cout << "CUDA error: no devices supporting CUDA." << std::endl;
        return false;
    }

    const int dev = 0;
    cudaDeviceProp deviceProp;
    res = cudaGetDeviceProperties(&deviceProp, dev);
    if (res != cudaSuccess) {
        std::cout << __FILE__ << ":" << __LINE__ << " " << res
                  << " (" << cudaGetErrorString(res) << ")" << std::endl;
        return false;
    }
    if (deviceProp.computeMode == cudaComputeModeProhibited) {
        std::cout << "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice()." << std::endl;
        return false;
    }
    if (deviceProp.major < 1) {
        std::cout << "Error: device does not support CUDA." << std::endl;
        return false;
    }
    std::cout << "Using device " << dev << ": " << deviceProp.name << std::endl;

    // cudaGLSetGLDevice() is deprecated; the CUDA runtime documentation states
    // that associating a device with an OpenGL context is no longer necessary,
    // so the device is selected with cudaSetDevice(). The result is compared
    // explicitly and a failure is reported rather than folded into a bare '!'.
    res = cudaSetDevice(dev);
    if (res != cudaSuccess) {
        std::cout << __FILE__ << ":" << __LINE__ << " " << res
                  << " (" << cudaGetErrorString(res) << ")" << std::endl;
        return false;
    }
    return true;
}
着色器和前面一样,一个简单的绘制纹理
#version 330
// Vertex shader: transforms the position by the projection, view and model
// matrices and forwards the texture coordinate to the fragment stage.
uniform mat4 uPMatrix;
uniform mat4 uVMatrix;
uniform mat4 uMMatrix;

layout(location = 0) in vec3 aPosition;
layout(location = 1) in vec2 aTexCood;

smooth out vec2 vTextureCood;

void main(void)
{
    vTextureCood = aTexCood;

    // Same multiplication order as a single chained expression: (P * V) * M.
    mat4 mvp = uPMatrix * uVMatrix * uMMatrix;
    gl_Position = mvp * vec4(aPosition, 1.0);
}
#version 330
// Fragment shader: outputs the texel sampled from sTexture at the interpolated
// texture coordinate.
uniform sampler2D sTexture;
in vec2 vTextureCood;
out vec4 fragColor;

void main()
{
    // texture2D() is not available in the core profile of GLSL 3.30; the
    // overloaded texture() is the replacement.
    fragColor = texture(sTexture, vTextureCood);
}
在Widget中的调用还是一样,新建一个渲染器,初始化,然后再渲染。
#ifndef WIDGET_H
#define WIDGET_H
#include <QOpenGLWidget>
#include "cudaglrender.h"
// Only a pointer to QTimer is stored, so a forward declaration is sufficient and
// keeps this header independent of the order in which it is included.
class QTimer;

/**
 * OpenGL widget that shows the CUDA-generated image.
 *
 * Owns a CudaGLRender, initialises it in initializeGL(), draws it in paintGL()
 * and requests a repaint on every timeout of its timer.
 */
class Widget : public QOpenGLWidget
{
    Q_OBJECT
public:
    Widget(QWidget *parent = nullptr);
    ~Widget();

protected:
    void initializeGL() override;
    void resizeGL(int w, int h) override;
    void paintGL() override;

private:
    QTimer *tm_{nullptr};   // repaint timer, created in the constructor
    CudaGLRender render_;   // renderer for the CUDA-generated texture
    QMatrix4x4 pMatrix_;    // projection matrix, rebuilt in resizeGL()
    QVector3D camera_;      // eye position used for the view matrix

private slots:
    void slotTimeout();
};
#endif // WIDGET_H
#include <QTimer>
#include "widget.h"
// Creates the widget and starts a 30 ms timer whose timeout triggers
// slotTimeout(), which in turn schedules a repaint.
Widget::Widget(QWidget *parent)
    : QOpenGLWidget(parent)
{
    tm_ = new QTimer(this);   // parented to the widget, so Qt deletes it with the widget
    // Pointer-to-member connect: a misspelt signal or slot is a compile error
    // instead of a connection that silently fails at run time.
    connect(tm_, &QTimer::timeout, this, &Widget::slotTimeout);
    tm_->start(30);
}
// No explicit teardown is performed; members are destroyed in the usual order.
Widget::~Widget() = default;
void Widget::initializeGL()
{
camera_ = QVector3D(0,0,3);
render_.initsize(QOpenGLContext::currentContext()->extraFunctions());
}
void Widget::resizeGL(int w, int h)
{
pMatrix_.setToIdentity();
pMatrix_.perspective(45,float(w)/h,0.01f,100.0f);
}
void Widget::paintGL()
{
QOpenGLExtraFunctions *f = QOpenGLContext::currentContext()->extraFunctions();
f->glClearColor(0.0,0.0,0.0,1.0);
f->glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
QMatrix4x4 vMatrix;
vMatrix.lookAt(camera_,QVector3D(0,0,0),QVector3D(0,1,0));
QMatrix4x4 mMatrix;
render_.render(f,pMatrix_,vMatrix,mMatrix);
}
void Widget::slotTimeout()
{
update();
}
这篇主要实现了在Qt中用cuda生成图像,并与opengl互调,减少cpu负荷