CUDA一维线程

最新推荐文章于 2024-03-06 17:00:00 发布

NYG8945

最新推荐文章于 2024-03-06 17:00:00 发布

阅读量605

点赞数

分类专栏：CUDA学习日记　文章标签：cuda 并行计算 线程

本文链接：https://blog.csdn.net/nyg8945/article/details/52851181

版权

CUDA学习日记专栏收录该内容

12 篇文章 2 订阅

订阅专栏

自己敲的第一个并行计算程序

/**

实现线程的显示

*/



#include <stdio.h>

#include <iostream>
#include <cuda_runtime.h> // For the CUDA runtime routines (prefixed with "cuda_")

#include <DEVICE_LAUNCH_PARAMETERS.h> //我在查询中找到的头文件 有可能有别的表达方式








/**
 * CUDA kernel: every thread records its own block-local index.
 *
 * Each thread computes a flat index from its block and thread coordinates
 * (x dimension only) and stores threadIdx.x, converted to float, at that
 * position of x.
 *
 * The kernel performs no bounds check, so the caller must supply a
 * device-accessible buffer holding at least gridDim.x * blockDim.x floats.
 *
 * @param x  output array, written at index blockIdx.x * blockDim.x + threadIdx.x
 */
__global__ void ShowData(float *x)
{
    const unsigned int localIdx = threadIdx.x;

    // Flat position of this thread across the whole 1-D grid.
    const int globalIdx = blockIdx.x * blockDim.x + localIdx;

    x[globalIdx] = static_cast<float>(localIdx);
}



/**
 * Terminates the program with a diagnostic if a CUDA runtime call failed.
 *
 * @param status  value returned by the CUDA runtime call
 * @param what    short description of the attempted step, inserted into the
 *                error message after "Failed to"
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Failed to %s (error code %s)!\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host entry point: launches ShowData with 2 blocks of 8 threads, copies the
 * result back and prints one "<index> <value>" line per thread.
 *
 * Every allocation and CUDA runtime call is checked; on failure a message is
 * written to stderr and the process exits with EXIT_FAILURE.
 *
 * @return 0 on success
 */
int main()
{
    const int nBlocks = 2;
    const int nThreads = 8;
    const int nSizes = nBlocks * nThreads;          // one output element per launched thread
    const size_t nBytes = nSizes * sizeof(float);

    // Allocate host and device storage.
    float *pfHost = (float*)malloc(nBytes);
    if (pfHost == NULL)
    {
        fprintf(stderr, "Failed to allocate host buffer!\n");
        exit(EXIT_FAILURE);
    }

    float *pfDevice = NULL;
    checkCuda(cudaMalloc((void**)&pfDevice, nBytes), "allocate device buffer");

    // Run the kernel. A launch does not return an error code itself, so
    // configuration errors have to be fetched with cudaGetLastError().
    ShowData<<<nBlocks, nThreads>>>(pfDevice);
    checkCuda(cudaGetLastError(), "launch ShowData kernel");

    // The blocking copy waits for the kernel to finish, so errors raised
    // during kernel execution are reported here as well.
    checkCuda(cudaMemcpy(pfHost, pfDevice, nBytes, cudaMemcpyDeviceToHost),
              "copy results from device to host");

    for (int n = 0; n < nSizes; n++)
    {
        printf("%d %f \n", n, pfHost[n]);
    }

    checkCuda(cudaFree(pfDevice), "free device buffer");
    free(pfHost);

#ifdef _WIN32
    // Keep the console window open when started from the Windows GUI;
    // the "pause" shell command does not exist on other platforms.
    system("pause");
#endif

    return 0;
}

参考：《GPGPU编程技术——从GLSL、CUDA到OpenCL》第四章的代码，见书第134页。

用 NVIDIA Visual Profiler 分析时，主函数末尾要调用清理设备上下文的函数 cudaThreadExit()（该函数已被弃用，新版 CUDA 中对应的是 cudaDeviceReset()），以便性能分析数据在程序退出前被完整写出。如下：

#include <stdio.h>

#include <iostream>
#include <cuda_runtime.h> // For the CUDA runtime routines (prefixed with "cuda_")

#include <DEVICE_LAUNCH_PARAMETERS.h> //我在查询中找到的头文件 有可能有别的表达方式








/**
 * CUDA kernel: every thread records its own block-local index.
 *
 * Each thread computes a flat index from its block and thread coordinates
 * (x dimension only) and stores threadIdx.x, converted to float, at that
 * position of x.
 *
 * The kernel performs no bounds check, so the caller must supply a
 * device-accessible buffer holding at least gridDim.x * blockDim.x floats.
 *
 * @param x  output array, written at index blockIdx.x * blockDim.x + threadIdx.x
 */
__global__ void ShowData(float *x)
{
    const unsigned int localIdx = threadIdx.x;

    // Flat position of this thread across the whole 1-D grid.
    const int globalIdx = blockIdx.x * blockDim.x + localIdx;

    x[globalIdx] = static_cast<float>(localIdx);
}



/**
 * Terminates the program with a diagnostic if a CUDA runtime call failed.
 *
 * @param status  value returned by the CUDA runtime call
 * @param what    short description of the attempted step, inserted into the
 *                error message after "Failed to"
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Failed to %s (error code %s)!\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host entry point (profiler-friendly variant): launches ShowData with
 * 2 blocks of 8 threads, copies the result back, prints one
 * "<index> <value>" line per thread and resets the device before exiting.
 *
 * Every allocation and CUDA runtime call is checked; on failure a message is
 * written to stderr and the process exits with EXIT_FAILURE.
 *
 * @return 0 on success
 */
int main()
{
    const int nBlocks = 2;
    const int nThreads = 8;
    const int nSizes = nBlocks * nThreads;          // one output element per launched thread
    const size_t nBytes = nSizes * sizeof(float);

    // Allocate host and device storage.
    float *pfHost = (float*)malloc(nBytes);
    if (pfHost == NULL)
    {
        fprintf(stderr, "Failed to allocate host buffer!\n");
        exit(EXIT_FAILURE);
    }

    float *pfDevice = NULL;
    checkCuda(cudaMalloc((void**)&pfDevice, nBytes), "allocate device buffer");

    // Run the kernel. A launch does not return an error code itself, so
    // configuration errors have to be fetched with cudaGetLastError().
    ShowData<<<nBlocks, nThreads>>>(pfDevice);
    checkCuda(cudaGetLastError(), "launch ShowData kernel");

    // The blocking copy waits for the kernel to finish, so errors raised
    // during kernel execution are reported here as well.
    checkCuda(cudaMemcpy(pfHost, pfDevice, nBytes, cudaMemcpyDeviceToHost),
              "copy results from device to host");

    for (int n = 0; n < nSizes; n++)
    {
        printf("%d %f \n", n, pfHost[n]);
    }

    checkCuda(cudaFree(pfDevice), "free device buffer");
    free(pfHost);

    // Tear down the device context before exiting; the original author added
    // this step so that NVIDIA Visual Profiler can analyse the run.
    // cudaDeviceReset() is the replacement for the deprecated cudaThreadExit().
    checkCuda(cudaDeviceReset(), "reset device");

#ifdef _WIN32
    // Keep the console window open when started from the Windows GUI;
    // the "pause" shell command does not exist on other platforms.
    system("pause");
#endif

    return 0;
}