深度学习完全攻略!(连载五:GPU加速技术指南)

本文已同步至公众号,欢迎订阅。

第四章  cm编译器

这一章,我们就用一个例子来说明cm的client和server是如何联系到一起,并最终运行的。以高斯模糊为例。此例也是Intel提供的一个案例,但是没有说明怎么用。哈哈哈。

第一节 建一个client的程序

假设文件名字为gauss_client.cpp

#include "cm_rt.h"
#include "common/bitmap_helpers.h"
#include "common/cm_rt_helpers.h"
#include "common/isa_helpers.h"

using cm::util::bitmap::BitMap;
// Defines the number of columns per thread.
#define NUM_COLS_PER_THREAD 8
// Defines the number of rows per thread.
#define NUM_ROWS_PER_THREAD 8

// Gaussian filter coefficients. They are filled in by CalculateCoefficients()
// and later handed to the GPU kernels as kernel arguments.
float a0 = 0;
float a1 = 0;
float a2 = 0;
float a3 = 0;
float b1 = 0;
float b2 = 0;
float coefp = 0;
float coefn = 0;

// Computes the gaussian filter coefficients for the given sigma and
// derivative order, and stores them in the globals above.
//
// sigma: filter width; values below 0.1 are treated as 0.1.
// order: 0, 1 or 2. Any other value prints an error to stderr and returns
//        after b1/b2 have been updated but before the remaining
//        coefficients are touched.
void CalculateCoefficients(float sigma, int order) {
    float clamped_sigma = sigma;
    if (clamped_sigma < 0.1f) {
        clamped_sigma = 0.1f;
    }
    const float alpha = 1.695f / clamped_sigma;
    const float decay = static_cast<float>(exp(-alpha));
    const float decay_sq = static_cast<float>(exp(-2 * alpha));

    // The feedback terms do not depend on the order.
    b1 = -2 * decay;
    b2 = decay_sq;

    if (order == 0) {
        const float gain =
            (1 - decay) * (1 - decay) / (1 + 2 * alpha * decay - decay_sq);
        a0 = gain;
        a1 = gain * (alpha - 1) * decay;
        a2 = gain * (alpha + 1) * decay;
        a3 = -gain * decay_sq;
    } else if (order == 1) {
        const float gain = (1 - decay) * (1 - decay) / decay;
        a0 = gain * decay;
        a3 = 0;
        a1 = a3;
        a2 = -a0;
    } else if (order == 2) {
        const float ea = static_cast<float>(exp(-alpha));
        const float gain = -(decay_sq - 1) / (2 * alpha * decay);
        const float norm = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) /
                            (3 * ea + 1 + 3 * ea * ea + ea * ea * ea));
        a0 = norm;
        a1 = -norm * (1 + gain * alpha) * decay;
        a2 = norm * (1 - gain * alpha) * decay;
        a3 = -norm * decay_sq;
    } else {
        fprintf(stderr, "gaussianFilter: invalid order parameter!\n");
        return;
    }

    // Both boundary gains share the same denominator.
    const float denom = 1 + b1 + b2;
    coefp = (a0 + a1) / denom;
    coefn = (a2 + a3) / denom;

    printf("Coefficients are: \n");
    printf(" a0 = %f, a1 = %f, a2 = %f, a3 = %f, b1 = %f, b2 = %f\n", a0, a1, a2, a3, b1, b2);
}

// Host-side driver for the gaussian blur sample.
//
// Loads "lena.bmp", expands it from RGB to RGBA, runs the
// "gaussianVertical" kernel (input surface -> temp surface) followed by the
// "gaussianHorizontal" kernel (temp surface -> output surface) on the GPU
// through the CM runtime, converts the result back to RGB, saves it as
// "blur_out.bmp" and compares it against "blur_gold.bmp".
//
// Returns 0 when the comparison passes and -1 otherwise. Exits with status 1
// if the image format is unsupported or the ISA file is empty.
int main(int argc, char *argv[]) {
    // Loads an input image named "lena.bmp".
    auto input_image = BitMap::load("lena.bmp");

    // Gets the width and height of the input image.
    unsigned int width = input_image.getWidth();
    unsigned int height = input_image.getHeight();
    // Both values are unsigned, so they are printed with %u.
    printf("image width = %u, height = %u\n", width, height);

    // Checks the value of width, height and bpp(bits per pixel) of the image.
    // Only images in 8-bit RGB format are supported.
    // Only images with width and height a multiple of 8 are supported.
    if (width & 7 || height & 7 || input_image.getBPP() != 24) {
        std::cerr << "Error: Only images in 8-bit RGB format with width and "
                  << "height a multiple of 8 are supported.\n";
        std::exit(1);
    }

    // Copies input image to output except for the data.
    auto output_image = input_image;

    // Sets image size in bytes. There are a total of width*height pixels and
    // each pixel occupies (out.getBPP()/8) bytes.
    unsigned int img_size = width * height * output_image.getBPP() / 8;

    // Sets output to blank image.
    output_image.setData(new unsigned char[img_size]);

    // Allocates system memory for rgb_to_rgba to convert image format from
    // RGB to RGBA.
    // Allocates system memory for rgba_to_rgb to convert image format from
    // RGBA to RGB.
    unsigned int num_pixels = width * height;
    unsigned char *rgb_to_rgba = new unsigned char[num_pixels * 4];
    unsigned char *rgba_to_rgb = new unsigned char[num_pixels * 4];

    // Converts image format from RGB to RGBA.
    // Copies the RGB values from the image, set the 4th byte with zero.
    // The index is unsigned to match num_pixels.
    for (unsigned int i = 0; i < num_pixels; i++) {
        rgb_to_rgba[i * 4] = input_image.getData()[i * 3];
        rgb_to_rgba[i * 4 + 1] = input_image.getData()[i * 3 + 1];
        rgb_to_rgba[i * 4 + 2] = input_image.getData()[i * 3 + 2];
        rgb_to_rgba[i * 4 + 3] = 0;
    }

    // Computes coefficients for gaussian filter.
    float sigma = 10.0f;
    int order = 0;
    CalculateCoefficients(sigma, order);

    // Creates a CmDevice from scratch.
    // Param device: pointer to the CmDevice object.
    // Param version: CM API version supported by the runtime library.
    CmDevice *device = nullptr;
    unsigned int version = 0;
    cm_result_check(::CreateCmDevice(device, version));

    // The file gauss_genx.isa is generated when the kernels in the file
    // gauss_genx.cpp are compiled by the CM compiler.
    // Reads in the virtual ISA from "gauss_genx.isa" to the code buffer.
    std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");
    if (isa_code.size() == 0) {
        std::cerr << "Error: empty ISA binary.\n";
        std::exit(1);
    }

    // Creates a CmProgram object consisting of the kernels loaded from the code
    // buffer.
    // Param isa_code.data(): Pointer to the code buffer containing the virtual
    // ISA.
    // Param isa_code.size(): Size in bytes of the code buffer containing the
    // virtual ISA.
    CmProgram *program = nullptr;
    cm_result_check(device->LoadProgram(const_cast<char *>(isa_code.data()),
                                        isa_code.size(),
                                        program));

    // For vertical direction.

    // Creates the kernel.
    // Param program: CM Program from which the kernel is created.
    // Param "gaussianVertical": The kernel name which should be no more than 256
    // bytes including the null terminator.
    CmKernel *kernel_vertical = nullptr;
    cm_result_check(device->CreateKernel(program,
                                         "gaussianVertical",
                                         kernel_vertical));

    // Creates input surface with given width and height in pixels and format.
    // The surface is 4 * width A8 elements wide, i.e. one byte per RGBA
    // component.
    CmSurface2D *input_surface = nullptr;
    cm_result_check(device->CreateSurface2D(4 * width,
                                            height,
                                            CM_SURFACE_FORMAT_A8,
                                            input_surface));

    // Copies system memory content to the input surface using the CPU. The
    // system memory content is the data of the input image in RGBA format.
    // The size of data copied is the size of data in the rgb_to_rgba.
    cm_result_check(input_surface->WriteSurface(rgb_to_rgba, nullptr));

    // Creates the temp surface. The width, height and format is the same as
    // the input surface.
    // The temp surface contains the output of kernel_vertical.
    CmSurface2D *temp_surface = nullptr;
    cm_result_check(device->CreateSurface2D(4 * width,
                                            height,
                                            CM_SURFACE_FORMAT_A8,
                                            temp_surface));

    // When a surface is created by the CmDevice a SurfaceIndex object is
    // created. This object contains a unique index value that is mapped to the
    // surface.
    // Gets the input surface index.
    SurfaceIndex *input_surface_idx = nullptr;
    cm_result_check(input_surface->GetIndex(input_surface_idx));

    // Gets the temp surface index.
    SurfaceIndex *temp_surface_idx = nullptr;
    cm_result_check(temp_surface->GetIndex(temp_surface_idx));

    // Sets a per kernel argument.
    // Sets input surface index as the first argument of kernel_vertical.
    // Sets temp surface index as the second argument of kernel_vertical.
    cm_result_check(kernel_vertical->SetKernelArg(0,
                                                  sizeof(SurfaceIndex),
                                                  input_surface_idx));
    cm_result_check(kernel_vertical->SetKernelArg(1,
                                                  sizeof(SurfaceIndex),
                                                  temp_surface_idx));

    // Sets the image width and height as the third and the fourth argument
    // of kernel_vertical.
    cm_result_check(kernel_vertical->SetKernelArg(2, 4, &width));
    cm_result_check(kernel_vertical->SetKernelArg(3, 4, &height));

    // Sets filter coefficients as the rest arguments of kernel_vertical.
    cm_result_check(kernel_vertical->SetKernelArg(4, 4, &a0));
    cm_result_check(kernel_vertical->SetKernelArg(5, 4, &a1));
    cm_result_check(kernel_vertical->SetKernelArg(6, 4, &a2));
    cm_result_check(kernel_vertical->SetKernelArg(7, 4, &a3));
    cm_result_check(kernel_vertical->SetKernelArg(8, 4, &b1));
    cm_result_check(kernel_vertical->SetKernelArg(9, 4, &b2));
    cm_result_check(kernel_vertical->SetKernelArg(10, 4, &coefp));
    cm_result_check(kernel_vertical->SetKernelArg(11, 4, &coefn));

    // Each CmKernel can be executed by multiple concurrent threads.
    // Here, for "kernel_vertical" kernel, each thread works on
    // NUM_COLS_PER_THREAD columns in vertical direction.
    int thread_width = width / NUM_COLS_PER_THREAD;

    // Creates a CmThreadSpace object.
    // There are two usage models for the thread space. One is to define the
    // dependency between threads to run in the GPU. The other is to define a
    // thread space where each thread can get a pair of coordinates during
    // kernel execution. For this example, we use the latter usage model.
    CmThreadSpace *thread_space = nullptr;
    cm_result_check(device->CreateThreadSpace(thread_width,
                                              1,
                                              thread_space));

    // Creates a task queue.
    // The CmQueue is an in-order queue. Tasks get executed according to the
    // order they are enqueued. The next task does not start execution until the
    // current task finishes.
    CmQueue *cmd_queue = nullptr;
    cm_result_check(device->CreateQueue(cmd_queue));

    // Creates a CmTask object.
    // The CmTask object is a container for CmKernel pointers. It is used to
    // enqueue the kernels for execution.
    CmTask *task = nullptr;
    cm_result_check(device->CreateTask(task));

    // Adds a CmKernel pointer to CmTask.
    // This task has one kernel.
    cm_result_check(task->AddKernel(kernel_vertical));

    // Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the
    // function returns immediately without waiting for the GPU to start or
    // finish execution of the task. The runtime will query the HW status. If
    // the hardware is not busy, the runtime will submit the task to the
    // driver/HW; otherwise, the runtime will submit the task to the driver/HW
    // at another time.
    // An event, "sync_event", is created to track the status of the task.
    CmEvent *sync_event = nullptr;
    cm_result_check(cmd_queue->Enqueue(task,
                                       sync_event,
                                       thread_space));

    // Destroys a CmTask object.
    // CmTask will be destroyed when CmDevice is destroyed.
    // Here, the application destroys the CmTask object by itself.
    cm_result_check(device->DestroyTask(task));

    // For horizontal direction.

    // Creates the kernel.
    // Param program: CM Program from which the kernel is created.
    // Param "gaussianHorizontal": The kernel name which should be no more than
    // 256 bytes including the null terminator.
    CmKernel *kernel_horizontal = nullptr;
    cm_result_check(device->CreateKernel(program,
                                         "gaussianHorizontal",
                                         kernel_horizontal));

    // Creates the output surface. The width, height and format is the same as
    // the input surface.
    // The output surface contains the output of kernel_horizontal.
    CmSurface2D *output_surface = nullptr;
    cm_result_check(device->CreateSurface2D(4 * width,
                                            height,
                                            CM_SURFACE_FORMAT_A8,
                                            output_surface));

    // Gets the output surface index.
    SurfaceIndex *output_surface_idx = nullptr;
    cm_result_check(output_surface->GetIndex(output_surface_idx));

    // Sets a per kernel argument.
    // Sets the output of kernel_vertical as the input of kernel_horizontal.
    // Sets temp surface index as the first argument of kernel_horizontal.
    // Sets output surface index as the second argument of kernel_horizontal.
    cm_result_check(kernel_horizontal->SetKernelArg(0,
                                                    sizeof(SurfaceIndex),
                                                    temp_surface_idx));
    cm_result_check(kernel_horizontal->SetKernelArg(1,
                                                    sizeof(SurfaceIndex),
                                                    output_surface_idx));

    // Sets the image width and height as the third and the fourth argument
    // of kernel_horizontal.
    cm_result_check(kernel_horizontal->SetKernelArg(2, 4, &width));
    cm_result_check(kernel_horizontal->SetKernelArg(3, 4, &height));

    // Sets filter coefficients as the rest arguments of kernel_horizontal.
    cm_result_check(kernel_horizontal->SetKernelArg(4, 4, &a0));
    cm_result_check(kernel_horizontal->SetKernelArg(5, 4, &a1));
    cm_result_check(kernel_horizontal->SetKernelArg(6, 4, &a2));
    cm_result_check(kernel_horizontal->SetKernelArg(7, 4, &a3));
    cm_result_check(kernel_horizontal->SetKernelArg(8, 4, &b1));
    cm_result_check(kernel_horizontal->SetKernelArg(9, 4, &b2));
    cm_result_check(kernel_horizontal->SetKernelArg(10, 4, &coefp));
    cm_result_check(kernel_horizontal->SetKernelArg(11, 4, &coefn));

    // Each CmKernel can be executed by multiple concurrent threads.
    // Here, for "kernel_horizontal" kernel, each thread works on
    // NUM_ROWS_PER_THREAD rows in horizontal direction.
    int thread_height = height / NUM_ROWS_PER_THREAD;

    // Creates a second CmThreadSpace object, sized for the horizontal pass.
    // As above, the thread space is only used to give each thread a pair of
    // coordinates during kernel execution. The thread_space pointer is reused.
    cm_result_check(device->CreateThreadSpace(thread_height,
                                              1,
                                              thread_space));

    // Creates a CmTask object.
    // The CmTask object is a container for CmKernel pointers. It is used to
    // enqueue the kernels for execution.
    cm_result_check(device->CreateTask(task));

    // Adds a CmKernel pointer to CmTask.
    // This task has one kernel.
    cm_result_check(task->AddKernel(kernel_horizontal));

    // Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the
    // function returns immediately without waiting for the GPU to start or
    // finish execution of the task. The runtime will query the HW status. If
    // the hardware is not busy, the runtime will submit the task to the
    // driver/HW; otherwise, the runtime will submit the task to the driver/HW
    // at another time.
    // The "sync_event" pointer is reused to track the status of this task.
    cm_result_check(cmd_queue->Enqueue(task,
                                       sync_event,
                                       thread_space));

    // Destroys a CmTask object.
    // CmTask will be destroyed when CmDevice is destroyed.
    // Here, the application destroys the CmTask object by itself.
    cm_result_check(device->DestroyTask(task));

    // Reads the output surface content to the system memory using the CPU.
    // The size of data copied is the size of data in Surface.
    // It is a blocking call. The function will not return until the copy
    // operation is completed.
    // The dependent event "sync_event" ensures that the reading of the surface
    // will not happen until its state becomes CM_STATUS_FINISHED.
    cm_result_check(output_surface->ReadSurface(rgba_to_rgb,
                                                sync_event));

    // Destroys the CmDevice.
    // Also destroys surfaces, kernels, tasks, thread spaces, and queues that
    // were created using this device instance that have not explicitly been
    // destroyed by calling the respective destroy functions.
    cm_result_check(::DestroyCmDevice(device));

    // Converts image format from RGBA to RGB by dropping every 4th byte.
    unsigned char *tmp = new unsigned char[num_pixels * 3];
    for (unsigned int i = 0; i < num_pixels; i++) {
        tmp[i * 3] = rgba_to_rgb[i * 4];
        tmp[i * 3 + 1] = rgba_to_rgb[i * 4 + 1];
        tmp[i * 3 + 2] = rgba_to_rgb[i * 4 + 2];
    }
    output_image.setData(tmp);

    // Saves the output image data into the file "blur_out.bmp".
    output_image.save("blur_out.bmp");

    // Frees the RGBA staging buffers. "tmp" was handed to output_image above
    // and is not freed here.
    delete[] rgb_to_rgba;
    delete[] rgba_to_rgb;

    // Checks result.
    if (BitMap::checkResult("blur_out.bmp",
                            "blur_gold.bmp",
                            5)) {
        std::cout << "PASSED" << std::endl;
        return 0;
    } else {
        std::cout << "FAILED" << std::endl;
        return -1;
    }
}

第二节 建一个server的程序

假设文件名字为gauss_genx.cpp

#include <cm/cm.h>
#define NUM_COMPONENTS 4
// number of rows we read in at once
#define NUM_ROWS_PER_ITER 8
// number of columns per thread
#define NUM_COLS_PER_THREAD 8
#define SIMD_SIZE (NUM_COLS_PER_THREAD * NUM_COMPONENTS)

#define CLAMP_TO_EDGE 1

// for horizontal direction
// number of rows per thread
#define NUM_ROWS_PER_THREAD 8
// number of columns we read in at once
#define NUM_COLS_PER_ITER 8

// Vertical pass of the recursive gaussian filter.
//
// Each thread handles one strip that is SIMD_SIZE (32) bytes wide, i.e.
// NUM_COLS_PER_THREAD pixels of NUM_COMPONENTS bytes each; the strip is
// selected by get_thread_origin_x(). The strip is filtered top-to-bottom
// and written to OUTBUF, then filtered bottom-to-top and the second result
// is added onto what the first pass stored in OUTBUF.
//
// INBUF:  source surface, read as uchar data.
// OUTBUF: destination surface; written by the first pass, then read back and
//         rewritten by the second pass.
// width:  not referenced by this kernel.
// height: number of rows; for now assumed to be divisible by
//         NUM_ROWS_PER_ITER (8).
// a0, a1, b1, b2: coefficients used by the top-to-bottom pass.
// a2, a3, b1, b2: coefficients used by the bottom-to-top pass.
// coefp, coefn: scale factors used to seed the output history of the first
//         and second pass when CLAMP_TO_EDGE is enabled.
extern "C" _GENX_MAIN_ void
gaussianVertical( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn )
{

  // Raw 8-row tiles as read from / written to the surfaces.
  matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> image;
  matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> outImage;
  // Current row and the filter history of the top-to-bottom pass, scaled
  // by 1/255.
  vector<float, SIMD_SIZE> in;
  vector<float, SIMD_SIZE> out;
  vector<float, SIMD_SIZE> inMinusOne;
  vector<float, SIMD_SIZE> outMinusOne;
  vector<float, SIMD_SIZE> outMinusTwo;

  uint id = get_thread_origin_x();

#if CLAMP_TO_EDGE
  // Seed the history from the first row, as if that row were repeated above
  // the image.
  matrix<uchar, 1, SIMD_SIZE> firstRow;
  read( INBUF, id * SIMD_SIZE, 0, firstRow);
  inMinusOne = firstRow;
  inMinusOne *= 1/255.0f;
  outMinusTwo = coefp * inMinusOne;
  outMinusOne = outMinusTwo;
#else
  // Without clamping, the history starts at zero.
  inMinusOne = 0;
  outMinusOne = 0;
  outMinusTwo = 0;
#endif

  // Top-to-bottom pass: read in NUM_ROWS_PER_ITER (8) rows at a time.
  for( int i = 0; i < height; i += NUM_ROWS_PER_ITER ) {
    read( INBUF, id * SIMD_SIZE, i, image );

    #pragma unroll
    for( unsigned j = 0; j < NUM_ROWS_PER_ITER; j++ ) {
      in = image.row(j);
      in *= 1/255.0f;
      //out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;
      out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);
      // Shift the history; the unclamped value of out is what gets carried
      // into the next row.
      inMinusOne = in;
      outMinusTwo = outMinusOne;
      outMinusOne = out;

      //clamp the value to [0,1]
      out = cm_add<float>(out, 0.0f, SAT);
      outImage.row(j) = out * 255.0f;
    }

    //write back to surface
    write( OUTBUF, id*SIMD_SIZE, i, outImage );
  }

  // Filter history of the bottom-to-top pass.
  vector<float, SIMD_SIZE> inPlusOne;
  vector<float, SIMD_SIZE> inPlusTwo;
  vector<float, SIMD_SIZE> outPlusOne;
  vector<float, SIMD_SIZE> outPlusTwo;
  vector<float, SIMD_SIZE> temp;

#if CLAMP_TO_EDGE
  // Seed the history from the last row, as if that row were repeated below
  // the image.
  matrix<uchar, 1, SIMD_SIZE> lastRow;
  read( INBUF, id * SIMD_SIZE, height - 1, lastRow );
  inPlusOne = lastRow;
  inPlusOne *= 1/255.0f;
  inPlusTwo = inPlusOne;
  outPlusOne = coefn * inPlusOne;
  outPlusTwo = outPlusOne;
#else
  inPlusOne = 0;
  inPlusTwo = 0;
  outPlusOne = 0;
  outPlusTwo = 0;
#endif

  // Bottom-to-top pass: read 8 rows at a time, in reverse direction.
  for( int i = height - NUM_ROWS_PER_ITER; i >= 0; i -= NUM_ROWS_PER_ITER ) {
    read( INBUF, id * SIMD_SIZE, i, image );
    // OUTBUF was written by the loop above, so it is read through MODIFIED().
    // NOTE(review): presumably MODIFIED marks the surface as changed within
    // this kernel - confirm against the CM language specification.
    read( MODIFIED(OUTBUF), id * SIMD_SIZE, i, outImage );

    #pragma unroll
    for( int j = NUM_ROWS_PER_ITER - 1; j >= 0; j-- ) {
      in = image.row(j);
      in *= 1 / 255.0f;
      // This pass only uses rows below the current one; the current row is
      // pushed into the history afterwards.
      //temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;
      temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);
      inPlusTwo = inPlusOne;
      inPlusOne = in;
      outPlusTwo = outPlusOne;
      outPlusOne = temp;

      // Add this pass's result to the stored first-pass result, with
      // saturation.
      out = outImage.row(j);
      out = cm_add<float>( out * (1/255.0f), temp, SAT );
      outImage.row(j) = out * 255;
    }

    //write back to surface
    write( OUTBUF, id*SIMD_SIZE, i, outImage );
  }
}

// Transposes one vertical strip of 8x8 tiles of 32-bit elements.
//
// For every group of 8 rows, an 8x8 uint tile is read from INBUF at byte
// offset id * 32, transposed, and written to OUTBUF with the horizontal and
// vertical positions exchanged.
//
// id:     index of the strip to process.
// width:  not referenced by this kernel.
// height: number of rows to walk through, in steps of 8.
extern "C" _GENX_MAIN_ void
transpose( SurfaceIndex INBUF, SurfaceIndex OUTBUF, unsigned id, int width, int height ) {

  matrix<uint, 8, 8> tile;
  matrix<uint, 8, 8> flipped;

  for( int y = 0; y < height; y += 8 ) {
    read( INBUF, id * 32, y, tile );

    // Column k of the source tile becomes row k of the result.
    #pragma unroll
    for( int k = 0; k < 8; k++ ) {
      flipped.row(k) = tile.column(k);
    }

    write( OUTBUF, y * 4, id * 8, flipped );
  }
}

// Horizontal pass of the recursive gaussian filter.
//
// Like gaussianVertical, except each thread processes NUM_ROWS_PER_THREAD (8)
// independent rows at once, selected by get_thread_origin_x(). The rows are
// filtered left-to-right and written to OUTBUF, then filtered right-to-left
// and the second result is added onto what the first pass stored in OUTBUF.
//
// Horizontal surface offsets are in bytes: pixel column i starts at
// i * NUM_COMPONENTS.
//
// INBUF:  source surface, read as uchar data.
// OUTBUF: destination surface; written by the first pass, then read back and
//         rewritten by the second pass.
// width:  number of pixel columns; walked in steps of NUM_COLS_PER_ITER (8).
// height: not referenced by this kernel.
// a0, a1, b1, b2: coefficients used by the left-to-right pass.
// a2, a3, b1, b2: coefficients used by the right-to-left pass.
// coefp, coefn: scale factors used to seed the output history of the first
//         and second pass when CLAMP_TO_EDGE is enabled.
extern "C" _GENX_MAIN_ void
gaussianHorizontal( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn )
{

  // Raw tiles of 8 rows by 8 pixels as read from / written to the surfaces.
  matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> image;
  matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> outImage;
  // Current pixel column (one pixel per row) and the filter history of the
  // left-to-right pass, scaled by 1/255.
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> in;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> out;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inMinusOne;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusOne;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusTwo;

  uint id = get_thread_origin_x();

#if CLAMP_TO_EDGE
  // Seed the history from the first pixel column, as if it were repeated to
  // the left of the image.
  matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> firstColumn;
  read( MODIFIED(INBUF), 0, id * NUM_ROWS_PER_THREAD, firstColumn );
  inMinusOne = firstColumn;
  inMinusOne *= 1/255.0f;
  outMinusTwo = coefp * inMinusOne;
  outMinusOne = outMinusTwo;
#else
  // Without clamping, the history starts at zero.
  inMinusOne = 0;
  outMinusOne = 0;
  outMinusTwo = 0;
#endif

  // Left-to-right pass: read NUM_COLS_PER_ITER (8) pixel columns at a time.
  for( int i = 0; i < width; i += NUM_COLS_PER_ITER ) {
    read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );

    #pragma unroll
    for( unsigned j = 0; j < NUM_COLS_PER_ITER; j++ ) {

      in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS);
      in *= 1/255.0f;
      //out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;
      out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);
      // Shift the history; the unclamped value of out is what gets carried
      // into the next column.
      inMinusOne = in;
      outMinusTwo = outMinusOne;
      outMinusOne = out;

      //clamp the value to [0,1]
      out = cm_add<float>( out, 0.0f, SAT ) * 255.0f;
      outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS) = out;
    }

    //write back to surface
    write( OUTBUF, i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );
  }

  // Filter history of the right-to-left pass.
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusOne;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusTwo;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusOne;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusTwo;
  matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> temp;

#if CLAMP_TO_EDGE
  // Seed the history from the last pixel column. Its byte offset is
  // (width - 1) * NUM_COMPONENTS, matching the pixel-to-byte scaling used by
  // every other read in this kernel.
  matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> lastColumn;
  read( MODIFIED(INBUF), (width - 1) * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, lastColumn );
  inPlusOne = lastColumn;
  inPlusOne *= 1/255.0f;
  inPlusTwo = inPlusOne;
  outPlusOne = coefn * inPlusOne;
  outPlusTwo = outPlusOne;
#else
  inPlusOne = 0;
  inPlusTwo = 0;
  outPlusOne = 0;
  outPlusTwo = 0;
#endif

  // Right-to-left pass: 8 pixel columns at a time, in reverse direction.
  for( int i = width - NUM_COLS_PER_ITER; i >= 0; i -= NUM_COLS_PER_ITER ) {
    read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );
    // OUTBUF holds the first-pass result written by the loop above.
    read( MODIFIED(OUTBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );

    #pragma unroll
    for( int j = NUM_COLS_PER_ITER - 1; j >= 0; j-- ) {
      in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS);
      in *= 1/255.0f;
      // This pass only uses columns to the right of the current one; the
      // current column is pushed into the history afterwards.
      //temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;
      temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);
      inPlusTwo = inPlusOne;
      inPlusOne = in;
      outPlusTwo = outPlusOne;
      outPlusOne = temp;

      //The mul * 1 forces out to not be coalesced with outImage, so we can use SIMD16
      //operations instead of SIMD4
      out = outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS) * 1.0f;
      //out = outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS);

      // Add this pass's result to the stored first-pass result, with
      // saturation.
      out = cm_add<float>( out * (1/255.0f), temp, SAT );
      outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS) = out * 255.0f;
    }

    //write back to surface
    write( OUTBUF, i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );
  }
}

第三节 最重要的编译

在第一节中,有一行代码是核心,

std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");

那么这个gauss_genx.isa 是哪来的呢?这里的ISA指的是指令集架构(Instruction Set Architecture),.isa文件就是CM编译器把kernel编译之后生成的虚拟ISA二进制,注意不要和"ISA总线"这个名词混淆。ISA本身的细节本文不写,主要是告诉您,怎么由gauss_genx.cpp变为gauss_genx.isa。

首先,你得找到cm的编译器。

(1)从https://github.com/intel/cm-compiler这个地方clone代码

(2)安装:VS2015(或以上版本),安装python2.7,安装cygwin,安装cmake,安装unzip,安装curl

(3)打开cygwin,进入cm-compiler里面执行下面的代码:

cmake path/to/llvm/source/root

cmake --build .

(4)执行support/scripts/build.bash -s vs2015 -d -m --64

(5)找到编译生成的cmc.exe,应该是在build.64.vs2015文件夹里面。

(6)在cmd中执行

cmc.exe -isystem path\to\cm-compiler\support\include path\to\gauss_genx.cpp -march=SKL

(7)可以看到,在cmc.exe所在文件夹生成了gauss_genx.isa文件

(8)我们打开isa文件看一下,实际上是一些16进制的数据。再底层是怎么加载实现的,本文就不深入探讨了。

 

好了,写到这里,基本上把CM有关的一些基本知识都介绍了。跟CUDA很类似,CM同样可以集成到深度学习框架当中,作为框架的kernel部分。

 

下一个更新,则主要集中在深度学习的应用上。那么为什么之前没有写CUDA呢。按照我的思路,先了解Intel的东西,然后在应用层面上,以大家用的最多的为主。

所以,后面将会带你从零开始,在笔记本上搭建一个基于深度学习的目标检测模型。此模型能够实现将训练好的模型,用C++进行集成,并作为最终的开发文件一起发布。

拭目以待吧。

 

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值