PatchMatch Stereo (Part 2): Gipuma

Preface

  • Paper: "Massively Parallel Multiview Stereopsis by Surface Normal Diffusion"

Gipuma is the GPU version of the PatchMatch Stereo algorithm, and its source code is available; this post reads through the algorithm from the code's point of view.

Algorithm & Source Code

Source code: https://github.com/kysucix/gipuma.git

I. The main function

int main(int argc, char **argv)
{
    if ( argc < 3 )
    {
        print_help (argv);
        return 0;
    }

    InputFiles inputFiles;
    OutputFiles outputFiles;
    AlgorithmParameters* algParams = new AlgorithmParameters;
    GTcheckParameters gtParameters;

    int ret = getParametersFromCommandLine ( argc, argv, inputFiles, outputFiles, *algParams, gtParameters);
    if ( ret != 0 )
        return ret;

    selectCudaDevice();

    Results results;
    ret = runGipuma ( inputFiles, outputFiles, *algParams, gtParameters, results);

    return 0;
}

InputFiles, OutputFiles, AlgorithmParameters, GTcheckParameters, and Results are structs holding the input files, output files, algorithm parameters, ground-truth checking settings, and results respectively; they are straightforward, so I skip them.
A few points are worth noting.

1. AlgorithmParameters

AlgorithmParameters inherits from a class called Managed:

struct AlgorithmParameters : public Managed

where Managed is defined as:

class Managed {
public:
  void *operator new(size_t len) {
    void *ptr;
    checkCudaErrors(cudaMallocManaged(&ptr, len));
    return ptr;
  }

  void operator delete(void *ptr) {
      cudaFree(ptr);
  }
};

Inheriting from Managed means that every new allocates CUDA managed (unified) memory through cudaMallocManaged, which is addressable from both the host and the device.

AlgorithmParameters* algParams = new AlgorithmParameters;

As a result, the data behind algParams lives in unified memory: the host fills it in, and kernels can read it directly without explicit cudaMemcpy calls.
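To make the convenience concrete, here is a minimal sketch (mine, not from the Gipuma source; it assumes the Managed class and checkCudaErrors shown above are in scope) of one object being written on the host and read in a kernel with no explicit copies:

#include <cstdio>

// Params inherits Managed's overloaded new/delete, so one allocation
// is visible to both host and device.
struct Params : public Managed {
    float alpha;
};

__global__ void readParams(const Params *p) {
    printf("alpha on device: %f\n", p->alpha); // reads the same unified memory
}

int main() {
    Params *p = new Params;      // cudaMallocManaged under the hood
    p->alpha = 0.5f;             // written on the host, no cudaMemcpy
    readParams<<<1, 1>>>(p);     // read on the device
    cudaDeviceSynchronize();     // finish before the host frees p
    delete p;                    // cudaFree under the hood
    return 0;
}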

2. getParametersFromCommandLine

The command-line parsing code; it is rough, so I skip it.

3. selectCudaDevice

The CUDA device query code; worth keeping around for your own projects.

static void selectCudaDevice ()
{
    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "There is no cuda capable device!\n");
        exit(EXIT_FAILURE);
    } 
    cout << "Detected " << deviceCount << " devices!" << endl;
    std::vector<int> usableDevices;
    std::vector<std::string> usableDeviceNames;
    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 3 && prop.minor >= 0) { // note: minor >= 0 is always true, so this effectively requires major >= 3
                usableDevices.push_back(i);
                usableDeviceNames.push_back(string(prop.name));
            } else {
                cout << "CUDA capable device " << string(prop.name)
                     << " is only compute cabability " << prop.major << '.'
                     << prop.minor << endl;
            }
        } else {
            cout << "Could not check device properties for one of the cuda "
                    "devices!" << endl;
        }
    }
    if(usableDevices.empty()) {
        fprintf(stderr, "There is no cuda device supporting gipuma!\n");
        exit(EXIT_FAILURE);
    }
    cout << "Detected gipuma compatible device: " << usableDeviceNames[0] << endl;;
    checkCudaErrors(cudaSetDevice(usableDevices[0]));
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 1024*128);
}

II. The runGipuma function

It is long, so I will just jot down a few small takeaways.

1. time
time_t timeObj;
time ( &timeObj );
tm *pTime = localtime ( &timeObj );

The timestamp is used to name the output folder:

sprintf ( outputFolder, "%s/%04d%02d%02d_%02d%02d%02d_%s", outputFiles.parentFolder, pTime->tm_year + 1900, pTime->tm_mon + 1, pTime->tm_mday, pTime->tm_hour, pTime->tm_min, pTime->tm_sec, ref_name.c_str () );
2. mkdir

On Windows and on Linux:

//create the folder; on Windows the function is _mkdir
#if defined(_WIN32)
    _mkdir ( outputFolder );
#else
    mkdir ( outputFolder, 0777 );
#endif
3. GPU memory
//query free and total GPU memory
size_t avail;
size_t used;
size_t total;
cudaMemGetInfo( &avail, &total );
used = total - avail;
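To print the result in a friendly unit (a trivial addition of mine, not in the original):

printf("GPU memory: %.0f MiB used / %.0f MiB total\n",
       used / (1024.0 * 1024.0), total / (1024.0 * 1024.0));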
4. Timing
int64_t t = getTickCount ();
...
fun();
...
t = getTickCount () - t;
double rt = ( double ) t / getTickFrequency (); // in seconds
5. Adding an alpha channel and converting to float
if(algParams.color_processing) {
   vector<Mat_<float> > rgbChannels ( 3 );
   img_color_float_alpha[i] = Mat::zeros ( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC4 );
   img_color[i].convertTo (img_color_float[i], CV_32FC3); // or CV_32F works (too)
   Mat alpha( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC1 ); // note: allocated but never initialized here
   split (img_color_float[i], rgbChannels);
   rgbChannels.push_back( alpha);
   merge (rgbChannels, img_color_float_alpha[i]);
}
6. getCameraParameters
  • The readKRtFileMiddlebury function
Vec3f vt;
...
Mat t(vt, false); // shallow copy: t shares vt's data
/*Mat t(vt);*/

hconcat(R, t, Rt); // horizontally concatenate R and t
cameras[i].P = K * Rt;

P is the projection matrix.
Note the use of the hconcat(R, t, Rt) function.

  • Using Range in OpenCV
Mat img = Mat::eye(5, 5, CV_32F);
cout << img << endl;
cout << img(Range(0, 3), Range(0, 1)) << endl;

This prints:

[1, 0, 0, 0, 0;
 0, 1, 0, 0, 0;
 0, 0, 1, 0, 0;
 0, 0, 0, 1, 0;
 0, 0, 0, 0, 1]
[1;
 0;
 0]

There are also dedicated functions for row and column ranges, e.g.

P.colRange ( 0,3 );
  • decomposeProjectionMatrix
    Decomposes a projection matrix P into K, R, T.
Mat_<float> K;
Mat_<float> R;
Mat_<float> T;
decomposeProjectionMatrix ( P, K, R, T);

For example, to build P from known K, R, t and then recover the camera center C and the translation t:

Mat Rt_leftIR, P;
hconcat(R_leftIR, T_leftIR, Rt_leftIR);
cout << "Rt_leftIR: " << Rt_leftIR << endl;
cout << "cameraMatrix_leftIR: " << cameraMatrix_leftIR << endl;

P = cameraMatrix_leftIR * Rt_leftIR;

Mat_<double> K, R, T, C, t;
decomposeProjectionMatrix(P, K, R, T);
C = T(Range(0, 3), Range(0, 1)) / T(3, 0);
t = -R * C;
cout << "P: " << P << endl;
cout << "K: " << K << endl;
cout << "R: " << R << endl;
cout << "T: " << T << endl;
cout << "C: " << C << endl;
cout << "t: " << t << endl;

Output:

Rt_leftIR: [-0.034142, 0.998565, -0.041267, -75.201183;
 0.990604, 0.028341, -0.133792, -167.158638;
 -0.13243, -0.045448, -0.99015, 408.359915]
cameraMatrix_leftIR: [580.49762, 0, 318.49469;
 0, 580.49762, 264.3284;
 0, 0, 1]
P: [-61.99760153874, 565.18965924418, -339.31291258804, 86406.35678366688;
 540.0382543504801, 4.438685925220001, -339.39070283504, 10905.93143464444;
 -0.13243, -0.045448, -0.99015, 408.359915]
K: [580.4978233706864, 0.0001877105137459978, 318.4942436276482;
 0, 580.4975608939687, 264.3286762000524;
 0, 0, 1.000000124051992]
R: [-0.03414241920693253, 0.9985646029595258, -0.04126777104451897;
 0.9906041563923866, 0.02834102194255837, -0.1337915984409323;
 -0.1324299835717967, -0.04544799436208576, -0.9901498771699351]
T: [0.4850183888674806;
 0.2198098541623313;
 0.84642530642572;
 0.002234080494034573]
C: [217.0997822874214;
 98.38940662579803;
 378.8696551828993]
t: [-75.20076080031868;
 -167.158826250214;
 408.3598643421455]

Note the distinction between T, C, and t.
T is the camera center in homogeneous (4-vector) world coordinates; dividing by the last component gives the 3D center C. t is the translation part of the world-to-camera transform, i.e. the vector in

$$P = K\,[R \mid t] = K\,R\,[I \mid -\tilde{C}], \qquad t = -R\,\tilde{C}$$

(see the "Camera Models" chapter of Multiple View Geometry in Computer Vision for the derivation).

  • Pseudo-inverse
    When inverting, an SVD-based pseudo-inverse can be requested, e.g.
P.inv ( DECOMP_SVD );
  • A 4×4 [R | t] matrix
Mat_<float> getTransformationMatrix ( Mat_<float> R, Mat_<float> t ) {
    Mat_<float> transMat = Mat::eye ( 4,4, CV_32F );
    //Mat_<float> Rt = - R * t;
    R.copyTo ( transMat ( Range ( 0,3 ),Range ( 0,3 ) ) );
    t.copyTo ( transMat ( Range ( 0,3 ),Range ( 3,4 ) ) );

    return transMat;
}
7. selectViews
  • NaN handling
    One spot in the getAngle function deserves attention:
static float getAngle ( Vec3f v1, Vec3f v2 ) {
    float angle = acosf ( v1.dot ( v2 ) );
    //if angle is not a number the dot product was 1 and thus the two vectors should be identical --> return 0
    if ( angle != angle )
        return 0.0f;
    //if ( acosf ( v1.dot ( v2 ) ) != acosf ( v1.dot ( v2 ) ) )
        //cout << acosf ( v1.dot ( v2 ) ) << " / " << v1.dot ( v2 )<< " / " << v1<< " / " << v2 << endl;
    return angle;
}

The point of if ( angle != angle ) is this: if v1 and v2 are not normalized, the argument passed to acosf can fall outside [-1, 1], in which case acosf returns NaN instead of a valid angle. NaN compares unequal to everything, including itself, so angle != angle is true exactly when angle is NaN, and the function then returns 0.
Note that this self-comparison is a well-defined way to test a float for NaN (std::isnan does effectively the same thing); it is one of the few situations where comparing floats with == or != is exactly the right tool.
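A tiny self-contained check of this behavior (my example, not from the repo):

#include <cassert>
#include <cmath>

int main() {
    float angle = acosf(1.5f);  // argument outside [-1, 1] -> NaN
    assert(angle != angle);     // only NaN compares unequal to itself
    assert(std::isnan(angle));  // the standard-library way to say the same
    return 0;
}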

  • getViewVector
    Computes the viewing ray of a pixel: the unit vector from the camera center toward the 3D point that (x, y) back-projects to.
static inline Vec3f getViewVector ( Camera &cam, int x, int y) {

    //get some point on the line (the other point on the line is the camera center)
    Vec3f ptX = get3Dpoint ( cam,x,y,1.0f );

    //get vector between camera center and other point on the line
    Vec3f v = ptX - cam.C;
    return normalize ( v );
}

where

inline Vec3f get3Dpoint ( Camera &cam, float x, float y, float depth ) {
    // in case camera matrix is not normalized: see page 162, 
    //then depth might not be the real depth but w and depth needs to be computed from that first

    Mat_<float> pt = Mat::ones ( 3,1,CV_32F );
    pt ( 0,0 ) = x;
    pt ( 1,0 ) = y;

    //formula taken from page 162 (alternative expression)
    Mat_<float> ptX = cam.M_inv * ( depth*pt - cam.P.col ( 3 ) );
    return Vec3f ( ptX ( 0 ),ptX ( 1 ),ptX ( 2 ) );
}

This follows the formula on page 162 of Multiple View Geometry in Computer Vision (the "alternative expression" mentioned in the comment): writing $P = [M \mid p_4]$, a pixel $\tilde{x}$ with depth value $\mu$ back-projects to

$$\tilde{X} = M^{-1}(\mu\,\tilde{x} - p_4),$$

which is exactly what get3Dpoint computes (figures from the book omitted).

8. Reading directory entries
static void get_directory_entries(
                           const char *dirname,
                           vector<string> &directory_entries)
{
    DIR *dir;
    struct dirent *ent;

    // Open directory stream
    dir = opendir (dirname);
    if (dir != NULL) {
        //cout << "Dirname is " << dirname << endl;
        //cout << "Dirname type is " << ent->d_type << endl;
        //cout << "Dirname type DT_DIR " << DT_DIR << endl;

        // Print all files and directories within the directory
        while ((ent = readdir (dir)) != NULL) {
            //cout << "INSIDE" << endl;
            //if(ent->d_type == DT_DIR)
            {
                char* name = ent->d_name;
                if(strcmp(name,".") == 0 || strcmp(ent->d_name,"..") == 0)
                    continue;
                //printf ("dir %s/\n", name);
                directory_entries.push_back(string(name));
            }
        }

        closedir (dir);

    } else {
        // Could not open directory
        printf ("Cannot open directory %s\n", dirname);
        exit (EXIT_FAILURE);
    }
    sort ( directory_entries.begin (), directory_entries.end () );
}
9. CUDA texture objects

Texture memory lets the hardware do interpolation by itself.
Here is an example (width, height, angle, h_data, and size are assumed to be defined elsewhere):

// simple texture transform (rotation) kernel
__global__ void transformKernel(float* output,
        cudaTextureObject_t texObj,
        int width, int height,
        float theta)
{
    // compute normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= (unsigned)width || y >= (unsigned)height) // guard: the grid may overshoot the image
        return;
    float u = x / (float)width;
    float v = y / (float)height;

    // rotate the coordinates around the image center
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // read from the texture and write to global memory
    output[y * width + x] = tex2D<float>(texObj, tu, tv);
}

int main() {
    // allocate a CUDA array
    cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc(32, 0, 0, 0,
    cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, width, height);

    // copy host data into the CUDA array
    cudaMemcpyToArray(cuArray, 0, 0, h_data, size,
    cudaMemcpyHostToDevice);

    // resource descriptor
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // texture object parameters
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = 1;

    // create the texture object
    cudaTextureObject_t texObj = 0;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

    // allocate memory for the result
    float* output;
    cudaMalloc(&output, width * height * sizeof(float));

    // launch the kernel
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);

    transformKernel<<<dimGrid, dimBlock>>> (output, texObj, width, height, angle);

    // destroy the texture object
    cudaDestroyTextureObject(texObj);

    // free memory
    cudaFreeArray(cuArray);
    cudaFree(output);
}
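One caveat about the copy above: cudaMemcpyToArray has been deprecated since CUDA 10.1. The modern replacement is cudaMemcpy2DToArray; for this example the call would become (my adaptation):

// a source pitch, a region width (both in bytes) and a height (in rows)
// replace the single flat byte count of cudaMemcpyToArray
cudaMemcpy2DToArray(cuArray, 0, 0, h_data,
                    width * sizeof(float),  // source pitch
                    width * sizeof(float),  // width of the copied region
                    height,                 // number of rows
                    cudaMemcpyHostToDevice);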

Going back to the Gipuma code, the addImageToTextureFloatGray function does exactly this: it stores the images in texture memory.

III. Initialization

The initialization entry point:

dim3 grid_size_initrand;
//round up so the grid covers the image in 16x16 tiles; grid_size is the number of thread blocks, block_size the threads per block
grid_size_initrand.x=(cols+16-1)/16;
grid_size_initrand.y=(rows+16-1)/16;
dim3 block_size_initrand;
block_size_initrand.x=16;
block_size_initrand.y=16;
...
gipuma_init_cu2<T><<< grid_size_initrand, block_size_initrand>>>(gs);

where

template< typename T >
__global__ void gipuma_init_cu2(GlobalState &gs)
{
    //pixel coordinates of the current thread
    const int2 p = make_int2 ( blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y );
    const int rows = gs.cameras->rows;
    const int cols = gs.cameras->cols;

    if (p.x >= cols)
        return;
    if (p.y >= rows)
        return;

    // Temporary variables
    Camera_cu &camera = gs.cameras->cameras[REFERENCE];

    const int center = p.y*cols+p.x;
    int box_hrad = gs.params->box_hsize / 2;
    int box_vrad = gs.params->box_vsize / 2;

    float disp_now;
    float4 norm_now;

    curandState localState = gs.cs[p.y*cols+p.x];
    curand_init ( clock64(), p.y, p.x, &localState );

    // Compute random normal on half hemisphere of fronto view vector
    float mind = gs.params->min_disparity;
    float maxd = gs.params->max_disparity;
    float4 viewVector;

    //GPU version of getViewVector
    getViewVector_cu ( &viewVector, camera, p);
    //printf("Random number is %f\n", random_number);
    //return;

    //uniform distribution: sample a random disparity within [mind, maxd]
    disp_now = curand_between(&localState, mind, maxd);

    //randomly initialize the normal, following the paper
    rndUnitVectorOnHemisphere_cu ( &norm_now, viewVector, &localState );

    //convert the disparity into a depth value
    disp_now= disparityDepthConversion_cu ( camera.f, camera.baseline, disp_now);

    // Save values
    //compute the plane's distance parameter d from the normal and depth (stored in norm_now.w)
    norm_now.w = getD_cu ( norm_now, p, disp_now,  camera);
    //disp[x] = disp_now;
    gs.lines->norm4[center] = norm_now;

    __shared__ T tile_leftt[1] ;
    const int2 tmp =make_int2(0,0);

    //compute the matching cost at this pixel
    gs.lines->c[center] = pmCostMultiview_cu<T> ( gs.imgs,
                                                 tile_leftt,
                                                 tmp,
                                                 p,
                                                 norm_now,
                                                 box_vrad, box_hrad,
                                                 *(gs.params),
                                                 *(gs.cameras),
                                                 gs.lines->norm4,
                                                 0);
    return;
}
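The helper curand_between is not shown above; a plausible implementation (my assumption, consistent with how the kernel uses it) maps curand_uniform's (0, 1] output into the requested range:

__device__ float curand_between(curandState *cs, float min, float max) {
    return curand_uniform(cs) * (max - min) + min; // uniform in [min, max]
}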

Let's look at each sub-step alongside the papers.

1. rndUnitVectorOnHemisphere_cu

Following the paper "PatchMatch Stereo: Stereo Matching with Slanted Support Windows", every pixel gets a randomly initialized disparity and normal.
Each pixel p is assumed to lie on a slanted plane $f_p$, so that its disparity can be written as

$$d_p = a_{f_p} p_x + b_{f_p} p_y + c_{f_p}$$

Initializing a, b, c directly makes it hard to sample uniformly or to constrain the disparity range. Instead, we first draw a random disparity $z_0$ within the allowed range, then a random unit normal $n = (n_x, n_y, n_z)$, and convert them into the plane coefficients:

$$a_f := -\frac{n_x}{n_z}, \qquad b_f := -\frac{n_y}{n_z}, \qquad c_f := \frac{n_x x_0 + n_y y_0 + n_z z_0}{n_z}$$

where $(x_0, y_0)$ is the pixel position.
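As a concrete illustration, here is a small host-side sketch (mine, not Gipuma code) that turns a sampled disparity and unit normal into plane coefficients exactly as in the formulas above:

struct Plane { float a, b, c; };

// (x0, y0): pixel position; z0: sampled disparity; (nx, ny, nz): unit normal
Plane planeFromDisparityAndNormal(float x0, float y0, float z0,
                                  float nx, float ny, float nz) {
    Plane f;
    f.a = -nx / nz;                           // a_f := -n_x / n_z
    f.b = -ny / nz;                           // b_f := -n_y / n_z
    f.c = (nx * x0 + ny * y0 + nz * z0) / nz; // c_f as above
    return f;
}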

For initialization, the Gipuma paper draws the normal uniformly at random on the unit hemisphere facing the camera (figure from the paper omitted). The corresponding code:

__device__ FORCEINLINE static void rndUnitVectorOnHemisphere_cu ( float4 *v, const float4 &viewVector, curandState *cs ) {
    rndUnitVectorSphereMarsaglia_cu (v, cs);
    vecOnHemisphere_cu ( v,viewVector );
};

rndUnitVectorSphereMarsaglia_cu uses Marsaglia's method to sample a uniform point on the unit sphere: draw (x, y) uniformly in the unit disk, then map it onto the sphere:

__device__ FORCEINLINE static void rndUnitVectorSphereMarsaglia_cu (float4 *v, curandState *cs) {
    float x = 1.0f;
    float y = 1.0f;
    float sum = 2.0f;
    while ( sum>=1.0f ) {
        x = curand_between (cs, -1.0f, 1.0f);
        y = curand_between (cs, -1.0f, 1.0f);
        sum = get_pow2_norm(x,y);
    }
    const float sq = sqrtf ( 1.0f-sum );
    v->x = 2.0f*x*sq;
    v->y = 2.0f*y*sq;
    v->z = 1.0f-2.0f*sum;
    //v->x = 0;
    //v->y = 0;
    //v->z = -1;
}

If the sampled vector points the same way as the viewing ray (positive dot product), the whole normal is flipped so that it faces the camera:

__device__ FORCEINLINE static void vecOnHemisphere_cu ( float4 * __restrict__ v, const float4 &viewVector ) {
    const float dp = dot4 ( (*v), viewVector );
    if ( dp > 0.0f ) {
        negate4(v);
    }
    return;
}
2. Computing the cost of the random initialization
/* cost computation for multiple images
 * combines cost of all ref-to-img correspondences
 */
template< typename T >
__device__ FORCEINLINE static float pmCostMultiview_cu (
                                                        const cudaTextureObject_t *images,
                                                        const T * __restrict__ tile_left,
                                                        const int2 tile_offset,
                                                        const int2 p,
                                                        const float4 &normal,
                                                        const int &vRad,
                                                        const int &hRad,
                                                        const AlgorithmParameters &algParam,
                                                        const CameraParameters_cu &camParams,
                                                        const float4 * __restrict__ state,
                                                        const int point)
{
    // iterate over all other images and compute cost
    //const int numImages = camParams.viewSelectionSubsetNumber; // CACHE
    float costVector[32];
    float cost = 0.0f;
    int numValidViews = 0;

    int cost_count=0;
    for ( int i = 0; i < camParams.viewSelectionSubsetNumber; i++ ) {
        int idxCurr = camParams.viewSelectionSubset[i];
        /*if ( idxCurr != REFERENCE ) */
        {
            float c = 0;
#ifdef SHARED
            if (tile_offset.x!= 0 )
                c = pmCost_shared<T> ( images[REFERENCE],
                                       tile_left,
                                       tile_offset,
                                       images[idxCurr],
                                       p,
                                       normal,
                                       vRad, hRad,
                                       algParam, camParams,
                                       idxCurr );
            else
#endif
                c = pmCost<T> ( images[REFERENCE],
                                tile_left,
                                tile_offset,
                                images[idxCurr],
                                p.x, p.y,
                                normal,
                                vRad, hRad,
                                algParam, camParams,
                                idxCurr );

            // only add to cost vector if viewable
            if ( c < MAXCOST )
                numValidViews++;
            else
                c = MAXCOST; // in order to not get an overflow when accumulating
            costVector[i] = c;
            cost_count++;
        }
    }
    sort_small(costVector,cost_count);

    //for some robustness only consider best n cost values (n dependent on number of images)
    int numBest = numValidViews; //numImages-1;
    if ( algParam.cost_comb == COMB_BEST_N )
        numBest = min ( numBest, algParam.n_best );
    if ( algParam.cost_comb == COMB_GOOD )
        numBest = camParams.viewSelectionSubsetNumber ;

    float costThresh = costVector[0] * algParam.good_factor;
    int numConsidered = 0;
    for ( int i = 0; i < numBest; i++ ) {
        numConsidered++;
        float c = costVector[i];
        if ( algParam.cost_comb == COMB_GOOD ) {
            c = fminf ( c, costThresh );
        }
        cost = cost + c;
    }
    cost = cost / ( ( float ) numConsidered);
    if ( numConsidered < 1 )
        cost = MAXCOST;

    if ( cost != cost || cost > MAXCOST || cost < 0 )
        cost = MAXCOST;

    return cost;
}

This function aggregates the matching cost over multiple views; the per-view cost is computed by pmCost.
The cost of a pixel p under plane f is a weighted sum of per-pixel dissimilarities over a support window $W_p$ centered on p, which is much more robust than a single-pixel comparison; each pixel's weight depends on how similar its color is to the window center:

$$m(p, f) = \sum_{q \in W_p} w(p, q)\,\rho(q, q')$$

with the adaptive weight

$$w(p, q) = e^{-\frac{\lVert I_p - I_q \rVert_1}{\gamma}}$$

and a truncated mix of color and gradient differences against the correspondence $q'$ in the other view:

$$\rho(q, q') = (1 - \alpha)\min(\lVert I_q - I_{q'} \rVert_1, \tau_{col}) + \alpha\,\min(\lVert \nabla I_q - \nabla I_{q'} \rVert_1, \tau_{grad})$$

These correspond to gamma, alpha, tau_color, and tau_gradient in the code.
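For instance, the adaptive weight on its own could look like this (a simplified grayscale sketch of mine; the real weight_cu is templated over the pixel type):

__device__ inline float adaptiveWeight(float I_q, float I_p, float gamma) {
    const float colorDist = fabsf(I_q - I_p); // L1 color distance to the window center
    return expf(-colorDist / gamma);          // w(p,q) = exp(-||I_p - I_q||_1 / gamma)
}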
The pmCost function:

template< typename T >
__device__ FORCEINLINE static float pmCost (
                                            const cudaTextureObject_t &l,
                                            const T * __restrict__ tile_left,
                                            const int2 tile_offset,
                                            const cudaTextureObject_t &r,
                                            const int &x,
                                            const int &y,
                                            const float4 &normal,
                                            const int &vRad,
                                            const int &hRad,
                                            const AlgorithmParameters &algParam,
                                            const CameraParameters_cu &camParams,
                                            const int &camTo )
{
    const int cols = camParams.cols;
    const int rows = camParams.rows;
    const float alpha = algParam.alpha;
    const float tau_color = algParam.tau_color;
    const float tau_gradient = algParam.tau_gradient;
    const float gamma = algParam.gamma;

    float4 pt_c;
    float H[16];

    //compute the homography that maps the patch into the other view
    getHomography_cu ( camParams.cameras[REFERENCE], camParams.cameras[camTo], camParams.cameras[REFERENCE].K_inv, camParams.cameras[camTo].K, normal, normal.w, H );

    //use the homography to find the corresponding point
    getCorrespondingPoint_cu ( make_int2(x, y), H, &pt_c );

    {
        float cost = 0;
        //float weightSum = 0.0f;
        for ( int i = -hRad; i < hRad + 1; i+=WIN_INCREMENT ) {
            for ( int j = -vRad; j < vRad + 1; j+=WIN_INCREMENT ) {
                const int xTemp = x + i;
                const int yTemp = y + j;
                float4 pt_l;
                pt_l.x = __int2float_rn(xTemp);
                pt_l.y = __int2float_rn(yTemp);
                int2 pt_li = make_int2(xTemp, yTemp);

                float w;

                w = weight_cu<T> ( tex2D<T>(l, pt_l.x + 0.5f, pt_l.y + 0.5f), tex2D<T>(l,x + 0.5f,y + 0.5f), gamma);

                float4 pt;
                getCorrespondingPoint_cu ( make_int2(xTemp, yTemp),
                                           H,
                                           &pt );

                cost = cost + pmCostComputation<T> ( l, tile_left, r, pt_l, pt, rows, cols, tau_color, tau_gradient, alpha,  w );
                //weightSum = weightSum + w;
            }
        }
        return cost;
    }
}

The algorithms in getHomography_cu and getCorrespondingPoint_cu come from the Gipuma paper. For a plane with unit normal n at distance d from the reference camera, the plane-induced homography into camera i has the standard form

$$H_i = K_i \left( R_i - \frac{t_i\,n^T}{d} \right) K_{ref}^{-1},$$

and a reference pixel x maps to $x' \sim H_i\,x$ (figure from the paper omitted).

As the paper points out, using the homography directly to find the corresponding point in another view removes the need for rectification; the underlying assumption is simply that the 3D surface covered by a patch is planar.

The functions weight_cu and pmCostComputation implement essentially the cost formulas above; besides the recurring + 0.5f offsets (with unnormalized coordinates, tex2D addresses texel centers at integer + 0.5), one detail is worth noting:

const T gx1 = tex2D<T> (l, pt_l.x+1 + 0.5f, pt_l.y   + 0.5f) - tex2D<T> (l, pt_l.x-1 + 0.5f, pt_l.y   + 0.5f);
const T gy1 = tex2D<T> (l, pt_l.x   + 0.5f, pt_l.y+1 + 0.5f) - tex2D<T> (l, pt_l.x   + 0.5f, pt_l.y-1 + 0.5f);
const T gx2 = tex2D<T> (r, pt_r.x+1 + 0.5f, pt_r.y   + 0.5f) - tex2D<T> (r, pt_r.x-1 + 0.5f, pt_r.y   + 0.5f);
const T gy2 = tex2D<T> (r, pt_r.x   + 0.5f, pt_r.y+1 + 0.5f) - tex2D<T> (r, pt_r.x   + 0.5f, pt_r.y-1 + 0.5f);

//note: the gradient is a central difference spanning two pixels (x-1 and x+1)
const T gradX = (gx1 - gx2);
const T gradY = (gy1 - gy2);

//gradient dissimilarity (L1) in x and y direction (multiplication by 0.5 to use tauGrad from PatchMatch stereo paper)
//why multiply by 0.0625f? (note that 0.0625 = 1/16)
const float gradDis = fminf ( ( l1_norm ( gradX ) + l1_norm ( gradY ) ) * 0.0625f, tau_gradient );

IV. Propagation

Then the big iteration loop begins:

for (int it = 0;it<maxiter; it++) {
        printf("%d ", it+1);
#ifdef SMALLKERNEL
        //spatial propagation of 4 closest neighbors (1px up/down/left/right)
        gipuma_black_spatialPropClose_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
        cudaDeviceSynchronize();
    #ifdef EXTRAPOINTFAR
        //spatial propagation of 4 far away neighbors (5px up/down/left/right)
        gipuma_black_spatialPropFar_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
        cudaDeviceSynchronize();
    #endif
        //plane refinement
        gipuma_black_planeRefine_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
        cudaDeviceSynchronize();

        //spatial propagation of 4 closest neighbors (1px up/down/left/right)
        gipuma_red_spatialPropClose_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
        cudaDeviceSynchronize();
    #ifdef EXTRAPOINTFAR
        //spatial propagation of 4 far away neighbors (5px up/down/left/right)
        gipuma_red_spatialPropFar_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
        cudaDeviceSynchronize();
    #endif
        //plane refinement
        gipuma_red_planeRefine_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
        cudaDeviceSynchronize();
#else
        gipuma_black_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
        gipuma_red_cu<T><<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
#endif
    }

The original sequential algorithm sweeps over the pixels one by one, trying each neighbor's plane and keeping whichever parameters yield a lower cost. The GPU version parallelizes this with a red-black (checkerboard) scheme: all black pixels update from their red neighbors in one pass, then all red pixels update from the black ones (figure from the paper omitted).
The kernels involved are all quite similar, so I only sketch them here and may cover them in detail in a separate post. In short, each pixel considers the planes of its up/down/left/right neighbors and adopts any normal that lowers its own cost:

if (p.y>0) {
        SPATIALPROPAGATION(up);
    }
    if (p.y<rows-1) {
        SPATIALPROPAGATION(down);
    }
    if (p.x>0) {
        SPATIALPROPAGATION(left);
    }
    if (p.x<cols-1) {
        SPATIALPROPAGATION(right);
    }

    // Save to global memory
    c    [center] = cost_now;
    //disp [center] = disp_now;
    norm [center] = norm_now;
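
For intuition, the checkerboard scheduling itself can be expressed as below (a sketch of the idea, not the exact Gipuma kernels):

__global__ void black_pass_sketch(int rows, int cols) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= cols || y >= rows)
        return;
    if (((x + y) & 1) != 0) // keep only "black" pixels; "red" ones run in the next pass
        return;
    // ...try the planes of the 4 neighbors (all "red") and keep the best;
    // since no black pixel reads another black pixel, the in-place update is race-free
}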

The principle of planeRefine: starting from the current estimate, randomly perturb the depth and the normal within an allowed interval, halve the interval each round, and keep a perturbation whenever it lowers the cost (figure from the paper omitted).
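A rough host-side sketch of that loop (mine; depth only, the normal is perturbed analogously):

#include <cstdlib>

// uniform random float in [lo, hi]
static float rand_between(float lo, float hi) {
    return lo + (hi - lo) * (static_cast<float>(std::rand()) / RAND_MAX);
}

// cost_of stands in for the real multiview photoconsistency cost.
template <typename CostFn>
void refineDepth(float &depth, float &cost, float maxDeltaZ, CostFn cost_of) {
    // halve the search interval each round
    for (float dz = maxDeltaZ; dz > 0.01f; dz *= 0.5f) {
        const float candidate = depth + rand_between(-dz, dz);
        const float c = cost_of(candidate);
        if (c < cost) { // keep only improvements
            depth = candidate;
            cost = c;
        }
    }
}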
