Platform: LG G3, Adreno 330 ,img size 3264x2448
C code | neon | GPU |
300 | 60 | 29 |
单位:ms
1. 目前按如下行列分解的方式最快29ms,Horizontal kernel globalWorksize[1] = {height+256-height%256};Vertical kernel globalWorksize2[1] = {width+256-width%256};
localWorksize2[] = {64}; localWorksize2 手动设为64时最快。
Porfile的结果为:Horizontal kernel 的wait time 有11ms,实际rum time 18ms.
这个wait time是什么呢?注释掉Horizontal kernel中的 vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j) ; 则wait time只有0.x ms.并且 localWorksize 越小wait time越长,为1时达到200ms,16时20ms. 难道是写内存等待时间,没有足够的ALU指令隐藏访存延时?写内存后进入下一个for循环,马上又读内存,所以没有ALU指令隐藏这个延时。然而Horizontal kernel的profile结果实际run time只有0.x ms,所有时间基本都是在wait.(更正:注释掉vstore16后,sum的计算被优化掉了,0.x ms是读内存的时间)
__kernel void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image __global uchar* restrict dest, // Intermediate dest image const int imgWidth , // Image width const int imgHeight) { const int y = get_global_id(0); if(y>=(imgHeight)) return; const uchar m_nRightShiftNum = 8; const uchar Rounding = (1 << (m_nRightShiftNum - 1)); const uchar m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1}; const int s = 11; const int nStart = 5; const int nWidth = imgWidth; __global const uchar* pInLine = source + y*nWidth; __global uchar* pOutLine = dest + y*nWidth; int j; for(j = 0; j < nStart; j ++) { ushort sum = 0; for (int m = 0; m<s / 2; m++) { int k1 = (j + m - nStart); k1 = k1<0 ? -k1 : k1; int k2 = (j + nStart - m ); sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m]; } sum += pInLine[j] * m_nFilter[s / 2]; sum = (sum + Rounding) >> 8; pOutLine[j] = (uchar)clamp(sum,(ushort)0,(ushort)255); } for ( ; (j+16)<= (nWidth - nStart); j+=16) { #define GAUSSIAN_LINE_NEON(m) \ sum += ( convert_ushort16(vload16(0,pInLine+j-nStart+m))* m_nFilter[m] ); ushort16 sum = (convert_ushort16(vload16(0,pInLine+j-nStart)) * m_nFilter[0]); GAUSSIAN_LINE_NEON(1); GAUSSIAN_LINE_NEON(2); GAUSSIAN_LINE_NEON(3); GAUSSIAN_LINE_NEON(4); GAUSSIAN_LINE_NEON(5); GAUSSIAN_LINE_NEON(6); GAUSSIAN_LINE_NEON(7); GAUSSIAN_LINE_NEON(8); GAUSSIAN_LINE_NEON(9); GAUSSIAN_LINE_NEON(10); sum += (ushort)Rounding; vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j) ; } for( ; j < nWidth; j ++) { ushort sum = 0; for (int m = 0; m<s / 2; m++) { int k1 = (j + m - nStart); int k2 = (j + nStart - m ); k2 = k2 >= nWidth ? 2 * nWidth - 2 - k2 : k2; sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m]; } sum += pInLine[j] * m_nFilter[s / 2]; sum = (sum + Rounding) >> m_nRightShiftNum; pOutLine[j] = (uchar)clamp(sum,(ushort)0,(ushort)255); } } __kernel void ImageGaussianFilterVertical( __global uchar* restrict source, // Intermediate image processed by ImageGaussianFilterHorizontal() __global uchar* restrict dest, // Final destination image const int imgWidth, const int imgHeight ) { const int x = get_global_id(0); if(x>=(imgWidth)) return; const int x_offset = x; const int s = 11; const int nStart = s / 2; const int m_nRightShiftNum = 8; const int Rounding = (1 << (m_nRightShiftNum - 1)); const uchar m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1}; int y; // mem_fence(CLK_LOCAL_MEM_FENCE); ushort lines[11]; lines[nStart] = (ushort)( source[x_offset] ); for(y=1;y<=nStart;y++) { lines[nStart+y] = (ushort)( source[y*imgWidth+x_offset] ); lines[nStart-y] = lines[nStart+y]; } for(y=0;y<(imgHeight-nStart-1);) { ushort sum = lines[nStart] * m_nFilter[nStart]; #define GaussianTwoLines(m) \ sum += ( (lines[m] + lines[s-1-m])*m_nFilter[m] ); GaussianTwoLines(0) GaussianTwoLines(1) GaussianTwoLines(2) GaussianTwoLines(3) GaussianTwoLines(4) sum += (ushort)Rounding; dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort)8); y++; for(int i = 0; i<s-1; i++) lines[i] = lines[i+1]; lines[s-1] = (ushort)( source[(y+nStart)*imgWidth+x_offset] ); } for(y=imgHeight-nStart-1;y<(imgHeight-1);) { ushort sum = lines[nStart] * m_nFilter[nStart]; GaussianTwoLines(0) GaussianTwoLines(1) GaussianTwoLines(2) GaussianTwoLines(3) GaussianTwoLines(4) sum += (ushort)Rounding; dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort)8); y++; for(int i = 0; i<s-1; i++) { lines[i] = lines[i+1]; } lines[s-1] = lines[(imgHeight-y)*2-2] ; // } //last y=imgHeight-1 ushort sum = lines[nStart] * m_nFilter[nStart]; GaussianTwoLines(0) GaussianTwoLines(1) GaussianTwoLines(2) GaussianTwoLines(3) GaussianTwoLines(4) sum += (ushort)Rounding; dest[y*imgWidth+x_offset] = (uchar)(sum>>(ushort)8); }
2.Horizontal kernel改进,预先load 2x16个所需的pixel,计算时从中提取,这样每次循环只需读一次内存。需要26ms,wait time 8ms.
ushort16 line0 = convert_ushort16(vload16(0,pInLine+j-nStart)); for ( ; (j+16)<= (nWidth - nStart); j+=16) { ushort16 line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16)); ushort16 temp0; ushort16 temp1; temp0 = line0; temp1.s0123 = line0.sabcd; temp1.s45 = line0.sef; temp1.s67 = line1.s01; temp1.s89abcdef = line1.s23456789; ushort16 sum = ( temp0 + temp1 ) * m_nFilter[0]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s0; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s9; sum += ( temp0 + temp1 ) * m_nFilter[1]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s1; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s8; sum += ( temp0 + temp1 ) * m_nFilter[2]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s2; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s7; sum += ( temp0 + temp1 ) * m_nFilter[3]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s3; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s6; sum += ( temp0 + temp1 ) * m_nFilter[4]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s4; sum += ( temp0 ) * m_nFilter[5]; sum += (ushort)Rounding; line0 = line1; vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j) ; }
3.不计算,只读写内存测试。那么wait time 3.2 ms,run time 18.2 ms.说明Horizontal kernel 耗时的极限也需3.2ms. 但是只是注释掉vstore16,还保留了读和计算,反而wait time还只有0.x ms,这又是为何?是读几乎没有wait,3.2ms都是写的wait time? (更正:注释掉vstore16后,sum的计算被优化掉了,0.x ms是读内存的时间)
a.再次测试,只有读wait time 0.xms ,只有写wait time 3.2ms.写比读的周期长.
for ( ; (j+16)<= (nWidth - nStart); j+=16)
{
ushort16 line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16));
vstore16(0,0,pOutLine+j) ;
}
b.另外发现使用*((__global uint4*)(pOutLine+j)) = as_uint4(result);比vstore16快,wait time 2.5ms.高通 80-N8592-1_L_OpenCL_Programming_Guide 中提到:
Vectorized load/store of a larger data type is more optimal than a small data type; e.g., a load of uint2* is more optimal than uchar8* .
For optimal SP to L2 bandwidth performance, align read access to a 32-bit address and write access to a 128-bit address.
c.原来写的内存没有对齐,使用*((__global uint4*)(pOutLine+j-5)) = as_uint4(result);wait time 1.9ms.
d.最后加上sum计算,采用的Horizontal kernel如下,localWorksize[] = {64};时时间最少,需要23ms,wait time 4.7ms , localWorksize = 128时,wait 6ms.
并且使用__attribute__((work_group_size_hint(64,1,1))) ,耗时22ms.
__kernel __attribute__((work_group_size_hint(64,1,1))) void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image __global uchar* restrict dest, // Intermediate dest image const int imgWidth , // Image width const int imgHeight) { const int y = get_global_id(0); if(y>=(imgHeight)) return; const uchar m_nRightShiftNum = 8; const uchar Rounding = (1 << (m_nRightShiftNum - 1)); const uchar m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1}; const int s = 11; const int nStart = 5; const int nWidth = imgWidth; __global const uchar* pInLine = source + y*nWidth; __global uchar* pOutLine = dest + y*nWidth; int j; uchar temp[5]; for(j = 0; j < nStart; j ++) { ushort sum = 0; for (int m = 0; m<s / 2; m++) { int k1 = (j + m - nStart); k1 = k1<0 ? -k1 : k1; int k2 = (j + nStart - m ); sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m]; } sum += pInLine[j] * m_nFilter[s / 2]; sum = (sum + Rounding) >> 8; temp[j] = (uchar)clamp(sum,(ushort)0,(ushort)255); } uchar16 result,pre_result; pre_result.sbcde = (uchar4)(temp[0],temp[1],temp[2],temp[3]); pre_result.sf = temp[4]; ushort16 line0 = convert_ushort16(vload16(0,pInLine+j-nStart)); for ( ; (j+16)<= (nWidth - nStart); j+=16) { //prefetch(pInLine+j-nStart,32); //无变化 ushort16 line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16)); ushort16 temp0; ushort16 temp1; temp0 = line0; temp1.s0123 = line0.sabcd; temp1.s45 = line0.sef; temp1.s67 = line1.s01; temp1.s89abcdef = line1.s23456789; ushort16 sum = ( temp0 + temp1 ) * m_nFilter[0]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s0; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s9; sum += ( temp0 + temp1 ) * m_nFilter[1]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s1; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s8; sum += ( temp0 + temp1 ) * m_nFilter[2]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s2; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s7; sum += ( temp0 + temp1 ) * m_nFilter[3]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s3; temp1.s0123456789abcdef = temp1.s00123456789abcde; temp1.s0 = line0.s6; sum += ( temp0 + temp1 ) * m_nFilter[4]; temp0.s0123456789abcdef = temp0.s123456789abcdeff; temp0.sf = line1.s4; sum += ( temp0 ) * m_nFilter[5]; sum += (ushort)Rounding; line0 = line1; result.s0123 = pre_result.sbcde; result.s4 = pre_result.sf; pre_result = convert_uchar16(sum>>(ushort)8) ; result.s5 = pre_result.s0; result.s67 = pre_result.s12; result.s89abcdef = pre_result.s3456789a; *( (__global uint4*)(pOutLine+j-5) ) = (as_uint4)(result) ; } *( (__global uint*)(pOutLine+j-5) ) = (as_uint)(pre_result.sbcde);//last 5 bytes pOutLine[j-1] = pre_result.sf; for( ; j < nWidth; j ++) { ushort sum = 0; for (int m = 0; m<s / 2; m++) { int k1 = (j + m - nStart); int k2 = (j + nStart - m ); k2 = k2 >= nWidth ? 2 * nWidth - 2 - k2 : k2; sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m]; } sum += pInLine[j] * m_nFilter[s / 2]; sum = (sum + Rounding) >> m_nRightShiftNum; pOutLine[j] = (uchar)clamp(sum,(ushort)0,(ushort)255); } }