C++性能优化系列——矩阵转置(七)Intrinsic 内存预取与OpenMP多线程并行化

6 篇文章 4 订阅
4 篇文章 1 订阅

上一篇C++性能优化系列——矩阵转置(六)Intrinsic转置实现与Core Bound优化中通过Intrinsic 实现了矩阵转置功能,同时针对寄存器使用导致的spilling问题,对每次处理的数据块尺寸进行调整。本篇将基于之前的优化版本,在内存访问方面与多线程并行化方面做进一步优化。

内存访问优化

上一篇中主要针对寄存器使用问题优化代码,未针对内存访问问题 L3 Latency做处理。考虑Advance2版本中对内存访问的模式,写内存操作是连续执行的,读内存操作是交叉执行的。因此加载内存方式导致该问题的可能性最大。同时转置功能逻辑导致加载内存必须要交叉执行,因此要从别的方面考虑优化这个问题。
SSE指令集中提供内存预取指令。考虑每次循环中加载下一次要用到的数据。
代码实现:

// Cache-blocked byte-matrix transpose using SSE2 intrinsics, with software
// prefetching of the next 8-byte column group of each row.
//
// Copies the transpose of the matrix at pSource into pTarget, REPEAT times in
// a row, and prints the total and per-iteration time to stdout.
// Assumptions carried over from earlier versions in this series (constants
// NROW, NCOL, NREALCOL, REPEAT are defined elsewhere in the file):
//   - pSource is NROW x NCOL bytes with a padded row stride of NREALCOL bytes
//     -- TODO confirm against the allocation site.
//   - pTarget is NCOL x NROW bytes with a row stride of NROW bytes.
//   - NROW and NCOL are multiples of the 128-byte tile size, so no edge
//     handling is needed -- TODO confirm.
void SimdTransposeBlock8RowPadPrefetch(unsigned char* pSource, unsigned char* pTarget)
	{
		clock_t begin = clock();
		int iblock = 128;            // cache-blocking tile edge length, in bytes
		int iwsize = NCOL / iblock;  // number of tiles per source row
		int ihsize = NROW / iblock;  // number of tiles per source column
		for (int i = 0; i < REPEAT; ++i)
		{
			for (int ibcol = 0; ibcol < iwsize; ++ibcol)
			{
				for (int ibrow = 0; ibrow < ihsize; ++ibrow)
				{
					// Within one 128x128 tile, transpose 16-row x 8-column
					// sub-blocks: 16 8-byte loads are shuffled into 8 16-byte
					// stores via a log2 unpack network.
					for (int icol = 0; icol < iblock; icol += 8)
					{
						for (int irow = 0; irow < iblock; irow += 16)
						{
							// Load 8 bytes from each of 16 consecutive source
							// rows; register X0_7 holds bytes 0..7 of row X.
							__m128i A0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 0) * NREALCOL + ibcol * iblock + icol));
							__m128i B0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 1) * NREALCOL + ibcol * iblock + icol));
							__m128i C0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 2) * NREALCOL + ibcol * iblock + icol));
							__m128i D0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 3) * NREALCOL + ibcol * iblock + icol));
							__m128i E0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 4) * NREALCOL + ibcol * iblock + icol));
							__m128i F0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 5) * NREALCOL + ibcol * iblock + icol));
							__m128i G0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 6) * NREALCOL + ibcol * iblock + icol));
							__m128i H0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 7) * NREALCOL + ibcol * iblock + icol));
							__m128i I0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 8) * NREALCOL + ibcol * iblock + icol));
							__m128i J0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 9) * NREALCOL + ibcol * iblock + icol));
							__m128i K0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 10) * NREALCOL + ibcol * iblock + icol));
							__m128i L0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 11) * NREALCOL + ibcol * iblock + icol));
							__m128i M0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 12) * NREALCOL + ibcol * iblock + icol));
							__m128i N0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 13) * NREALCOL + ibcol * iblock + icol));
							__m128i O0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 14) * NREALCOL + ibcol * iblock + icol));
							__m128i P0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 15) * NREALCOL + ibcol * iblock + icol));


							// Stage 1 (unpack2): interleave adjacent row pairs
							// at byte granularity, e.g. AB0_7 = A0 B0 A1 B1 ...
							__m128i AB0_7 = _mm_unpacklo_epi8(A0_7, B0_7);
							__m128i CD0_7 = _mm_unpacklo_epi8(C0_7, D0_7);
							__m128i EF0_7 = _mm_unpacklo_epi8(E0_7, F0_7);
							__m128i GH0_7 = _mm_unpacklo_epi8(G0_7, H0_7);
							__m128i IJ0_7 = _mm_unpacklo_epi8(I0_7, J0_7);
							__m128i KL0_7 = _mm_unpacklo_epi8(K0_7, L0_7);
							__m128i MN0_7 = _mm_unpacklo_epi8(M0_7, N0_7);
							__m128i OP0_7 = _mm_unpacklo_epi8(O0_7, P0_7);


							// Stage 2 (unpack4): interleave 16-bit pairs so each
							// register holds 4-row groups for columns 0..3 / 4..7.
							__m128i ABCD0_3 = _mm_unpacklo_epi16(AB0_7, CD0_7);
							__m128i EFGH0_3 = _mm_unpacklo_epi16(EF0_7, GH0_7);
							__m128i IJKL0_3 = _mm_unpacklo_epi16(IJ0_7, KL0_7);
							__m128i MNOP0_3 = _mm_unpacklo_epi16(MN0_7, OP0_7);
							__m128i ABCD4_7 = _mm_unpackhi_epi16(AB0_7, CD0_7);
							__m128i EFGH4_7 = _mm_unpackhi_epi16(EF0_7, GH0_7);
							__m128i IJKL4_7 = _mm_unpackhi_epi16(IJ0_7, KL0_7);
							__m128i MNOP4_7 = _mm_unpackhi_epi16(MN0_7, OP0_7);

							// Stage 3 (unpack8): interleave 32-bit lanes; each
							// register now covers 8 rows (A..H or I..P) for two
							// output columns.
							__m128i A_H0_1 = _mm_unpacklo_epi32(ABCD0_3, EFGH0_3);
							__m128i I_P0_1 = _mm_unpacklo_epi32(IJKL0_3, MNOP0_3);
							__m128i A_H2_3 = _mm_unpackhi_epi32(ABCD0_3, EFGH0_3);
							__m128i I_P2_3 = _mm_unpackhi_epi32(IJKL0_3, MNOP0_3);
							__m128i A_H4_5 = _mm_unpacklo_epi32(ABCD4_7, EFGH4_7);
							__m128i I_P4_5 = _mm_unpacklo_epi32(IJKL4_7, MNOP4_7);
							__m128i A_H6_7 = _mm_unpackhi_epi32(ABCD4_7, EFGH4_7);
							__m128i I_P6_7 = _mm_unpackhi_epi32(IJKL4_7, MNOP4_7);

							// Stage 4 (unpack16): combine 64-bit halves; APk
							// holds source column (icol + k) of all 16 rows,
							// i.e. one full 16-byte output row segment.
							__m128i AP0 = _mm_unpacklo_epi64(A_H0_1, I_P0_1);
							__m128i AP1 = _mm_unpackhi_epi64(A_H0_1, I_P0_1);
							__m128i AP2 = _mm_unpacklo_epi64(A_H2_3, I_P2_3);
							__m128i AP3 = _mm_unpackhi_epi64(A_H2_3, I_P2_3);
							__m128i AP4 = _mm_unpacklo_epi64(A_H4_5, I_P4_5);
							__m128i AP5 = _mm_unpackhi_epi64(A_H4_5, I_P4_5);
							__m128i AP6 = _mm_unpacklo_epi64(A_H6_7, I_P6_7);
							__m128i AP7 = _mm_unpackhi_epi64(A_H6_7, I_P6_7);

							// Store the 8 transposed 16-byte segments; target
							// rows are indexed by source column, hence the
							// swapped role of icol/irow in the address.
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 0) * NROW + ibrow * iblock + irow), AP0);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 1) * NROW + ibrow * iblock + irow), AP1);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 2) * NROW + ibrow * iblock + irow), AP2);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 3) * NROW + ibrow * iblock + irow), AP3);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 4) * NROW + ibrow * iblock + irow), AP4);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 5) * NROW + ibrow * iblock + irow), AP5);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 6) * NROW + ibrow * iblock + irow), AP6);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 7) * NROW + ibrow * iblock + irow), AP7);

							// Prefetch the next 8-byte column group (icol + 8)
							// of the same 16 rows into L1 (_MM_HINT_T0) so the
							// loads of the next icol iteration hit cache.
							// NOTE(review): +8 bytes is usually within the same
							// 64-byte cache line as the load just issued, so a
							// new line is only touched every 8th iteration; on
							// the last icol step this reads into the next tile
							// (prefetch hints cannot fault, so this is safe).
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 0) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 1) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 2) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 3) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 4) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 5) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 6) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 7) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 8) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 9) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 10) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 11) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 12) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 13) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 14) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 15) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);


						}
					}
				}
			}
		}
		clock_t end = clock();
		// NOTE(review): the "(ms)" label assumes CLOCKS_PER_SEC == 1000, which
		// holds on MSVC but not on POSIX (1'000'000) -- confirm the target
		// toolchain before trusting the absolute numbers.
		std::cout << "SimdTransposeBlock8RowPadPrefetch 10240 Time " << (end - begin) << std::endl;
		std::cout << "SimdTransposeBlock8RowPadPrefetch each Transpose (ms) " << ((float)(end - begin)) / (float)REPEAT << std::endl;
	}

执行时间:

SimdTransposeBlock8RowPadPrefetch 10240 Time 686
SimdTransposeBlock8RowPadPrefetch each Transpose (ms) 0.0669922

程序在执行时间上有了进一步提升。

VTune分析

在这里插入图片描述
可以看到内存访问方面没有明显的性能问题。
在这里插入图片描述
因为实现中添加了prefetch指令,因此总的执行指令数量增加了。但是CPI相对而言是下降的。
在这里插入图片描述
之前的L3 Latency问题已经不是主要的性能热点。

OpenMP并行化

基于本文中之前实现版本,对程序做多线程并行化优化。因为每次循环中处理的数据块没有交集,因此对for循环做并行化可以保证结果的正确性。
代码实现:

// OpenMP-parallel variant of SimdTransposeBlock8RowPadPrefetch ("PPP" =
// Pad + Prefetch + Parallel): identical SSE2 transpose kernel, with the
// outer tile-column loop distributed across 8 threads.
//
// Copies the transpose of the matrix at pSource into pTarget, REPEAT times,
// and prints the total and per-iteration time to stdout. Parallelization is
// safe because each ibcol value writes a disjoint stripe of pTarget (the
// store address is keyed on ibcol) and threads only read pSource.
// Assumptions (constants NROW, NCOL, NREALCOL, REPEAT defined elsewhere):
//   - pSource is NROW x NCOL bytes with padded row stride NREALCOL -- TODO
//     confirm against the allocation site.
//   - pTarget is NCOL x NROW bytes with row stride NROW.
//   - NROW and NCOL are multiples of the 128-byte tile size -- TODO confirm.
void SimdTransposeBlock8RowPPP(unsigned char* pSource, unsigned char* pTarget)
	{
		//PPP -> Pad Prefetch Parallel
		clock_t begin = clock();
		int iblock = 128;            // cache-blocking tile edge length, in bytes
		int iwsize = NCOL / iblock;  // number of tiles per source row
		int ihsize = NROW / iblock;  // number of tiles per source column
		for (int i = 0; i < REPEAT; ++i)
		{
// Split the tile columns over 8 threads; dynamic scheduling load-balances
// tiles whose memory happens to be slower to access.
#pragma omp parallel for num_threads(8) schedule(dynamic)
			for (int ibcol = 0; ibcol < iwsize; ++ibcol)
			{
				for (int ibrow = 0; ibrow < ihsize; ++ibrow)
				{
					// Within one 128x128 tile, transpose 16-row x 8-column
					// sub-blocks: 16 8-byte loads become 8 16-byte stores.
					for (int icol = 0; icol < iblock; icol += 8)
					{
						for (int irow = 0; irow < iblock; irow += 16)
						{
							// Load 8 bytes from each of 16 consecutive source
							// rows; register X0_7 holds bytes 0..7 of row X.
							__m128i A0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 0) * NREALCOL + ibcol * iblock + icol));
							__m128i B0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 1) * NREALCOL + ibcol * iblock + icol));
							__m128i C0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 2) * NREALCOL + ibcol * iblock + icol));
							__m128i D0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 3) * NREALCOL + ibcol * iblock + icol));
							__m128i E0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 4) * NREALCOL + ibcol * iblock + icol));
							__m128i F0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 5) * NREALCOL + ibcol * iblock + icol));
							__m128i G0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 6) * NREALCOL + ibcol * iblock + icol));
							__m128i H0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 7) * NREALCOL + ibcol * iblock + icol));
							__m128i I0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 8) * NREALCOL + ibcol * iblock + icol));
							__m128i J0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 9) * NREALCOL + ibcol * iblock + icol));
							__m128i K0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 10) * NREALCOL + ibcol * iblock + icol));
							__m128i L0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 11) * NREALCOL + ibcol * iblock + icol));
							__m128i M0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 12) * NREALCOL + ibcol * iblock + icol));
							__m128i N0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 13) * NREALCOL + ibcol * iblock + icol));
							__m128i O0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 14) * NREALCOL + ibcol * iblock + icol));
							__m128i P0_7 = _mm_loadl_epi64((__m128i*)(pSource + (ibrow * iblock + irow + 15) * NREALCOL + ibcol * iblock + icol));


							// Stage 1 (unpack2): interleave adjacent row pairs
							// at byte granularity, e.g. AB0_7 = A0 B0 A1 B1 ...
							__m128i AB0_7 = _mm_unpacklo_epi8(A0_7, B0_7);
							__m128i CD0_7 = _mm_unpacklo_epi8(C0_7, D0_7);
							__m128i EF0_7 = _mm_unpacklo_epi8(E0_7, F0_7);
							__m128i GH0_7 = _mm_unpacklo_epi8(G0_7, H0_7);
							__m128i IJ0_7 = _mm_unpacklo_epi8(I0_7, J0_7);
							__m128i KL0_7 = _mm_unpacklo_epi8(K0_7, L0_7);
							__m128i MN0_7 = _mm_unpacklo_epi8(M0_7, N0_7);
							__m128i OP0_7 = _mm_unpacklo_epi8(O0_7, P0_7);


							// Stage 2 (unpack4): interleave 16-bit pairs so each
							// register holds 4-row groups for columns 0..3 / 4..7.
							__m128i ABCD0_3 = _mm_unpacklo_epi16(AB0_7, CD0_7);
							__m128i EFGH0_3 = _mm_unpacklo_epi16(EF0_7, GH0_7);
							__m128i IJKL0_3 = _mm_unpacklo_epi16(IJ0_7, KL0_7);
							__m128i MNOP0_3 = _mm_unpacklo_epi16(MN0_7, OP0_7);
							__m128i ABCD4_7 = _mm_unpackhi_epi16(AB0_7, CD0_7);
							__m128i EFGH4_7 = _mm_unpackhi_epi16(EF0_7, GH0_7);
							__m128i IJKL4_7 = _mm_unpackhi_epi16(IJ0_7, KL0_7);
							__m128i MNOP4_7 = _mm_unpackhi_epi16(MN0_7, OP0_7);

							// Stage 3 (unpack8): interleave 32-bit lanes; each
							// register covers 8 rows (A..H or I..P) for two
							// output columns.
							__m128i A_H0_1 = _mm_unpacklo_epi32(ABCD0_3, EFGH0_3);
							__m128i I_P0_1 = _mm_unpacklo_epi32(IJKL0_3, MNOP0_3);
							__m128i A_H2_3 = _mm_unpackhi_epi32(ABCD0_3, EFGH0_3);
							__m128i I_P2_3 = _mm_unpackhi_epi32(IJKL0_3, MNOP0_3);
							__m128i A_H4_5 = _mm_unpacklo_epi32(ABCD4_7, EFGH4_7);
							__m128i I_P4_5 = _mm_unpacklo_epi32(IJKL4_7, MNOP4_7);
							__m128i A_H6_7 = _mm_unpackhi_epi32(ABCD4_7, EFGH4_7);
							__m128i I_P6_7 = _mm_unpackhi_epi32(IJKL4_7, MNOP4_7);

							// Stage 4 (unpack16): combine 64-bit halves; APk
							// holds source column (icol + k) of all 16 rows.
							__m128i AP0 = _mm_unpacklo_epi64(A_H0_1, I_P0_1);
							__m128i AP1 = _mm_unpackhi_epi64(A_H0_1, I_P0_1);
							__m128i AP2 = _mm_unpacklo_epi64(A_H2_3, I_P2_3);
							__m128i AP3 = _mm_unpackhi_epi64(A_H2_3, I_P2_3);
							__m128i AP4 = _mm_unpacklo_epi64(A_H4_5, I_P4_5);
							__m128i AP5 = _mm_unpackhi_epi64(A_H4_5, I_P4_5);
							__m128i AP6 = _mm_unpacklo_epi64(A_H6_7, I_P6_7);
							__m128i AP7 = _mm_unpackhi_epi64(A_H6_7, I_P6_7);

							// Store the 8 transposed 16-byte segments; target
							// rows are indexed by source column. Only ibcol
							// determines which target rows are written, so
							// threads never write the same bytes.
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 0) * NROW + ibrow * iblock + irow), AP0);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 1) * NROW + ibrow * iblock + irow), AP1);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 2) * NROW + ibrow * iblock + irow), AP2);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 3) * NROW + ibrow * iblock + irow), AP3);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 4) * NROW + ibrow * iblock + irow), AP4);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 5) * NROW + ibrow * iblock + irow), AP5);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 6) * NROW + ibrow * iblock + irow), AP6);
							_mm_storeu_si128((__m128i*)(pTarget + (ibcol * iblock + icol + 7) * NROW + ibrow * iblock + irow), AP7);

							// Prefetch the next 8-byte column group (icol + 8)
							// of the same 16 rows into L1 (_MM_HINT_T0).
							// NOTE(review): see the non-parallel variant -- +8
							// bytes is usually inside the current cache line;
							// the last icol step prefetches into the next tile
							// (harmless, prefetch hints cannot fault).
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 0) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 1) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 2) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 3) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 4) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 5) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 6) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 7) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 8) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 9) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 10) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 11) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 12) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 13) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 14) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);
							_mm_prefetch((const char*)(pSource + (ibrow * iblock + irow + 15) * NREALCOL + ibcol * iblock + icol + 8), _MM_HINT_T0);


						}
					}
				}
			}
		}
		clock_t end = clock();
		// NOTE(review): the "(ms)" label assumes CLOCKS_PER_SEC == 1000 (MSVC).
		// On POSIX, clock() additionally sums CPU time across all threads, so
		// the multi-thread speedup is only visible where clock() measures wall
		// time -- confirm the target toolchain.
		std::cout << "SimdTransposeBlock8RowPPP 1024 Time " << (end - begin) << std::endl;
		std::cout << "SimdTransposeBlock8RowPPP each Transpose (ms) " << ((float)(end - begin)) / (float)REPEAT << std::endl;
	}

执行时间

SimdTransposeBlock8RowPPP 1024 Time 205
SimdTransposeBlock8RowPPP each Transpose (ms) 0.0200195

通过多线程加速,获得的加速比为 3.43。

总结

本文通过内存预取技术,对矩阵转置功能中内存加载的 L3 Cache 问题进行优化。同时,通过 OpenMP 对程序做多线程并行化优化,获得 3.43 倍的加速比。

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值