This article optimizes the scheduling strategy so that memory copies overlap with computation.
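Before walking through the variants, here is a minimal, self-contained sketch of the copy/compute overlap pattern the rest of this article builds on. The kernel scaleKernel, the sizes, and the chunking are illustrative assumptions, not this article's kmeans kernels:
#include <cuda_runtime.h>
#include <cstdio>
// Illustrative kernel (an assumption, not one of the kmeans kernels below).
__global__ void scaleKernel(float* data, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) data[i] *= 2.0f;
}
int main()
{
	const int kN = 1 << 20;
	const int kChunks = 4;
	const int kChunkN = kN / kChunks;
	float* h_data;
	cudaMallocHost(&h_data, kN * sizeof(float));//pinned memory: required for truly asynchronous copies
	for (int i = 0; i < kN; ++i) h_data[i] = 1.0f;
	float* d_data;
	cudaMalloc(&d_data, kN * sizeof(float));
	cudaStream_t stm[kChunks];
	for (int i = 0; i < kChunks; ++i) cudaStreamCreate(&stm[i]);
	//each chunk's copy/kernel/copy is stream-ordered, so chunk i+1's H2D copy can overlap chunk i's kernel
	for (int i = 0; i < kChunks; ++i)
	{
		float* h = h_data + i * kChunkN;
		float* d = d_data + i * kChunkN;
		cudaMemcpyAsync(d, h, kChunkN * sizeof(float), cudaMemcpyHostToDevice, stm[i]);
		scaleKernel<<<(kChunkN + 255) / 256, 256, 0, stm[i]>>>(d, kChunkN);
		cudaMemcpyAsync(h, d, kChunkN * sizeof(float), cudaMemcpyDeviceToHost, stm[i]);
	}
	cudaDeviceSynchronize();
	printf("h_data[0] = %f\n", h_data[0]);//expect 2.0
	for (int i = 0; i < kChunks; ++i) cudaStreamDestroy(stm[i]);
	cudaFree(d_data);
	cudaFreeHost(h_data);
	return 0;
}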
Single stream, synchronous calls
/*
Single stream, synchronous
*/
void CallKmeansSync()
{
//TODO:init host memory
float* h_Src/*[Coords][SrcCount]*/,
* h_Clusters/*[ClusterCount][Coords]*/;
int* h_MemberShip/*[kSrcCount]*/,
* h_MemberCount/*[kClusterCount]*/;
h_Src = (float*)malloc(kSrcCount * kCoords * sizeof(float));
h_Clusters = (float*)malloc(kClusterCount * kCoords * sizeof(float));/*[kClusterCount][kCoords]*/
for (size_t i = 0; i < kClusterCount; i++)
{
for (int j = 0; j < kCoords; ++j)
{
h_Clusters[i * kCoords + j] = float(100 * i + 10);
}
}
for (size_t i = 0; i < kClusterCount; i++)
{
for (size_t j = 0; j < (kSrcCount / kClusterCount); j++)
{
for (size_t iDim = 0; iDim < kCoords; iDim++)
{
h_Src[iDim * kSrcCount + i * (kSrcCount / kClusterCount) + j] = i * 100 + 0.5f;// -(float)((rand() % 100) / 100);
}
}
}
h_MemberShip = (int*)malloc(kSrcCount * sizeof(int));
memset(h_MemberShip, 9, kSrcCount * sizeof(int));//byte-wise fill: each int becomes 0x09090909, serving as a not-yet-assigned marker
h_MemberCount = (int*)malloc(kClusterCount * sizeof(int));
memset(h_MemberCount, 0, kClusterCount * sizeof(int));
//TODO:init device memory
float* d_pSrc/*[Coords][SrcCount]*/,
* d_pClusters/*[ClusterCount][Coords]*/;
int* d_pChanged/*[1]*/,
* d_pMemberShip/*[kSrcCount]*/,
* d_pMemberCount/*[kClusterCount]*/;
cudaMalloc(&d_pSrc, kSrcCount * kCoords * sizeof(float));
cudaMemcpy(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&d_pClusters, kClusterCount * kCoords * sizeof(float));
cudaMalloc(&d_pMemberShip, kSrcCount * sizeof(int));
cudaMemcpy(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&d_pMemberCount, kClusterCount * sizeof(int));
cudaMemcpy(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&d_pChanged, sizeof(int));
//TODO:find the points
int itCount = 0;
int iChanged = 0;
cudaStream_t sMember, sNewCluster;//note: these streams and the events below are created and destroyed but never used in this synchronous variant
cudaStreamCreate(&sMember);
cudaStreamCreate(&sNewCluster);
cudaEvent_t eMember, eNewCluster;
cudaEventCreate(&eMember);
cudaEventCreate(&eNewCluster);
do
{
{
const int UnrollScale = 8;
cudaMemcpy(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);//s1
cudaMemset(d_pMemberCount, 0, kClusterCount * sizeof(int));//s1
kKmeansClassifyMembershipMath<kCoords, UnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / UnrollScale, kClusterCount* kCoords * sizeof(float) >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
checkCUDARtn(cudaDeviceSynchronize());
cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);//s1
cudaMemset(d_pChanged, 0, sizeof(int));//s1
cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));//s2
const int kUnrollAdvUNROLLSCALE = 16;
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvUNROLLSCALE, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvUNROLLSCALE, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
checkCUDARtn(cudaDeviceSynchronize());
cudaMemcpy(h_MemberCount, d_pMemberCount, kClusterCount * sizeof(int), cudaMemcpyDeviceToHost);//s2
cudaMemcpy(h_Clusters, d_pClusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyDeviceToHost);//s2
for (size_t i = 0; i < kClusterCount; i++)//s2
{
for (int j = 0; j < kCoords; ++j)
{
h_Clusters[i * kCoords + j] = (h_MemberCount[i] == 0) ? 0 : (h_Clusters[i * kCoords + j] / h_MemberCount[i]);
}
}
}
} while (/*(0 != iChanged) &&*/ itCount++ < 8);
std::cout << "it count " << itCount << std::endl;
cudaEventDestroy(eMember);
cudaEventDestroy(eNewCluster);
cudaStreamDestroy(sMember);
cudaStreamDestroy(sNewCluster);
free(h_Src);
free(h_Clusters);
free(h_MemberShip);
free(h_MemberCount);
cudaFree(d_pSrc);
cudaFree(d_pClusters);
cudaFree(d_pChanged);
cudaFree(d_pMemberShip);
cudaFree(d_pMemberCount);
}
void CallFunckmeans(int FuncIndex)
{
const int kSingleStm = 0;
const int kMultStm = 1;
const int kPrevCopy = 2;
const int kNonSync = 3;
const int kSync = 4;
if (FuncIndex == kSingleStm)
{
CallKmeansSingleStm();
}
else if (FuncIndex == kMultStm)
{
CallKmeansMultOverlap();
}
else if (FuncIndex == kPrevCopy)
{
CallKmeansMultPrevCopy();
}
else if (FuncIndex == kNonSync)
{
CallKmeansMultNonSync();
}
else if (FuncIndex == kSync)
{
CallKmeansSync();
}
return;
}
Scheduling profile
Total time is 7 ms. Because of the synchronization calls, the kernels do not run back to back.
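A note before the next variant: cudaMemcpyAsync only truly overlaps with kernel execution when the host buffer is page-locked. A minimal sketch of the difference, reusing this article's size constants (the buffer names are illustrative):
const size_t kBytes = kSrcCount * kCoords * sizeof(float);
// Pageable allocation: an async copy from here is staged through an internal
// pinned buffer by the driver and behaves largely synchronously.
float* h_pageable = (float*)malloc(kBytes);
// Pinned allocation: the copy engine can DMA from it directly, which is what
// lets the asynchronous variant below overlap copies with kernels.
float* h_pinned = nullptr;
cudaMallocHost(&h_pinned, kBytes);
// ... use h_pinned with cudaMemcpyAsync(d_dst, h_pinned, kBytes, cudaMemcpyHostToDevice, stm) ...
cudaFreeHost(h_pinned);
free(h_pageable);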
Single stream, asynchronous calls
/*
Single stream, asynchronous
*/
void CallKmeansSingleStm()
{
//TODO:init host memory
float* h_Src/*[Coords][SrcCount]*/,
* h_Clusters/*[ClusterCount][Coords]*/;
int* h_MemberShip/*[kSrcCount]*/,
* h_MemberCount/*[kClusterCount]*/;
//h_Src = (float*)malloc(kSrcCount * kCoords * sizeof(float));
cudaMallocHost(&h_Src, kSrcCount * kCoords * sizeof(float));
h_Clusters = (float*)malloc(kClusterCount * kCoords * sizeof(float));/*[kClusterCount][kCoords]*/
for (size_t i = 0; i < kClusterCount; i++)
{
for (int j = 0; j < kCoords; ++j)
{
h_Clusters[i * kCoords + j] = float(100 * i + 10);
}
}
for (size_t i = 0; i < kClusterCount; i++)
{
for (size_t j = 0; j < (kSrcCount / kClusterCount); j++)
{
for (size_t iDim = 0; iDim < kCoords; iDim++)
{
h_Src[iDim * kSrcCount + i * (kSrcCount / kClusterCount) + j] = i * 100 + 0.5f;// -(float)((rand() % 100) / 100);
}
}
}
//h_MemberShip = (int*)malloc(kSrcCount * sizeof(int));
cudaMallocHost(&h_MemberShip, kSrcCount * sizeof(int));
memset(h_MemberShip, 9, kSrcCount * sizeof(int));
h_MemberCount = (int*)malloc(kClusterCount * sizeof(int));
memset(h_MemberCount, 0, kClusterCount * sizeof(int));
//TODO:init stream
cudaStream_t stm;
cudaStreamCreate(&stm);
const int EventNum = 10;
cudaEvent_t event[EventNum];
for (size_t i = 0; i < EventNum; i++)
{
cudaEventCreate(&event[i]);
}
//TODO:init device memory
float* d_pSrc/*[Coords][SrcCount]*/,
* d_pClusters/*[ClusterCount][Coords]*/;
int* d_pChanged/*[1]*/,
* d_pMemberShip/*[kSrcCount]*/,
* d_pMemberCount/*[kClusterCount]*/;
cudaMalloc(&d_pSrc, kSrcCount * kCoords * sizeof(float));
cudaMalloc(&d_pClusters, kClusterCount * kCoords * sizeof(float));
cudaMalloc(&d_pMemberShip, kSrcCount * sizeof(int));
cudaMalloc(&d_pMemberCount, kClusterCount * sizeof(int));
cudaMalloc(&d_pChanged, sizeof(int));
//cudaMemcpy(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm);
//cudaMemcpy(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice, stm);
//cudaMemcpy(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice, stm);
//cudaMemcpy(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);//issuing this in the default stream forces synchronization with stm and costs several hundred microseconds
cudaMemcpyAsync(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm);
//TODO:find the points
int itCount = 0;
int* changed;
cudaMallocHost(&changed, sizeof(int));
*changed = 0;
int iChanged = 0;
const int kUnrollAdvExecuteScale = 8;
const int kUnrollAdvkUnrollScale = 16;
const int preCalcCount = 10;
for (; itCount < preCalcCount; itCount++)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float) >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm);
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm);
cudaEventRecord(event[itCount % 10], stm);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
//kKmeansSumAtomic << <kGridCount* 1, kBlockSize / 1, (kBlockSize* kCoords / 1 + kClusterCount * kCoords + kClusterCount) * sizeof(float) >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);
kCalcNewClusterCenter << <1, kClusterCount, 0, stm >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm);
}
cudaEventSynchronize(event[preCalcCount - 5]);
while ((0 != *changed) && itCount++ < 100)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float) >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm);
cudaEventRecord(event[itCount % 10], stm);
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount* kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize* kCoords + kBlockSize) * sizeof(float) / kExecuteScale >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
kCalcNewClusterCenter << <1, kClusterCount, 0, stm >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm);
}
std::cout << "it count " << itCount << std::endl;
cudaMemcpy(h_Clusters, d_pClusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyDeviceToHost);
for (size_t i = 0; i < kClusterCount; i++)
{
std::cout << h_Clusters[i] << std::endl;
}
//free(h_Src);
cudaFreeHost(h_Src);
free(h_Clusters);
//free(h_MemberShip);
cudaFreeHost(h_MemberShip);
cudaFreeHost(changed);
free(h_MemberCount);
cudaFree(d_pSrc);
cudaFree(d_pClusters);
cudaFree(d_pChanged);
cudaFree(d_pMemberShip);
cudaFree(d_pMemberCount);
cudaStreamDestroy(stm);
for (size_t i = 0; i < EventNum; i++)
{
cudaEventDestroy(event[i]);
}
}
Scheduling profile
Execution time is 5.6 ms. As the figure above shows, this approach keeps the kernels running back to back, but the host-side event-synchronization call causes a 1.7 ms gap in the stream's kernel execution.
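One way to shrink that 1.7 ms stall, sketched here as an idea rather than a measured fix, is to poll with cudaEventQuery instead of blocking inside cudaEventSynchronize, so the host thread stays free to enqueue more work while the event completes:
#include <cuda_runtime.h>
// Minimal sketch: spin on cudaEventQuery instead of blocking the host thread.
// cudaEventQuery returns cudaErrorNotReady while work recorded before the
// event is still in flight, and cudaSuccess once it has completed.
bool WaitEventNonBlocking(cudaEvent_t ev)
{
	cudaError_t rtn;
	while ((rtn = cudaEventQuery(ev)) == cudaErrorNotReady)
	{
		// The host is free here: enqueue further kernels/copies or do
		// CPU-side bookkeeping instead of sleeping inside the driver.
	}
	return rtn == cudaSuccess;
}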
Multi-stream execution
OpenMP spawns 4 threads, each owning its own stream. During the initial memory-copy phase, event synchronization is inserted between the streams to enforce the dependency ordering.
/*
4 threads, each owning its own stream
The initial memory copies in different streams are ordered through event synchronization
*/
void CallKmeansMultOverlap()
{
const int ThreadNum = 4;
//TODO:init host memory
float* h_SrcBase/*[ThreadNum][Coords][SrcCount]*/,
* h_ClustersBase/*[ThreadNum][ClusterCount][Coords]*/;
int* h_MemberShipBase/*[ThreadNum][kSrcCount]*/,
* h_MemberCountBase/*[ThreadNum][kClusterCount]*/;
//h_Src = (float*)malloc(kSrcCount * kCoords * sizeof(float));
cudaMallocHost(&h_SrcBase, ThreadNum * kSrcCount * kCoords * sizeof(float));
h_ClustersBase = (float*)malloc(ThreadNum * kClusterCount * kCoords * sizeof(float));
int* changedBase;
cudaMallocHost(&changedBase, ThreadNum * sizeof(int));
for (size_t tid = 0; tid < ThreadNum; tid++)
{
float* h_LocalClusters = h_ClustersBase + kClusterCount * kCoords * tid;
for (size_t i = 0; i < kClusterCount; i++)
{
for (int j = 0; j < kCoords; ++j)
{
h_LocalClusters[i * kCoords + j] = float(100 * i + 10);
}
}
float* h_LocalSrc = h_SrcBase + kSrcCount * kCoords * tid;
for (size_t i = 0; i < kClusterCount; i++)
{
for (size_t j = 0; j < (kSrcCount / kClusterCount); j++)
{
for (size_t iDim = 0; iDim < kCoords; iDim++)
{
h_LocalSrc[iDim * kSrcCount + i * (kSrcCount / kClusterCount) + j] = i * 100 + 0.5f;// -(float)((rand() % 100) / 100);
}
}
}
changedBase[tid] = 0;
}
//h_MemberShip = (int*)malloc(kSrcCount * sizeof(int));
cudaMallocHost(&h_MemberShipBase, ThreadNum * kSrcCount * sizeof(int));
memset(h_MemberShipBase, 9, ThreadNum * kSrcCount * sizeof(int));
h_MemberCountBase = (int*)malloc(ThreadNum * kClusterCount * sizeof(int));
memset(h_MemberCountBase, 0, ThreadNum * kClusterCount * sizeof(int));
//TODO:init stream
cudaStream_t stm[ThreadNum];
for (size_t i = 0; i < ThreadNum; i++)
{
//cudaStreamCreate(&stm[i]);
cudaStreamCreateWithFlags(&stm[i], cudaStreamNonBlocking);
}
const int StmEventCount = 10;
cudaEvent_t event[StmEventCount * ThreadNum];
for (size_t i = 0; i < StmEventCount * ThreadNum; i++)
{
cudaEventCreate(&event[i]);
}
//TODO:init device memory
float* d_pSrcBase/*[ThreadNum][Coords][SrcCount]*/,
* d_pClustersBase/*[ThreadNum][ClusterCount][Coords]*/;
int* d_pChangedBase/*[ThreadNum][1]*/,
* d_pMemberShipBase/*[ThreadNum][kSrcCount]*/,
* d_pMemberCountBase/*[ThreadNum][kClusterCount]*/;
cudaMalloc(&d_pSrcBase, ThreadNum * kSrcCount * kCoords * sizeof(float));
cudaMalloc(&d_pClustersBase, ThreadNum * kClusterCount * kCoords * sizeof(float));
cudaMalloc(&d_pMemberShipBase, ThreadNum * kSrcCount * sizeof(int));
cudaMalloc(&d_pMemberCountBase, ThreadNum * kClusterCount * sizeof(int));
cudaMalloc(&d_pChangedBase, ThreadNum * sizeof(int));
const int kUnrollAdvExecuteScale = 8;
const int kUnrollAdvkUnrollScale = 16;
const int preCalcCount = 8;
#pragma omp parallel for num_threads(ThreadNum)
for (int itstm = 0; itstm < ThreadNum; itstm++)//int, not size_t: OpenMP 2.0 requires a signed loop index, and the printf below formats it with %d
{
float* h_Src = h_SrcBase + itstm * kSrcCount * kCoords;
float* d_pSrc = d_pSrcBase + itstm * kSrcCount * kCoords;
int* h_MemberShip = h_MemberShipBase + itstm * kSrcCount;
int* d_pMemberShip = d_pMemberShipBase + itstm * kSrcCount;
int* h_MemberCount = h_MemberCountBase + itstm * kClusterCount;
int* d_pMemberCount = d_pMemberCountBase + itstm * kClusterCount;
float* h_Clusters = h_ClustersBase + itstm * kClusterCount * kCoords;
float* d_pClusters = d_pClustersBase + itstm * kClusterCount * kCoords;
int* changed = changedBase + itstm;
int* d_pChanged = d_pChangedBase + itstm;
if (itstm != 0)
{
cudaStreamWaitEvent(stm[itstm], event[(itstm - 1) * StmEventCount], 0);
}
//cudaMemcpy(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm[itstm]);
//cudaMemcpy(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice, stm[itstm]);
//cudaMemcpy(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice, stm[itstm]);
//cudaMemcpy(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);//issuing this in the default stream forces synchronization with the stm streams and costs several hundred microseconds
cudaMemcpyAsync(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm[itstm]);
cudaEventRecord(event[itstm * StmEventCount], stm[itstm]);
//TODO:find the points
int itCount = 1;
for (; itCount < preCalcCount; itCount++)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float), stm[itstm] >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm[itstm]);
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm[itstm]);
cudaEventRecord(event[(itCount % 10) + itstm * StmEventCount], stm[itstm]);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm[itstm]);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale, stm[itstm] >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
//kKmeansSumAtomic << <kGridCount* 1, kBlockSize / 1, (kBlockSize* kCoords / 1 + kClusterCount * kCoords + kClusterCount) * sizeof(float) >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);
kCalcNewClusterCenter << <1, kClusterCount, 0, stm[itstm] >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm[itstm]);
}
cudaEventSynchronize(event[(itCount % 10) + itstm * StmEventCount - 4]);
while ((0 != *changed) && itCount++ < 100)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float), stm[itstm] >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm[itstm]);
cudaEventRecord(event[(itCount % 10) + itstm * StmEventCount], stm[itstm]);//offset by itstm * StmEventCount so threads do not clobber each other's events
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm[itstm]);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm[itstm]);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale, stm[itstm] >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
kCalcNewClusterCenter << <1, kClusterCount, 0, stm[itstm] >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm[itstm]);
}
printf("tid %d and count %d \n", itstm, itCount);
cudaMemcpyAsync(h_Clusters, d_pClusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyDeviceToHost, stm[itstm]);
cudaStreamSynchronize(stm[itstm]);//the copy is asynchronous: wait for it before reading h_Clusters below
for (size_t i = 0; i < kClusterCount; i++)
{
printf("tid %d and value %f \n", itstm, h_Clusters[i]);
}
}
//free(h_Src);
cudaFreeHost(h_SrcBase);
free(h_ClustersBase);
//free(h_MemberShip);
cudaFreeHost(h_MemberShipBase);
cudaFreeHost(changedBase);
free(h_MemberCountBase);
cudaFree(d_pSrcBase);
cudaFree(d_pClustersBase);
cudaFree(d_pChangedBase);
cudaFree(d_pMemberShipBase);
cudaFree(d_pMemberCountBase);
for (size_t i = 0; i < ThreadNum; i++)
{
cudaStreamDestroy(stm[i]);
}
for (size_t i = 0; i < StmEventCount * ThreadNum; i++)
{
cudaEventDestroy(event[i]);
}
}
Execution profile
Total execution time is 8.6 ms. With this multi-stream scheduling, the host-to-device memory copies do not run back to back. From the second copy onward, however, kernel execution does overlap with the memory copies.
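The non-contiguous host-to-device copies are expected on devices with a single copy engine per direction: same-direction transfers issued from different streams still serialize on the one DMA engine, even though they can overlap with kernels. The engine count can be checked directly:
#include <cuda_runtime.h>
#include <cstdio>
int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	// asyncEngineCount == 1: copies can overlap kernels, but all copies serialize;
	// asyncEngineCount == 2: one H2D and one D2H copy can additionally overlap
	// each other; copies in the same direction still serialize.
	printf("asyncEngineCount = %d\n", prop.asyncEngineCount);
	return 0;
}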
Issuing the memory copies up front
/*
Issue the asynchronous memory copies ahead of time
4 threads, each driving its own stream; kernel launches are triggered through event synchronization
*/
void CallKmeansMultPrevCopy()
{
const int ThreadNum = 4;
//TODO:init host memory
float* h_SrcBase/*[ThreadNum][Coords][SrcCount]*/,
* h_ClustersBase/*[ThreadNum][ClusterCount][Coords]*/;
int* h_MemberShipBase/*[ThreadNum][kSrcCount]*/,
* h_MemberCountBase/*[ThreadNum][kClusterCount]*/;
//h_Src = (float*)malloc(kSrcCount * kCoords * sizeof(float));
cudaMallocHost(&h_SrcBase, ThreadNum * kSrcCount * kCoords * sizeof(float));
h_ClustersBase = (float*)malloc(ThreadNum * kClusterCount * kCoords * sizeof(float));
int* changedBase;
cudaMallocHost(&changedBase, ThreadNum * sizeof(int));
for (size_t tid = 0; tid < ThreadNum; tid++)
{
float* h_LocalClusters = h_ClustersBase + kClusterCount * kCoords * tid;
for (size_t i = 0; i < kClusterCount; i++)
{
for (int j = 0; j < kCoords; ++j)
{
h_LocalClusters[i * kCoords + j] = float(100 * i + 10);
}
}
float* h_LocalSrc = h_SrcBase + kSrcCount * kCoords * tid;
for (size_t i = 0; i < kClusterCount; i++)
{
for (size_t j = 0; j < (kSrcCount / kClusterCount); j++)
{
for (size_t iDim = 0; iDim < kCoords; iDim++)
{
h_LocalSrc[iDim * kSrcCount + i * (kSrcCount / kClusterCount) + j] = i * 100 + 0.5f;// -(float)((rand() % 100) / 100);
}
}
}
changedBase[tid] = 0;
}
//h_MemberShip = (int*)malloc(kSrcCount * sizeof(int));
cudaMallocHost(&h_MemberShipBase, ThreadNum * kSrcCount * sizeof(int));
memset(h_MemberShipBase, 9, ThreadNum * kSrcCount * sizeof(int));
h_MemberCountBase = (int*)malloc(ThreadNum * kClusterCount * sizeof(int));
memset(h_MemberCountBase, 0, ThreadNum * kClusterCount * sizeof(int));
//TODO:init stream
cudaStream_t stm[ThreadNum + 1];
for (size_t i = 0; i < ThreadNum + 1; i++)
{
//cudaStreamCreate(&stm[i]);
cudaStreamCreateWithFlags(&stm[i], cudaStreamNonBlocking);
}
const int StmEventCount = 10;
cudaEvent_t event[(1 + StmEventCount) * ThreadNum];
for (size_t i = 0; i < (1 + StmEventCount) * ThreadNum; i++)
{
cudaEventCreate(&event[i]);
}
//TODO:init device memory
float* d_pSrcBase/*[ThreadNum][Coords][SrcCount]*/,
* d_pClustersBase/*[ThreadNum][ClusterCount][Coords]*/;
int* d_pChangedBase/*[ThreadNum][1]*/,
* d_pMemberShipBase/*[ThreadNum][kSrcCount]*/,
* d_pMemberCountBase/*[ThreadNum][kClusterCount]*/;
cudaMalloc(&d_pSrcBase, ThreadNum * kSrcCount * kCoords * sizeof(float));
cudaMalloc(&d_pClustersBase, ThreadNum * kClusterCount * kCoords * sizeof(float));
cudaMalloc(&d_pMemberShipBase, ThreadNum * kSrcCount * sizeof(int));
cudaMalloc(&d_pMemberCountBase, ThreadNum * kClusterCount * sizeof(int));
cudaMalloc(&d_pChangedBase, ThreadNum * sizeof(int));
const int kUnrollAdvExecuteScale = 8;
const int kUnrollAdvkUnrollScale = 16;
const int preCalcCount = 8;
for (size_t itstm = 0; itstm < ThreadNum; itstm++)
{
float* h_Src = h_SrcBase + itstm * kSrcCount * kCoords;
float* d_pSrc = d_pSrcBase + itstm * kSrcCount * kCoords;
int* h_MemberShip = h_MemberShipBase + itstm * kSrcCount;
int* d_pMemberShip = d_pMemberShipBase + itstm * kSrcCount;
int* h_MemberCount = h_MemberCountBase + itstm * kClusterCount;
int* d_pMemberCount = d_pMemberCountBase + itstm * kClusterCount;
float* h_Clusters = h_ClustersBase + itstm * kClusterCount * kCoords;
float* d_pClusters = d_pClustersBase + itstm * kClusterCount * kCoords;
int* changed = changedBase + itstm;
int* d_pChanged = d_pChangedBase + itstm;
//cudaMemcpy(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm[ThreadNum]);
//cudaMemcpy(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice, stm[ThreadNum]);
//cudaMemcpy(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice, stm[ThreadNum]);
//cudaMemcpy(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);//issuing this in the default stream forces synchronization with the stm streams and costs several hundred microseconds
cudaMemcpyAsync(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm[ThreadNum]);
cudaEventRecord(event[ThreadNum * StmEventCount + itstm], stm[ThreadNum]);
}
#pragma omp parallel for num_threads(ThreadNum)
for (int itstm = 0; itstm < ThreadNum; itstm++)//int, not size_t: OpenMP 2.0 requires a signed loop index, and the printf below formats it with %d
{
float* h_Src = h_SrcBase + itstm * kSrcCount * kCoords;
float* d_pSrc = d_pSrcBase + itstm * kSrcCount * kCoords;
int* h_MemberShip = h_MemberShipBase + itstm * kSrcCount;
int* d_pMemberShip = d_pMemberShipBase + itstm * kSrcCount;
int* h_MemberCount = h_MemberCountBase + itstm * kClusterCount;
int* d_pMemberCount = d_pMemberCountBase + itstm * kClusterCount;
float* h_Clusters = h_ClustersBase + itstm * kClusterCount * kCoords;
float* d_pClusters = d_pClustersBase + itstm * kClusterCount * kCoords;
int* changed = changedBase + itstm;
int* d_pChanged = d_pChangedBase + itstm;
//TODO:find the points
cudaStreamWaitEvent(stm[itstm], event[ThreadNum * StmEventCount + itstm], 0);
int itCount = 1;
for (; itCount < preCalcCount; itCount++)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float), stm[itstm] >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm[itstm]);
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm[itstm]);
cudaEventRecord(event[(itCount % 10) + itstm * StmEventCount], stm[itstm]);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm[itstm]);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale, stm[itstm] >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
//kKmeansSumAtomic << <kGridCount* 1, kBlockSize / 1, (kBlockSize* kCoords / 1 + kClusterCount * kCoords + kClusterCount) * sizeof(float) >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);
kCalcNewClusterCenter << <1, kClusterCount, 0, stm[itstm] >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm[itstm]);
}
cudaEventSynchronize(event[(itCount % 10) + itstm * StmEventCount - 4]);
while ((0 != *changed) && itCount++ < 100)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float), stm[itstm] >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm[itstm]);
cudaEventRecord(event[(itCount % 10) + itstm * StmEventCount], stm[itstm]);//offset by itstm * StmEventCount so threads do not clobber each other's events
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm[itstm]);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm[itstm]);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale, stm[itstm] >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
kCalcNewClusterCenter << <1, kClusterCount, 0, stm[itstm] >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm[itstm]);
}
printf("tid %d and count %d \n", itstm, itCount);
cudaMemcpyAsync(h_Clusters, d_pClusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyDeviceToHost, stm[itstm]);
cudaStreamSynchronize(stm[itstm]);//the copy is asynchronous: wait for it before reading h_Clusters below
for (size_t i = 0; i < kClusterCount; i++)
{
printf("tid %d and value %f \n", itstm, h_Clusters[i]);
}
}
//free(h_Src);
cudaFreeHost(h_SrcBase);
free(h_ClustersBase);
//free(h_MemberShip);
cudaFreeHost(h_MemberShipBase);
cudaFreeHost(changedBase);
free(h_MemberCountBase);
cudaFree(d_pSrcBase);
cudaFree(d_pClustersBase);
cudaFree(d_pChangedBase);
cudaFree(d_pMemberShipBase);
cudaFree(d_pMemberCountBase);
for (size_t i = 0; i < ThreadNum + 1; i++)
{
cudaStreamDestroy(stm[i]);
}
for (size_t i = 0; i < (1 + StmEventCount) * ThreadNum; i++)
{
cudaEventDestroy(event[i]);
}
}
Scheduling profile
Execution time is 9.4 ms. With this approach the memory copies run back to back, but because the kernels in each stream are triggered through event synchronization, copies and kernels do not overlap well.
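The limited overlap follows from how events order within a stream: an event recorded in the shared copy stream completes only after all work enqueued in that stream before it, so stream i's kernels implicitly wait for copy groups 0..i as well. A small self-contained demonstration of that property (buffer sizes are arbitrary):
#include <cuda_runtime.h>
#include <cstdio>
int main()
{
	const size_t kBytes = 1 << 24;
	cudaStream_t copyStm;
	cudaStreamCreate(&copyStm);
	cudaEvent_t e0, e1;
	cudaEventCreate(&e0);
	cudaEventCreate(&e1);
	float* h;
	float* d0;
	float* d1;
	cudaMallocHost(&h, kBytes);
	cudaMalloc(&d0, kBytes);
	cudaMalloc(&d1, kBytes);
	cudaMemcpyAsync(d0, h, kBytes, cudaMemcpyHostToDevice, copyStm);
	cudaEventRecord(e0, copyStm);//completes after copy 0
	cudaMemcpyAsync(d1, h, kBytes, cudaMemcpyHostToDevice, copyStm);
	cudaEventRecord(e1, copyStm);//completes after copy 0 AND copy 1
	cudaEventSynchronize(e1);
	// Stream order guarantees e0 is also done here; a stream waiting on e1
	// therefore inherited the wait on copy 0 too, which is the staircase that
	// keeps the later compute streams from starting earlier in this variant.
	printf("e0 state: %s\n", cudaEventQuery(e0) == cudaSuccess ? "done" : "pending");
	cudaEventDestroy(e0);
	cudaEventDestroy(e1);
	cudaFree(d0);
	cudaFree(d1);
	cudaFreeHost(h);
	cudaStreamDestroy(copyStm);
	return 0;
}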
Multiple streams without event synchronization
/*
4 threads, each driving its own stream
No synchronization is added between the data copies in different streams
*/
void CallKmeansMultNonSync()
{
const int ThreadNum = 4;
//TODO:init host memory
float* h_SrcBase/*[ThreadNum][Coords][SrcCount]*/,
* h_ClustersBase/*[ThreadNum][ClusterCount][Coords]*/;
int* h_MemberShipBase/*[ThreadNum][kSrcCount]*/,
* h_MemberCountBase/*[ThreadNum][kClusterCount]*/;
//h_Src = (float*)malloc(kSrcCount * kCoords * sizeof(float));
cudaMallocHost(&h_SrcBase, ThreadNum * kSrcCount * kCoords * sizeof(float));
h_ClustersBase = (float*)malloc(ThreadNum * kClusterCount * kCoords * sizeof(float));
int* changedBase;
cudaMallocHost(&changedBase, ThreadNum * sizeof(int));
for (size_t tid = 0; tid < ThreadNum; tid++)
{
float* h_LocalClusters = h_ClustersBase + kClusterCount * kCoords * tid;
for (size_t i = 0; i < kClusterCount; i++)
{
for (int j = 0; j < kCoords; ++j)
{
h_LocalClusters[i * kCoords + j] = float(100 * i + 10);
}
}
float* h_LocalSrc = h_SrcBase + kSrcCount * kCoords * tid;
for (size_t i = 0; i < kClusterCount; i++)
{
for (size_t j = 0; j < (kSrcCount / kClusterCount); j++)
{
for (size_t iDim = 0; iDim < kCoords; iDim++)
{
h_LocalSrc[iDim * kSrcCount + i * (kSrcCount / kClusterCount) + j] = i * 100 + 0.5f;// -(float)((rand() % 100) / 100);
}
}
}
changedBase[tid] = 0;
}
//h_MemberShip = (int*)malloc(kSrcCount * sizeof(int));
cudaMallocHost(&h_MemberShipBase, ThreadNum * kSrcCount * sizeof(int));
memset(h_MemberShipBase, 9, ThreadNum * kSrcCount * sizeof(int));
h_MemberCountBase = (int*)malloc(ThreadNum * kClusterCount * sizeof(int));
memset(h_MemberCountBase, 0, ThreadNum * kClusterCount * sizeof(int));
//TODO:init stream
cudaStream_t stm[ThreadNum + 1];
for (size_t i = 0; i < ThreadNum + 1; i++)
{
//cudaStreamCreate(&stm[i]);
cudaStreamCreateWithFlags(&stm[i], cudaStreamNonBlocking);
}
const int StmEventCount = 10;
cudaEvent_t event[(1 + StmEventCount) * ThreadNum];
for (size_t i = 0; i < (1 + StmEventCount) * ThreadNum; i++)
{
cudaEventCreate(&event[i]);
}
//TODO:init device memory
float* d_pSrcBase/*[ThreadNum][Coords][SrcCount]*/,
* d_pClustersBase/*[ThreadNum][ClusterCount][Coords]*/;
int* d_pChangedBase/*[ThreadNum][1]*/,
* d_pMemberShipBase/*[ThreadNum][kSrcCount]*/,
* d_pMemberCountBase/*[ThreadNum][kClusterCount]*/;
cudaMalloc(&d_pSrcBase, ThreadNum * kSrcCount * kCoords * sizeof(float));
cudaMalloc(&d_pClustersBase, ThreadNum * kClusterCount * kCoords * sizeof(float));
cudaMalloc(&d_pMemberShipBase, ThreadNum * kSrcCount * sizeof(int));
cudaMalloc(&d_pMemberCountBase, ThreadNum * kClusterCount * sizeof(int));
cudaMalloc(&d_pChangedBase, ThreadNum * sizeof(int));
const int kUnrollAdvExecuteScale = 8;
const int kUnrollAdvkUnrollScale = 16;
const int preCalcCount = 8;
#pragma omp parallel for num_threads(ThreadNum)
for (int itstm = 0; itstm < ThreadNum; itstm++)//int, not size_t: OpenMP 2.0 requires a signed loop index, and the printf below formats it with %d
{
float* h_Src = h_SrcBase + itstm * kSrcCount * kCoords;
float* d_pSrc = d_pSrcBase + itstm * kSrcCount * kCoords;
int* h_MemberShip = h_MemberShipBase + itstm * kSrcCount;
int* d_pMemberShip = d_pMemberShipBase + itstm * kSrcCount;
int* h_MemberCount = h_MemberCountBase + itstm * kClusterCount;
int* d_pMemberCount = d_pMemberCountBase + itstm * kClusterCount;
float* h_Clusters = h_ClustersBase + itstm * kClusterCount * kCoords;
float* d_pClusters = d_pClustersBase + itstm * kClusterCount * kCoords;
int* changed = changedBase + itstm;
int* d_pChanged = d_pChangedBase + itstm;
//cudaMemcpy(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm[itstm]);
//cudaMemcpy(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberShip, h_MemberShip, kSrcCount * sizeof(int), cudaMemcpyHostToDevice, stm[itstm]);
//cudaMemcpy(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_pMemberCount, h_MemberCount, kClusterCount * sizeof(int), cudaMemcpyHostToDevice, stm[itstm]);
//cudaMemcpy(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice);//issuing this in the default stream forces synchronization with the stm streams and costs several hundred microseconds
cudaMemcpyAsync(d_pClusters, h_Clusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm[itstm]);
cudaEventRecord(event[itstm * StmEventCount], stm[itstm]);
//TODO:find the points
//cudaStreamWaitEvent(stm[itstm], event[ThreadNum * StmEventCount + itstm], 0);//leftover from the previous variant: that event is never recorded in this function, so the wait was a no-op and no cross-stream dependency exists here
int itCount = 1;
for (; itCount < preCalcCount; itCount++)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float), stm[itstm] >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm[itstm]);
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm[itstm]);
cudaEventRecord(event[(itCount % 10) + itstm * StmEventCount], stm[itstm]);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm[itstm]);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale, stm[itstm] >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
//kKmeansSumAtomic << <kGridCount* 1, kBlockSize / 1, (kBlockSize* kCoords / 1 + kClusterCount * kCoords + kClusterCount) * sizeof(float) >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);
kCalcNewClusterCenter << <1, kClusterCount, 0, stm[itstm] >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm[itstm]);
}
cudaEventSynchronize(event[(itCount % 10) + itstm * StmEventCount - 4]);
while ((0 != *changed) && itCount++ < 100)
{
kKmeansClassifyMembershipMath<kCoords, kUnrollAdvkUnrollScale, kClusterCount, kClusterUnroll> << <kGridCount * 1, kBlockSize / kUnrollAdvkUnrollScale, kClusterCount* kCoords * sizeof(float), stm[itstm] >> >
(kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pChanged);
//cudaMemcpy(&iChanged, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpyAsync(changed, d_pChanged, sizeof(int), cudaMemcpyDeviceToHost, stm[itstm]);
cudaEventRecord(event[(itCount % 10) + itstm * StmEventCount], stm[itstm]);//offset by itstm * StmEventCount so threads do not clobber each other's events
//cudaMemset(d_pChanged, 0, sizeof(int));
cudaMemsetAsync(d_pChanged, 0, sizeof(int), stm[itstm]);
//cudaMemset(d_pClusters, 0, kClusterCount * kCoords * sizeof(float));
cudaMemsetAsync(d_pClusters, 0, kClusterCount * kCoords * sizeof(float), stm[itstm]);
kkKmeansUnrollAdv<kBlockSize / kExecuteScale, kUnrollAdvkUnrollScale, kCoords, kClusterCount> << <kGridCount * kExecuteScale / kUnrollAdvkUnrollScale, kBlockSize / kExecuteScale, (kBlockSize * kCoords + kBlockSize) * sizeof(float) / kExecuteScale, stm[itstm] >> > (kSrcCount, kCoords, kClusterCount, d_pSrc, d_pMemberShip, d_pClusters, d_pMemberCount);//s2
kCalcNewClusterCenter << <1, kClusterCount, 0, stm[itstm] >> > (d_pClusters, d_pMemberCount);
cudaMemsetAsync(d_pMemberCount, 0, kClusterCount * sizeof(int), stm[itstm]);
}
printf("tid %d and count %d \n", itstm, itCount);
cudaMemcpyAsync(h_Clusters, d_pClusters, kClusterCount * kCoords * sizeof(float), cudaMemcpyDeviceToHost, stm[itstm]);
cudaStreamSynchronize(stm[itstm]);//the copy is asynchronous: wait for it before reading h_Clusters below
for (size_t i = 0; i < kClusterCount; i++)
{
printf("tid %d and value %f \n", itstm, h_Clusters[i]);
}
}
//free(h_Src);
cudaFreeHost(h_SrcBase);
free(h_ClustersBase);
//free(h_MemberShip);
cudaFreeHost(h_MemberShipBase);
cudaFreeHost(changedBase);
free(h_MemberCountBase);
cudaFree(d_pSrcBase);
cudaFree(d_pClustersBase);
cudaFree(d_pChangedBase);
cudaFree(d_pMemberShipBase);
cudaFree(d_pMemberCountBase);
for (size_t i = 0; i < ThreadNum + 1; i++)
{
cudaStreamDestroy(stm[i]);
}
for (size_t i = 0; i < (1 + StmEventCount) * ThreadNum; i++)
{
cudaEventDestroy(event[i]);
}
}
Scheduling profile
Execution time is 8.3 ms, and overlap is achieved from the second memory copy onward. Repeated tests show, however, that the results of this scheduling approach are not stable.
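Part of that instability can be explained on the host side: *changed is a pinned flag that the stream keeps overwriting, so the value the while condition reads depends on how far the GPU happens to have progressed. A sketch of a more deterministic check using this variant's own event ring (an illustrative adjustment, not something measured here):
// Before testing the flag, wait on the event recorded right after the most
// recent copy of the changed flag, so *changed reflects a completed iteration:
cudaEventSynchronize(event[(itCount % 10) + itstm * StmEventCount]);
if (0 == *changed)
{
	// converged: safe to stop enqueuing further iterations
}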
Summary
Comparing the five runs: adding event synchronization inside the streams to enforce execution order yields comparatively stable scheduling and comparatively fast execution, so it should be the first approach to reach for.
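For reference, here is the preferred pattern condensed into a sketch. launchIteration() is a hypothetical helper standing in for the classify/sum/new-center sequence shown above; the remaining identifiers follow the single-stream asynchronous variant:
// pinned h_Src (cudaMallocHost) so the copy really runs asynchronously
cudaMemcpyAsync(d_pSrc, h_Src, kSrcCount * kCoords * sizeof(float), cudaMemcpyHostToDevice, stm);
int it = 0;
for (; it < preCalcCount; ++it)
{
	launchIteration(stm);//hypothetical: enqueue one k-means step (kernels, memsets, flag copy)
	cudaEventRecord(event[it % EventNum], stm);//ring of events, one per iteration
}
cudaEventSynchronize(event[(it - 5) % EventNum]);//let the host lag a few iterations behind the GPU
while ((0 != *changed) && it++ < 100)
{
	launchIteration(stm);
	cudaEventRecord(event[it % EventNum], stm);
}
cudaStreamSynchronize(stm);//drain the stream before reading results on the host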