#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <list>
#include <vector>
#include "Graph.h"
#include "ArrayUtil.h"
#include "TreeNode.h"
#include "TreeUtil.h"
using namespace std;
// Prints a summary of the hardware properties of every CUDA device the runtime reports.
// A failing CUDA query is reported on stderr instead of being silently ignored: if the device
// count cannot be obtained nothing is printed, and a device whose properties cannot be read
// is skipped.
void print_GPU_device_info(){
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess) {
        cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(status) << endl;
        return;
    }
    for (int i = 0; i < deviceCount; i++)
    {
        cudaDeviceProp devProp;
        status = cudaGetDeviceProperties(&devProp, i);
        if (status != cudaSuccess) {
            cerr << "cudaGetDeviceProperties failed for device " << i << ": "
                 << cudaGetErrorString(status) << endl;
            continue;
        }
        cout << "使用GPU device " << i << ": " << devProp.name << endl;
        cout << "设备全局内存总量: " << devProp.totalGlobalMem / 1024 / 1024 << "MB" << endl;
        cout << "SM的数量:" << devProp.multiProcessorCount << endl;
        cout << "每个SM的最大线程数:" << devProp.maxThreadsPerMultiProcessor << endl;
        // Use the warp size reported by the device rather than a hard-coded 32.
        cout << "每个SM的最大线程束数:" << devProp.maxThreadsPerMultiProcessor / devProp.warpSize << endl;
        cout << "每个Block的共享内存大小:" << devProp.sharedMemPerBlock / 1024.0 << " KB" << endl;
        cout << "每个Block的最大线程数:" << devProp.maxThreadsPerBlock << endl;
        cout << "每个Block中可用的32位寄存器数量: " << devProp.regsPerBlock << endl;
        cout << "======================================================" << endl;
    }
}
/*
 * Kernel: for every tree, marks the nodes that are adjacent to that tree's current border nodes.
 *
 * Indexing used by the body: blockIdx.x selects the tree, threadIdx.x selects one row of
 * dev_Array2d. threadIdx.x is not bounds-checked, so the launch must not use more threads per
 * block than dev_Array2d has rows.
 *
 * dev_border  - border lists of all trees stored back to back; each list is preceded by a header
 *               holding its length as a negative number
 * dev_flag2d  - dev_flag2d[node][tree] is written with 1 when `node` is the second entry of a row
 *               whose first entry is a border node of `tree`
 * dev_Array2d - rows of two ints; [0] is compared against the border nodes, [1] is the node marked
 */
__global__ void countNextFlag(int* dev_border, int** dev_flag2d, int** dev_Array2d) {
    const int tree = blockIdx.x;   // which tree this block handles
    const int edge = threadIdx.x;  // which row of dev_Array2d this thread handles

    // Segment of tree 0: the header sits at index 0, its nodes occupy [1, length].
    int first = 1;
    int last = -dev_border[0];

    // Hop from header to header until the segment belonging to `tree` is reached.
    for (int skipped = 0; skipped < tree; ++skipped) {
        const int header = last + 1;        // header of the next segment follows the previous one
        first = header + 1;
        last = header - dev_border[header]; // header value is negative, so this adds the length
    }

    // Compare this thread's row against every border node of the tree.
    int cursor = first;
    while (cursor <= last) {
        if (dev_border[cursor] == dev_Array2d[edge][0]) {
            dev_flag2d[dev_Array2d[edge][1]][tree] = 1;
        }
        ++cursor;
    }
}
// Builds a Graph interactively from standard input (intended to be replaced by file input later).
//
// Reads the node count and edge count, then one pair of endpoints per edge, and records every
// edge symmetrically in an n x n matrix obtained from ArrayUtil::getTwoArray. The matrix is
// stored in graph.arrays, graph.sum is set from ArrayUtil::getSparseArrayRow, and the matrix is
// printed before returning.
//
// Endpoints are validated before they are used as indices: an out-of-range pair is rejected and
// the same edge is asked for again. If the input stream fails or ends, the remaining edges are
// skipped and graph.m is reduced to the number of edges actually read.
static Graph initGraph() {
    Graph graph;
    cout << "请输入图的节点数:";
    cin >> graph.n;
    cout << "请输入图的边数:";
    cin >> graph.m;
    cout << "节点编号从0开始,请分别输入各条边的两端节点:" << endl;
    int** arrays = ArrayUtil::getTwoArray(graph.n, graph.n);

    int edgesRead = 0;
    // With no nodes there is no valid endpoint, so asking for edges could never succeed.
    while (graph.n > 0 && edgesRead < graph.m) {
        const int edgeNo = edgesRead + 1;
        int M1 = -1, M2 = -1;

        cout << "边(" << edgeNo << ")连接的一个节点为:";
        bool readOk = static_cast<bool>(cin >> M1);
        if (readOk) {
            cout << "边(" << edgeNo << ")连接的另一个节点为:";
            readOk = static_cast<bool>(cin >> M2);
        }
        if (!readOk) {
            // A failed stream would otherwise make every later read fail too.
            cerr << "输入无效或已结束,停止读取剩余的边" << endl;
            break;
        }
        if (M1 < 0 || M1 >= graph.n || M2 < 0 || M2 >= graph.n) {
            // Writing arrays[M1][M2] with such values would be out of bounds.
            cout << "节点编号超出范围(0-" << graph.n - 1 << "),请重新输入这条边" << endl;
            continue;
        }

        arrays[M1][M2] = 1;
        arrays[M2][M1] = 1;
        edgesRead++;
    }
    if (edgesRead < graph.m) graph.m = edgesRead;

    graph.arrays = arrays;
    graph.sum = ArrayUtil::getSparseArrayRow(arrays, graph.n, graph.n);
    ArrayUtil::printArray(arrays, graph.n, graph.n);
    cout << "________________________" << endl;
    return graph;
}
// Evaluates a CUDA runtime call and terminates the process with a diagnostic when it fails.
#define IBFS_CUDA_CHECK(call)                                                   \
    do {                                                                        \
        cudaError_t ibfsStatus_ = (call);                                       \
        if (ibfsStatus_ != cudaSuccess) {                                       \
            cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": "     \
                 << cudaGetErrorString(ibfsStatus_) << endl;                    \
            exit(EXIT_FAILURE);                                                 \
        }                                                                       \
    } while (0)

/*
 * Entry point: reads a graph and a set of root nodes, then grows one tree per root level by
 * level. In every round the countNextFlag kernel marks the neighbours of each tree's border
 * nodes on the device; the host then attaches every newly marked node to a border node it is
 * adjacent to.
 *
 * The loop stops when every flag is 1, or when a round adds no node to any tree (which happens
 * when some node cannot be reached from a root).
 *
 * Returns 0 on success and 1 on invalid input; a failing CUDA call terminates the process.
 */
int main() {
    cout << "欢迎使用iBFS算法" << endl;
    Graph graph = initGraph();
    if (graph.n <= 0) {
        cerr << "图中没有节点,无法生成生成树" << endl;
        return 1;
    }

    cout << "要获取几个节点的生成树:";
    int resultNum = 0;
    if (!(cin >> resultNum) || resultNum <= 0) {
        cerr << "生成树的数量必须为正整数" << endl;
        return 1;
    }

    cout << "请输入要获得生成树的节点编号,编号从0开始:" << endl;
    TreeNode* resultTrees = new TreeNode[resultNum];  // one tree per requested root
    for (int i = 1; i <= resultNum; ) {
        cout << "节点" << i << "编号:";
        int index;
        if (!(cin >> index)) {
            cerr << "输入无效或已结束" << endl;
            delete[] resultTrees;
            return 1;
        }
        if (index < 0 || index >= graph.n) {
            // The index is used below as a row of `flag`; reject anything outside the graph.
            cout << "节点编号超出范围(0-" << graph.n - 1 << "),请重新输入" << endl;
            continue;
        }
        resultTrees[i - 1] = TreeNode(index);
        ++i;
    }

    const size_t nodeCount = static_cast<size_t>(graph.n);
    const size_t treeCount = static_cast<size_t>(resultNum);
    const size_t flagCount = nodeCount * treeCount;

    // Host flags: flag[node][tree] == 1 once `node` belongs to `tree`. Roots start as 1.
    int** flag = ArrayUtil::getTwoArray(graph.n, resultNum);
    for (int i = 0; i < resultNum; i++)
        flag[resultTrees[i].nodeIndex][i] = 1;

    // Row-major flat copy of the flags; this is the buffer exchanged with the device.
    vector<int> flagRow(flagCount);
    {
        size_t pos = 0;
        for (int k = 0; k < graph.n; k++)
            for (int j = 0; j < resultNum; j++) flagRow[pos++] = flag[k][j];
    }

    // Device flags: one flat buffer plus a table of row pointers into it.
    int* dev_flag1d = nullptr;
    int** dev_flag2d = nullptr;
    IBFS_CUDA_CHECK(cudaMalloc((void**)&dev_flag1d, sizeof(int) * flagCount));
    IBFS_CUDA_CHECK(cudaMalloc((void**)&dev_flag2d, sizeof(int*) * nodeCount));
    {
        vector<int*> flagRowPtrs(nodeCount);
        for (size_t i = 0; i < nodeCount; i++) flagRowPtrs[i] = dev_flag1d + i * treeCount;
        IBFS_CUDA_CHECK(cudaMemcpy(dev_flag1d, flagRow.data(), sizeof(int) * flagCount, cudaMemcpyHostToDevice));
        IBFS_CUDA_CHECK(cudaMemcpy(dev_flag2d, flagRowPtrs.data(), sizeof(int*) * nodeCount, cudaMemcpyHostToDevice));
    }

    // Edge list used by the host-side expansion.
    int** cpu_sparseArray = ArrayUtil::getSparseArray(graph.arrays, graph.n, graph.n, graph.sum);

    // Edge list for the device: graph.sum rows of {i, j}, without any header row.
    // NOTE(review): graph.sum comes from ArrayUtil::getSparseArrayRow, whose exact meaning is not
    // visible here. Rows are pre-filled with -1 (never a node index) and the fill is bounded, so
    // a mismatch with the number of non-zero matrix entries can neither overflow the buffer nor
    // feed uninitialised rows to the kernel.
    const size_t edgeRows = graph.sum > 0 ? static_cast<size_t>(graph.sum) : 0;
    int* dev_Array1d = nullptr;
    int** dev_Array2d = nullptr;
    if (edgeRows > 0) {
        vector<int> ArrayRow(edgeRows * 2, -1);
        size_t filled = 0;
        for (int i = 0; i < graph.n; i++)
            for (int j = 0; j < graph.n; j++)
                if (graph.arrays[i][j] != 0 && filled + 2 <= ArrayRow.size()) {
                    ArrayRow[filled++] = i;
                    ArrayRow[filled++] = j;
                }

        IBFS_CUDA_CHECK(cudaMalloc((void**)&dev_Array1d, sizeof(int) * edgeRows * 2));
        IBFS_CUDA_CHECK(cudaMalloc((void**)&dev_Array2d, sizeof(int*) * edgeRows));
        vector<int*> edgeRowPtrs(edgeRows);
        for (size_t i = 0; i < edgeRows; i++) edgeRowPtrs[i] = dev_Array1d + i * 2;
        IBFS_CUDA_CHECK(cudaMemcpy(dev_Array1d, ArrayRow.data(), sizeof(int) * edgeRows * 2, cudaMemcpyHostToDevice));
        IBFS_CUDA_CHECK(cudaMemcpy(dev_Array2d, edgeRowPtrs.data(), sizeof(int*) * edgeRows, cudaMemcpyHostToDevice));
    }

    cout << "开始生成..." << endl;
    clock_t start_time, end_time;
    start_time = clock();

    // Buffers reused across rounds instead of being reallocated (and leaked) every iteration.
    int* dev_border = nullptr;
    size_t borderCapacity = 0;
    vector<int> borderArray;
    vector<list<TreeNode*> > frontiers(treeCount);

    bool finished = false;
    while (!finished) {
        // Collect the border nodes of every tree once per round. They are flattened into one
        // array in which each tree's list is preceded by its length stored as a negative number.
        borderArray.clear();
        for (int r = 0; r < resultNum; r++) {
            frontiers[r].clear();
            TreeUtil::addChildNode(frontiers[r], resultTrees[r]);
            borderArray.push_back(-static_cast<int>(frontiers[r].size()));
            for (TreeNode* t : frontiers[r])
                borderArray.push_back(t->nodeIndex);
        }

        // Grow the device border buffer only when this round needs more room.
        if (borderArray.size() > borderCapacity) {
            IBFS_CUDA_CHECK(cudaFree(dev_border));
            dev_border = nullptr;
            IBFS_CUDA_CHECK(cudaMalloc((void**)&dev_border, sizeof(int) * borderArray.size()));
            borderCapacity = borderArray.size();
        }
        IBFS_CUDA_CHECK(cudaMemcpy(dev_border, borderArray.data(), sizeof(int) * borderArray.size(), cudaMemcpyHostToDevice));

        if (edgeRows > 0) {
            // One block per tree, one thread per edge row. The kernel indexes edges by
            // threadIdx.x only, so graph.sum must not exceed the device's per-block thread
            // limit; a violation is reported by the launch check below.
            countNextFlag<<<resultNum, graph.sum>>>(dev_border, dev_flag2d, dev_Array2d);
            IBFS_CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
            IBFS_CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran
        }
        IBFS_CUDA_CHECK(cudaMemcpy(flagRow.data(), dev_flag1d, sizeof(int) * flagCount, cudaMemcpyDeviceToHost));

        // Attach every node that the device marked this round (host flag still 0, device flag 1)
        // to a border node it is adjacent to. flagRow is row-major, so the device flag of
        // (node, tree) is flagRow[node * treeCount + tree].
        bool grew = false;
        for (int r = 0; r < resultNum; r++) {
            for (TreeNode* borderNode : frontiers[r]) {
                for (int j = 0; j < graph.sum; j++) {
                    if (cpu_sparseArray[j][0] != borderNode->nodeIndex) continue;
                    const int neighbor = cpu_sparseArray[j][1];
                    if (flag[neighbor][r] == 0 &&
                        flagRow[static_cast<size_t>(neighbor) * treeCount + r] == 1) {
                        borderNode->childrenNodes.push_back(TreeNode(neighbor));
                        flag[neighbor][r] = 1;  // prevents attaching the same node twice
                        grew = true;
                    }
                }
            }
        }

        if (ArrayUtil::arrayAllEqOne(flag, graph.n, resultNum)) {
            finished = true;
        } else if (!grew) {
            // No tree gained a node, so the next round would see the same borders and the same
            // flags: the remaining nodes are unreachable and looping further would never end.
            cout << "存在无法到达的节点(图不连通),提前结束生成" << endl;
            finished = true;
        }
    }
    end_time = clock();
    cout << "生成完毕,耗时" << float(end_time - start_time) / CLOCKS_PER_SEC << "秒" << endl;
    for (int i = 0; i < resultNum; i++) {
        cout << "第" << (i + 1) << "棵生成树如下:" << endl;
        TreeUtil::printTree(resultTrees[i]);
        cout << "\n------------------------------------" << endl;
    }

    // Release what this function allocated. The arrays returned by ArrayUtil are left alone
    // because how they were allocated is not visible here.
    IBFS_CUDA_CHECK(cudaFree(dev_border));
    IBFS_CUDA_CHECK(cudaFree(dev_Array2d));
    IBFS_CUDA_CHECK(cudaFree(dev_Array1d));
    IBFS_CUDA_CHECK(cudaFree(dev_flag2d));
    IBFS_CUDA_CHECK(cudaFree(dev_flag1d));
    delete[] resultTrees;

    // Keep the console open until the user presses Enter, without spinning the CPU.
    cout << "按回车键退出..." << endl;
    char pending;
    while (cin.get(pending) && pending != '\n') {}  // drop the rest of the last input line
    cin.get();
    return 0;
}
【无标题】
最新推荐文章于 2024-08-08 15:12:59 发布
该代码示例展示了如何使用CUDA对图算法进行并行处理,具体实现了基于GPU的多棵生成树并发广度优先搜索(iBFS)。代码中定义了打印GPU设备信息的函数`print_GPU_device_info`(主流程并未调用),主程序通过用户输入创建图结构,并在每轮迭代中利用核函数`countNextFlag`在GPU上标记各生成树边界节点的邻居。随后,主机端通过比较迭代前后的标志数组,把新标记的节点挂接到对应的生成树上,直至所有节点都被包含在每棵生成树中。
摘要由CSDN通过智能技术生成