// oneTBB references:
// https://cdrdv2-public.intel.com/772402/onetbb_cookbook_2021.6-772401-772402.pdf
// https://cdrdv2-public.intel.com/816904/onetbb_get-started-guide_2021.12-772618-816904.pdf
// https://blog.51cto.com/shijianfeng/5152901
// Homogeneous, multi-process:
// https://github.com/microsoft/Microsoft-MPI  // Message Passing Interface (MPI)
// https://github.com/rabauke/mpl              // similar: Boost.MPI
// MPICH | High-Performance Portable MPI
// Homogeneous, multi-threaded task computing:
// https://github.com/ClearLove27149/TaskCpp
// https://github.com/ddemidov/compute                   // GPU computing
// https://github.com/oneapi-src/oneTBB#stable-releases  // TBB
// https://github.com/intel/tbb                          // TBB
// Heterogeneous, parallel task computing with workflow support:
// https://taskflow.github.io/taskflow/index.html
// https://github.com/taskflow/taskflow.git
#include <taskflow/taskflow.hpp>

#include <cstdio>
#include <cstdlib>
#include <future>
#include <iostream>
#include <vector>
// // Offload Tasks to a GPU
//__global__ void saxpy(int n, float a, float* x, float* y) {
// int i = blockIdx.x * blockDim.x + threadIdx.x;
// if (i < n) {
// y[i] = a * x[i] + y[i];
// }
//}
int test() {
tf::Executor executor;
tf::Taskflow taskflow;
auto [A, B, C, D] = taskflow.emplace( // create four tasks
[]() { std::cout << "TaskA\n"; },
[]() { std::cout << "TaskB\n"; },
[]() { std::cout << "TaskC\n"; },
[]() { std::cout << "TaskD\n"; }
);
tf::Task E = taskflow.emplace([](tf::Subflow& subflow) { // subflow task B
tf::Task B1 = subflow.emplace([]() {}).name("B1");
tf::Task B2 = subflow.emplace([]() {}).name("B2");
tf::Task B3 = subflow.emplace([]() {}).name("B3");
B3.succeed(B1, B2); // B3 runs after B1 and B2
}).name("B");
A.precede(B, C); // A runs before B and C
D.succeed(B, C, E); // D runs after B and C
// creates a feedback loop {0: cond, 1: stop}
tf::Task stop = taskflow.emplace([]() {}).name("stop");
tf::Task cond = taskflow.emplace([]() { return std::rand() % 2; }).name("cond");
cond.precede(cond, stop); // moves on to 'cond' on returning 0, or 'stop' on 1
//tf::Task cudaflow = taskflow.emplace([&](tf::cudaFlow& cf) {
// tf::cudaTask h2d_x = cf.copy(dx, hx.data(), N).name("h2d_x");
// tf::cudaTask h2d_y = cf.copy(dy, hy.data(), N).name("h2d_y");
// tf::cudaTask d2h_x = cf.copy(hx.data(), dx, N).name("d2h_x");
// tf::cudaTask d2h_y = cf.copy(hy.data(), dy, N).name("d2h_y");
// tf::cudaTask saxpy = cf.kernel((N + 255) / 256, 256, 0, saxpy, N, 2.0f, dx, dy)
// .name("saxpy"); // parameters to the saxpy kernel
// saxpy.succeed(h2d_x, h2d_y)
// .precede(d2h_x, d2h_y);
// }).name("cudaFlow");
// Compose Task Graphs
tf::Taskflow f1, f2;
// create taskflow f1 of two tasks
tf::Task f1A = f1.emplace([]() { std::cout << "Task f1A\n"; }).name("f1A");
tf::Task f2B = f2.emplace([]() { std::cout << "Task f2B\n"; }).name("f2B");
tf::Task f1_module_task = f2.composed_of(f1).name("module");
f1_module_task.succeed(f1A, f2B)
.precede(f1A);
// tf::Future<void> run_once = executor.run(taskflow);
// run_once.get(); // wait on this run to finish
executor.run(taskflow).wait();
// create asynchronous tasks directly from an executor
std::future<int> future = executor.async([]() {
std::cout << "async task returns 1\n";
return 1;
});
executor.silent_async([]() { std::cout << "async task does not return\n"; });
// create asynchronous tasks with dynamic dependencies
tf::AsyncTask AA = executor.silent_dependent_async([]() { printf("A\n"); });
executor.wait_for_all();
// run the taskflow four times
executor.run_n(taskflow, 4);
// runs the taskflow five times
executor.run_until(taskflow, [ counter = 5]() mutable { return --counter == 0; });
// standard parallel CPU algorithms
//std::vector<int> data{ 8, 32, 65, 64, 7 };
//tf::Task task1 = taskflow.for_each( // assign each element to 100 in parallel
// data.begin(), data.end(), [](auto& i) { i = 100; }
//);
//tf::Task task2 = taskflow.reduce( // reduce a range of items in parallel
// data.begin(), data.end(), data.begin(), [](auto a, auto b) { return a + b; }
//);
//tf::Task task3 = taskflow.sort( // sort a range of items in parallel
// data.begin(), data.end(), [](auto a, auto b) { return a < b; }
//);
create a pipeline to propagate five tokens through three serial stages
//tf::Pipeline pl("num_lines",
// tf::Pipe{ tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
// if (pf.token() == 5) {
// pf.stop();
// }
// } },
// tf::Pipe{ tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
// printf("stage 2: input buffer[%zu] = %d\n", pf.line(), buffer[pf.line()]);
// } },
// tf::Pipe{ tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
// printf("stage 3: input buffer[%zu] = %d\n", pf.line(), buffer[pf.line()]);
// } }
//);
//taskflow.composed_of(pl)
// executor.run(taskflow).wait();
// dump the graph to a DOT file through std::cout
taskflow.dump(std::cout);
return 0;
}