本章节翻译by chenshusmail@163.com 原文:Avoiding Redundant Queue Constructions (intel.com)
要在设备上执行 kernel,用户必须创建一个 queue,它引用了关联的 context、平台和设备。 这些可以自动选择,也可以由用户指定。
context 是由用户直接构建的,或者在创建 queue 时隐式构建的, 用于保存 SYCL runtime 和 SYCL 后端操作设备所需的所有 runtime 信息。 当创建一个没有指定 context 的 queue 时,将使用默认构造函数隐式构建一个新的 context。 一般来说,创建一个新的 context 是一项繁重的操作, 因为每次向新 context 的 queue 提交 kernel 时,都需要 JIT 编译程序。 为了获得良好的性能,应用程序中应尽可能少地使用 context。
在以下示例中,在循环内部创建了一个 queue,并将 kernel 提交到这个新的 queue。 这实际上会导致循环的每次迭代都调用一次 JIT 编译器。
// Sums `data` on the device `iter` times, constructing a brand-new sycl::queue
// inside every iteration without passing a context.
//
// This is the intentionally slow variant of the sample: per the accompanying
// text, a queue created without an explicit context implicitly builds a new
// context, and submitting a kernel to a queue of a new context requires JIT
// compilation again. The per-iteration queue construction is therefore kept
// on purpose.
//
// Parameters:
//   data - input vector. Its buffer is first overwritten with 1s on the
//          device, so the reduction sums those 1s rather than the caller's
//          original values.
//   iter - number of timed reduction runs.
//
// Returns the sum read back in the last iteration, or the initial 0 when the
// loop does not run (iter <= 0).
//
// `exception_handler`, `Timer` and `sum_expected` are defined outside this
// excerpt. NOTE(review): presumably sum_expected equals data.size(); confirm.
int reductionMultipleQMultipleC(std::vector<int> &data, int iter) {
  const size_t data_size = data.size();
  int sum = 0;

  // A single work-group performs the whole reduction, so the global range
  // equals the work-group size. 512 is a power of two, which the halving loop
  // in the kernel relies on to fold every partial sum into scratch[0].
  const size_t work_group_size = 512;
  const size_t num_work_items = work_group_size;

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};
  sycl::buffer<int> buf(data.data(), data_size, props);
  sycl::buffer<int> sum_buf(&sum, 1, props);

  sycl::queue q1{sycl::default_selector{}, exception_handler};
  // initialize data on the device
  q1.submit([&](auto &h) {
    sycl::accessor buf_acc(buf, h, sycl::write_only, sycl::no_init);
    h.parallel_for(data_size, [=](auto index) { buf_acc[index] = 1; });
  });

  double elapsed = 0;
  for (int i = 0; i < iter; i++) {
    // Deliberately constructed inside the loop: this is the anti-pattern the
    // sample demonstrates.
    sycl::queue q2{sycl::default_selector{}, exception_handler};
    if (i == 0)
      std::cout << q2.get_device().get_info<sycl::info::device::name>() << "\n";
    // reductionMultipleQMultipleC main begin
    // The timer starts after the queue is constructed and covers the submit,
    // the wait and the host read-back.
    Timer timer;
    q2.submit([&](auto &h) {
      sycl::accessor buf_acc(buf, h, sycl::read_only);
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);
      sycl::accessor<int, 1, sycl::access::mode::read_write,
                     sycl::access::target::local>
          scratch(work_group_size, h);
      h.parallel_for(
          sycl::nd_range<1>{num_work_items, work_group_size},
          [=](sycl::nd_item<1> item) {
            const size_t loc_id = item.get_local_id(0);

            // Each work-item accumulates the elements loc_id,
            // loc_id + num_work_items, ... into a private partial sum.
            int partial = 0;
            for (size_t idx = loc_id; idx < data_size; idx += num_work_items)
              partial += buf_acc[idx];
            scratch[loc_id] = partial;

            // Pairwise reduction in the local scratch array. The barrier runs
            // before every step so the previous step's writes (or the initial
            // store above) are complete before they are read.
            for (size_t stride = work_group_size / 2; stride > 0;
                 stride >>= 1) {
              item.barrier(sycl::access::fence_space::local_space);
              if (loc_id < stride)
                scratch[loc_id] += scratch[loc_id + stride];
            }

            // Work-item 0 publishes the final result.
            if (loc_id == 0)
              sum_acc[0] = scratch[0];
          });
    });
    // reductionMultipleQMultipleC main end
    q2.wait();
    sycl::host_accessor h_acc(sum_buf);
    sum = h_acc[0];
    elapsed += timer.Elapsed();
  }

  // Average time per iteration; skip the division when no iteration ran.
  if (iter > 0)
    elapsed /= iter;

  if (sum == sum_expected)
    std::cout << "SUCCESS: Time reductionMultipleQMultipleC = " << elapsed
              << "s"
              << " sum = " << sum << "\n";
  else
    std::cout << "ERROR: reductionMultipleQMultipleC Expected " << sum_expected
              << " but got " << sum << "\n";
  return sum;
} // end reductionMultipleQMultipleC
上述程序可以通过将 queue 的声明移出循环来重写,这将显著提高性能。
// Sums `data` on the device `iter` times, reusing one sycl::queue that is
// constructed once before the timing loop.
//
// This is the recommended variant of the sample: because the queue (and thus
// its context) is created only once, the kernel is not submitted to a queue
// of a new context on every iteration.
//
// Parameters:
//   data - input vector. Its buffer is first overwritten with 1s on the
//          device, so the reduction sums those 1s rather than the caller's
//          original values.
//   iter - number of timed reduction runs.
//
// Returns the sum read back in the last iteration, or the initial 0 when the
// loop does not run (iter <= 0).
//
// `exception_handler`, `Timer` and `sum_expected` are defined outside this
// excerpt. NOTE(review): presumably sum_expected equals data.size(); confirm.
int reductionSingleQ(std::vector<int> &data, int iter) {
  const size_t data_size = data.size();
  int sum = 0;

  // A single work-group performs the whole reduction, so the global range
  // equals the work-group size. 512 is a power of two, which the halving loop
  // in the kernel relies on to fold every partial sum into scratch[0].
  const size_t work_group_size = 512;
  const size_t num_work_items = work_group_size;

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};
  sycl::buffer<int> buf(data.data(), data_size, props);
  sycl::buffer<int> sum_buf(&sum, 1, props);

  // The only queue in this function; every submission below goes through it.
  sycl::queue q{sycl::default_selector{}, exception_handler};
  std::cout << q.get_device().get_info<sycl::info::device::name>() << "\n";

  // initialize data on the device
  q.submit([&](auto &h) {
    sycl::accessor buf_acc(buf, h, sycl::write_only, sycl::no_init);
    h.parallel_for(data_size, [=](auto index) { buf_acc[index] = 1; });
  });

  double elapsed = 0;
  for (int i = 0; i < iter; i++) {
    // reductionSingleQ main begin
    // The timer covers the submit, the wait and the host read-back.
    Timer timer;
    q.submit([&](auto &h) {
      sycl::accessor buf_acc(buf, h, sycl::read_only);
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);
      sycl::accessor<int, 1, sycl::access::mode::read_write,
                     sycl::access::target::local>
          scratch(work_group_size, h);
      h.parallel_for(
          sycl::nd_range<1>{num_work_items, work_group_size},
          [=](sycl::nd_item<1> item) {
            const size_t loc_id = item.get_local_id(0);

            // Each work-item accumulates the elements loc_id,
            // loc_id + num_work_items, ... into a private partial sum.
            int partial = 0;
            for (size_t idx = loc_id; idx < data_size; idx += num_work_items)
              partial += buf_acc[idx];
            scratch[loc_id] = partial;

            // Pairwise reduction in the local scratch array. The barrier runs
            // before every step so the previous step's writes (or the initial
            // store above) are complete before they are read.
            for (size_t stride = work_group_size / 2; stride > 0;
                 stride >>= 1) {
              item.barrier(sycl::access::fence_space::local_space);
              if (loc_id < stride)
                scratch[loc_id] += scratch[loc_id + stride];
            }

            // Work-item 0 publishes the final result.
            if (loc_id == 0)
              sum_acc[0] = scratch[0];
          });
    });
    // reductionSingleQ main end
    q.wait();
    sycl::host_accessor h_acc(sum_buf);
    sum = h_acc[0];
    elapsed += timer.Elapsed();
  }

  // Average time per iteration; skip the division when no iteration ran.
  if (iter > 0)
    elapsed /= iter;

  if (sum == sum_expected)
    std::cout << "SUCCESS: Time reductionSingleQ = " << elapsed << "s"
              << " sum = " << sum << "\n";
  else
    std::cout << "ERROR: reductionSingleQ Expected " << sum_expected
              << " but got " << sum << "\n";
  return sum;
} // end reductionSingleQ
如果你需要创建多个 queue,尝试在 queue 之间共享 context。这将提高性能。 上面的 kernel 被重写为下面的样子,其中在循环内部创建的新 queue 和循环外部的 queue 共享 context。 在这种情况下,性能与只使用一个 queue 时相同。
// Sums `data` on the device `iter` times, constructing a new sycl::queue in
// every iteration but building each one from q1.get_context(), so all queues
// share the context of the queue created before the loop.
//
// Per the accompanying text, sharing the context this way is expected to
// perform like the single-queue version even though a queue is still created
// per iteration.
//
// Parameters:
//   data - input vector. Its buffer is first overwritten with 1s on the
//          device, so the reduction sums those 1s rather than the caller's
//          original values.
//   iter - number of timed reduction runs.
//
// Returns the sum read back in the last iteration, or the initial 0 when the
// loop does not run (iter <= 0).
//
// `exception_handler`, `Timer` and `sum_expected` are defined outside this
// excerpt. NOTE(review): presumably sum_expected equals data.size(); confirm.
int reductionMultipleQSingleC(std::vector<int> &data, int iter) {
  const size_t data_size = data.size();
  int sum = 0;

  // A single work-group performs the whole reduction, so the global range
  // equals the work-group size. 512 is a power of two, which the halving loop
  // in the kernel relies on to fold every partial sum into scratch[0].
  const size_t work_group_size = 512;
  const size_t num_work_items = work_group_size;

  const sycl::property_list props = {sycl::property::buffer::use_host_ptr()};
  sycl::buffer<int> buf(data.data(), data_size, props);
  sycl::buffer<int> sum_buf(&sum, 1, props);

  // q1 owns the context that every per-iteration queue below reuses.
  sycl::queue q1{sycl::default_selector{}, exception_handler};
  // initialize data on the device
  q1.submit([&](auto &h) {
    sycl::accessor buf_acc(buf, h, sycl::write_only, sycl::no_init);
    h.parallel_for(data_size, [=](auto index) { buf_acc[index] = 1; });
  });

  double elapsed = 0;
  for (int i = 0; i < iter; i++) {
    // A new queue per iteration, but bound to q1's existing context.
    sycl::queue q2{q1.get_context(), sycl::default_selector{},
                   exception_handler};
    if (i == 0)
      std::cout << q2.get_device().get_info<sycl::info::device::name>() << "\n";
    // reductionMultipleQSingleC main begin
    // The timer starts after the queue is constructed and covers the submit,
    // the wait and the host read-back.
    Timer timer;
    q2.submit([&](auto &h) {
      sycl::accessor buf_acc(buf, h, sycl::read_only);
      sycl::accessor sum_acc(sum_buf, h, sycl::write_only, sycl::no_init);
      sycl::accessor<int, 1, sycl::access::mode::read_write,
                     sycl::access::target::local>
          scratch(work_group_size, h);
      h.parallel_for(
          sycl::nd_range<1>{num_work_items, work_group_size},
          [=](sycl::nd_item<1> item) {
            const size_t loc_id = item.get_local_id(0);

            // Each work-item accumulates the elements loc_id,
            // loc_id + num_work_items, ... into a private partial sum.
            int partial = 0;
            for (size_t idx = loc_id; idx < data_size; idx += num_work_items)
              partial += buf_acc[idx];
            scratch[loc_id] = partial;

            // Pairwise reduction in the local scratch array. The barrier runs
            // before every step so the previous step's writes (or the initial
            // store above) are complete before they are read.
            for (size_t stride = work_group_size / 2; stride > 0;
                 stride >>= 1) {
              item.barrier(sycl::access::fence_space::local_space);
              if (loc_id < stride)
                scratch[loc_id] += scratch[loc_id + stride];
            }

            // Work-item 0 publishes the final result.
            if (loc_id == 0)
              sum_acc[0] = scratch[0];
          });
    });
    // reductionMultipleQSingleC main end
    q2.wait();
    sycl::host_accessor h_acc(sum_buf);
    sum = h_acc[0];
    elapsed += timer.Elapsed();
  }

  // Average time per iteration; skip the division when no iteration ran.
  if (iter > 0)
    elapsed /= iter;

  if (sum == sum_expected)
    std::cout << "SUCCESS: Time reductionMultipleQSingleContext = " << elapsed
              << "s"
              << " sum = " << sum << "\n";
  else
    std::cout << "ERROR: reductionMultipleQSingleContext Expected "
              << sum_expected << " but got " << sum << "\n";
  return sum;
} // end reductionMultipleQSingleC