一. linux kernel的eventfd机制
参考:《让事件飞 ——Linux eventfd 原理与实践》- 知乎 (zhihu.com):https://zhuanlan.zhihu.com/p/40572954
在Linux系统中,eventfd是一个用来通知事件的文件描述符,是一种内核向用户空间的应用发送通知的机制,可以有效地被用来实现用户空间的事件/通知驱动的应用程序。
二.ROCT的event机制
ROCT event的本质也是内核(KMD)向用户空间的应用发送通知的机制。
1.ROCT event类型
/*
 * Event types that can be requested when creating a ROCT/KFD event.
 * hsaKmtCreateEvent rejects any type >= HSA_EVENTTYPE_MAXID.
 */
typedef enum _HSA_EVENTTYPE
{
HSA_EVENTTYPE_SIGNAL = 0, // user-mode generated GPU signal
HSA_EVENTTYPE_NODECHANGE = 1, // HSA node change (attach/detach)
HSA_EVENTTYPE_DEVICESTATECHANGE = 2, // HSA device state change (start/stop)
HSA_EVENTTYPE_HW_EXCEPTION = 3, // GPU shader exception event
HSA_EVENTTYPE_SYSTEM_EVENT = 4, // GPU SYSCALL with parameter info
HSA_EVENTTYPE_DEBUG_EVENT = 5, // GPU signal for debugging
HSA_EVENTTYPE_PROFILE_EVENT = 6, // GPU signal for profiling
HSA_EVENTTYPE_QUEUE_EVENT = 7, // GPU signal queue idle state (EOP pm4)
HSA_EVENTTYPE_MEMORY = 8, // GPU signal for memory access faults and memory subsystem issues
// ... (NOTE(review): presumably further event types were omitted from this excerpt - confirm against the header)
HSA_EVENTTYPE_MAXID, // one past the last valid event type; used as an upper bound
HSA_EVENTTYPE_TYPE_SIZE = 0xFFFFFFFF // NOTE(review): presumably pads the enum to 32 bits - confirm
} HSA_EVENTTYPE;
2.ROCT event应用之——线程同步
(1) HSA_EVENTTYPE_SIGNAL
(2) HSA_EVENTTYPE_DEBUG_EVENT
这两种类型的event的共同点——用于线程间同步;
这两种类型的event的不同点——HSA_EVENTTYPE_SIGNAL可以通过调用hsaKmtSetEvent(HsaEvent *Event)或者硬件中断触发;HSA_EVENTTYPE_DEBUG_EVENT只能通过硬件中断触发。
3.问题1:为什么线程同步要用event??? 为什么不能用POSIX线程库API?
因为这里的线程是UMD线程,每个线程的运行都是为硬件服务的。所以,线程之间的同步必须和当前硬件的状态一致,也就是说这里线程之间的同步依赖于从内核(KMD)获取某些硬件信息。
4.问题2:对于HSA_EVENTTYPE_SIGNAL中的signal如何理解?
三.ROCT event的源码分析
/*
* Copyright © 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including
* the next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "libhsakmt.h"
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>
#include "linux/kfd_ioctl.h"
#include "fmm.h"
/*
 * Base address of the event slot page shared by all events of this process.
 * NULL until hsaKmtCreateEvent allocates or maps it.
 */
static HSAuint64 *events_page = NULL;
/*
 * Forget the cached events_page pointer so that the next hsaKmtCreateEvent
 * call sets the page up again. Only the pointer is cleared here; the
 * underlying memory is neither unmapped nor freed by this function.
 */
void clear_events_page(void)
{
events_page = NULL;
}
/*
 * Tell whether an event type is "system defined", i.e. anything other than
 * a signal event. Debug events behave as signal events, so they are not
 * treated as system events either.
 */
static bool IsSystemEventType(HSA_EVENTTYPE type)
{
	switch (type) {
	case HSA_EVENTTYPE_SIGNAL:
	case HSA_EVENTTYPE_DEBUG_EVENT:
		return false;
	default:
		return true;
	}
}
/*
 * hsaKmtCreateEvent - create a KFD event and return a user-mode handle to it.
 *
 * EventDesc:   describes the event; its EventType, NodeId and SyncVar fields
 *              are copied into the ioctl arguments / the new HsaEvent.
 *              EventType must be below HSA_EVENTTYPE_MAXID.
 * ManualReset: passed to the kernel inverted, as args.auto_reset.
 *              Notes from the original author (kernel side is not visible
 *              here - confirm against the KFD driver): the driver keeps a
 *              per-event state flag. With ManualReset the flag is set by
 *              hsaKmtSetEvent, checked by hsaKmtWaitOnEvent and cleared by
 *              hsaKmtResetEvent; without ManualReset the driver clears the
 *              flag automatically.
 * IsSignaled:  when true and the event is a signal/debug event,
 *              AMDKFD_IOC_SET_EVENT is issued right after creation, so the
 *              event starts out signaled; otherwise hsaKmtSetEvent has to be
 *              called to signal it.
 * Event:       out parameter; receives the new event on success and is set
 *              to NULL on every failure path.
 *
 * Returns HSAKMT_STATUS_SUCCESS on success,
 * HSAKMT_STATUS_INVALID_PARAMETER for a NULL argument or an out-of-range
 * event type, and HSAKMT_STATUS_ERROR for allocation, ioctl or mmap failures.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc,
					  bool ManualReset, bool IsSignaled,
					  HsaEvent **Event)
{
	unsigned int event_limit = KFD_SIGNAL_EVENT_LIMIT;

	CHECK_KFD_OPEN();

	if (!EventDesc || !Event)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Never leave the caller with a stale handle when creation fails. */
	*Event = NULL;

	if (EventDesc->EventType >= HSA_EVENTTYPE_MAXID)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	HsaEvent *e = calloc(1, sizeof(*e));

	if (!e)
		return HSAKMT_STATUS_ERROR;

	struct kfd_ioctl_create_event_args args = {0};

	args.event_type = EventDesc->EventType;
	args.node_id = EventDesc->NodeId;
	args.auto_reset = !ManualReset;

	/* events_page is shared by all callers, so set it up under the lock. */
	pthread_mutex_lock(&hsakmt_mutex);

	/* dGPU code: user mode allocates the event page and hands its handle
	 * to the kernel through event_page_offset.
	 */
	if (is_dgpu && !events_page) {
		events_page = allocate_exec_aligned_memory_gpu(
			KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false, true);
		if (!events_page) {
			pthread_mutex_unlock(&hsakmt_mutex);
			/* The event object is not published yet; release it. */
			free(e);
			return HSAKMT_STATUS_ERROR;
		}
		fmm_get_handle(events_page, (uint64_t *)&args.event_page_offset);
	}

	if (kmtIoctl(kfd_fd, AMDKFD_IOC_CREATE_EVENT, &args) != 0) {
		free(e);
		pthread_mutex_unlock(&hsakmt_mutex);
		return HSAKMT_STATUS_ERROR;
	}

	e->EventId = args.event_id;

	/* No page yet but the kernel reported an offset: map the event page
	 * from the KFD file descriptor.
	 */
	if (!events_page && args.event_page_offset > 0) {
		events_page = mmap(NULL, event_limit * 8, PROT_WRITE | PROT_READ,
				   MAP_SHARED, kfd_fd, args.event_page_offset);
		if (events_page == MAP_FAILED) {
			/* old kernels only support 256 events */
			event_limit = 256;
			events_page = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ,
					   MAP_SHARED, kfd_fd, args.event_page_offset);
		}
		if (events_page == MAP_FAILED) {
			events_page = NULL;
			pthread_mutex_unlock(&hsakmt_mutex);
			/* The kernel event already exists; destroy it again. */
			hsaKmtDestroyEvent(e);
			return HSAKMT_STATUS_ERROR;
		}
	}

	pthread_mutex_unlock(&hsakmt_mutex);

	/* HWData2 is the address of this event's slot inside the event page. */
	if (args.event_page_offset > 0 && args.event_slot_index < event_limit)
		e->EventData.HWData2 = (HSAuint64)&events_page[args.event_slot_index];

	e->EventData.EventType = EventDesc->EventType;
	e->EventData.HWData1 = args.event_id;
	e->EventData.HWData3 = args.event_trigger_data;
	e->EventData.EventData.SyncVar.SyncVar.UserData =
		EventDesc->SyncVar.SyncVar.UserData;
	e->EventData.EventData.SyncVar.SyncVarSize =
		EventDesc->SyncVar.SyncVarSize;

	/* Create the event in the signaled state by issuing
	 * AMDKFD_IOC_SET_EVENT right away. The ioctl result is not checked,
	 * which matches the previous behavior.
	 */
	if (IsSignaled && !IsSystemEventType(e->EventData.EventType)) {
		struct kfd_ioctl_set_event_args set_args = {0};

		set_args.event_id = args.event_id;
		kmtIoctl(kfd_fd, AMDKFD_IOC_SET_EVENT, &set_args);
	}

	*Event = e;

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * hsaKmtDestroyEvent - destroy the kernel event behind Event and free the
 * user-mode HsaEvent object.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL Event and
 * HSAKMT_STATUS_ERROR when the destroy ioctl fails; in the latter case the
 * HsaEvent object is left untouched and is not freed.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtDestroyEvent(HsaEvent *Event)
{
	CHECK_KFD_OPEN();

	if (Event == NULL)
		return HSAKMT_STATUS_INVALID_HANDLE;

	struct kfd_ioctl_destroy_event_args destroy_args = {
		.event_id = Event->EventId,
	};

	if (kmtIoctl(kfd_fd, AMDKFD_IOC_DESTROY_EVENT, &destroy_args) != 0)
		return HSAKMT_STATUS_ERROR;

	/* Only release the handle once the kernel event is gone. */
	free(Event);

	return HSAKMT_STATUS_SUCCESS;
}
/*
 * hsaKmtSetEvent - signal an event from user mode via AMDKFD_IOC_SET_EVENT.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL Event and
 * HSAKMT_STATUS_ERROR for system-defined event types or when the ioctl
 * returns -1.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtSetEvent(HsaEvent *Event)
{
	CHECK_KFD_OPEN();

	if (Event == NULL)
		return HSAKMT_STATUS_INVALID_HANDLE;

	/* The spec does not say so explicitly, but system-defined events
	 * must not be signaled from user mode.
	 */
	if (IsSystemEventType(Event->EventData.EventType))
		return HSAKMT_STATUS_ERROR;

	struct kfd_ioctl_set_event_args set_args = {
		.event_id = Event->EventId,
	};

	return (kmtIoctl(kfd_fd, AMDKFD_IOC_SET_EVENT, &set_args) == -1)
		? HSAKMT_STATUS_ERROR
		: HSAKMT_STATUS_SUCCESS;
}
/*
 * hsaKmtResetEvent - reset an event from user mode via
 * AMDKFD_IOC_RESET_EVENT.
 *
 * Returns HSAKMT_STATUS_INVALID_HANDLE for a NULL Event and
 * HSAKMT_STATUS_ERROR for system-defined event types or when the ioctl
 * returns -1.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtResetEvent(HsaEvent *Event)
{
	CHECK_KFD_OPEN();

	if (Event == NULL)
		return HSAKMT_STATUS_INVALID_HANDLE;

	/* The spec does not say so explicitly, but system-defined events
	 * must not be reset from user mode.
	 */
	if (IsSystemEventType(Event->EventData.EventType))
		return HSAKMT_STATUS_ERROR;

	struct kfd_ioctl_reset_event_args reset_args = {
		.event_id = Event->EventId,
	};

	return (kmtIoctl(kfd_fd, AMDKFD_IOC_RESET_EVENT, &reset_args) == -1)
		? HSAKMT_STATUS_ERROR
		: HSAKMT_STATUS_SUCCESS;
}
/*
 * hsaKmtQueryEventState - validate an event handle.
 *
 * No state is actually queried here: the function only checks that KFD is
 * open and that Event is non-NULL.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtQueryEventState(HsaEvent *Event)
{
	CHECK_KFD_OPEN();

	return (Event != NULL) ? HSAKMT_STATUS_SUCCESS
			       : HSAKMT_STATUS_INVALID_HANDLE;
}
/*
 * hsaKmtWaitOnEvent - wait on a single event.
 *
 * Convenience wrapper that forwards to hsaKmtWaitOnMultipleEvents with a
 * one-element event list and WaitOnAll set to true. Milliseconds is passed
 * through unchanged as the timeout.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnEvent(HsaEvent *Event,
					  HSAuint32 Milliseconds)
{
	if (Event == NULL)
		return HSAKMT_STATUS_INVALID_HANDLE;

	HSAKMT_STATUS status =
		hsaKmtWaitOnMultipleEvents(&Event, 1, true, Milliseconds);

	return status;
}
/*
 * Decode a KFD memory exception record and log a readable report through
 * pr_err(): the faulting virtual address, the node it maps to, the failure
 * reason and - when fmm_get_mem_info() knows the address - details about
 * the buffer it belongs to.
 */
static void analysis_memory_exception(struct kfd_hsa_memory_exception_data *
						memory_exception_data)
{
	const uint64_t fault_va = memory_exception_data->va;
	uint32_t fault_node = 0;

	/* The lookup result is not checked; fault_node stays 0 if it fails. */
	gpuid_to_nodeid(memory_exception_data->gpu_id, &fault_node);

	pr_err("Memory exception on virtual address 0x%lx, ", fault_va);
	pr_err("node id %d : ", fault_node);

	/* Report only the first failure flag that is set, in priority order. */
	if (memory_exception_data->failure.NotPresent) {
		pr_err("Page not present\n");
	} else if (memory_exception_data->failure.ReadOnly) {
		pr_err("Writing to readonly page\n");
	} else if (memory_exception_data->failure.NoExecute) {
		pr_err("Execute to none-executable page\n");
	}

	HsaPointerInfo ptr_info;
	HSAKMT_STATUS lookup = fmm_get_mem_info((const void *)fault_va, &ptr_info);

	if (lookup != HSAKMT_STATUS_SUCCESS) {
		pr_err("Address does not belong to a known buffer\n");
		return;
	}

	pr_err("GPU address 0x%lx, node id %d, size in byte 0x%lx\n",
	       ptr_info.GPUAddress, ptr_info.Node, ptr_info.SizeInBytes);

	switch (ptr_info.Type) {
	case HSA_POINTER_REGISTERED_SHARED:
		pr_err("Memory is registered shared buffer (IPC)\n");
		break;
	case HSA_POINTER_REGISTERED_GRAPHICS:
		pr_err("Memory is registered graphics buffer\n");
		break;
	case HSA_POINTER_REGISTERED_USER:
		pr_err("Memory is registered user pointer\n");
		pr_err("CPU address of the memory is %p\n", ptr_info.CPUAddress);
		break;
	case HSA_POINTER_ALLOCATED:
		pr_err("Memory is allocated using hsaKmtAllocMemory\n");
		pr_err("CPU address of the memory is %p\n", ptr_info.CPUAddress);
		break;
	default:
		pr_err("Invalid memory type %d\n", ptr_info.Type);
		break;
	}

	if (ptr_info.RegisteredNodes) {
		pr_err("Memory is registered to node id: ");
		for (unsigned int n = 0; n < ptr_info.NRegisteredNodes; n++) {
			pr_err("%d ", ptr_info.RegisteredNodes[n]);
		}
		pr_err("\n");
	}

	if (ptr_info.MappedNodes) {
		pr_err("Memory is mapped to node id: ");
		for (unsigned int n = 0; n < ptr_info.NMappedNodes; n++) {
			pr_err("%d ", ptr_info.MappedNodes[n]);
		}
		pr_err("\n");
	}
}
/*
 * hsaKmtWaitOnMultipleEvents - wait on a list of events through
 * AMDKFD_IOC_WAIT_EVENTS.
 *
 * Events:       array of NumEvents event handles; neither the array nor any
 *               entry may be NULL.
 * NumEvents:    number of entries in Events.
 * WaitOnAll:    passed to the kernel as wait_for_all: wait until every
 *               listed event has fired (true) or until any one has (false).
 * Milliseconds: passed to the kernel as the wait timeout.
 *
 * For HSA_EVENTTYPE_MEMORY events that come back with a non-zero gpu_id,
 * the memory exception data returned by the kernel is copied into the
 * event's MemoryAccessFault data and a diagnostic report is logged.
 *
 * Returns HSAKMT_STATUS_SUCCESS when the wait completed,
 * HSAKMT_STATUS_WAIT_TIMEOUT when the kernel reported a timeout,
 * HSAKMT_STATUS_INVALID_HANDLE for a NULL array or array entry, and
 * HSAKMT_STATUS_ERROR when the allocation or the ioctl fails. A failing
 * gpuid_to_nodeid() lookup is returned as-is.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents(HsaEvent *Events[],
						   HSAuint32 NumEvents,
						   bool WaitOnAll,
						   HSAuint32 Milliseconds)
{
	HSAKMT_STATUS result = HSAKMT_STATUS_SUCCESS;
	struct kfd_ioctl_wait_events_args args = {0};

	CHECK_KFD_OPEN();

	if (!Events)
		return HSAKMT_STATUS_INVALID_HANDLE;

	struct kfd_event_data *event_data = calloc(NumEvents, sizeof(*event_data));

	/* calloc(0, ...) may legitimately return NULL, so only treat NULL as
	 * an allocation failure when entries were actually requested.
	 */
	if (!event_data && NumEvents > 0)
		return HSAKMT_STATUS_ERROR;

	for (HSAuint32 i = 0; i < NumEvents; i++) {
		if (!Events[i]) {
			result = HSAKMT_STATUS_INVALID_HANDLE;
			goto out;
		}
		event_data[i].event_id = Events[i]->EventId;
		event_data[i].kfd_event_data_ext = (uint64_t)(uintptr_t)NULL;
	}

	args.wait_for_all = WaitOnAll;
	args.timeout = Milliseconds;
	args.num_events = NumEvents;
	/* List of events the kernel should monitor. */
	args.events_ptr = (uint64_t)(uintptr_t)event_data;

	if (kmtIoctl(kfd_fd, AMDKFD_IOC_WAIT_EVENTS, &args) == -1) {
		result = HSAKMT_STATUS_ERROR;
	} else if (args.wait_result == KFD_IOC_WAIT_RESULT_TIMEOUT) {
		result = HSAKMT_STATUS_WAIT_TIMEOUT;
	} else {
		result = HSAKMT_STATUS_SUCCESS;

		/* Post-processing for HSA_EVENTTYPE_MEMORY events: publish
		 * the fault details the kernel wrote into event_data.
		 */
		for (HSAuint32 i = 0; i < NumEvents; i++) {
			if (Events[i]->EventData.EventType != HSA_EVENTTYPE_MEMORY ||
			    !event_data[i].memory_exception_data.gpu_id)
				continue;

			Events[i]->EventData.EventData.MemoryAccessFault.VirtualAddress =
				event_data[i].memory_exception_data.va;

			result = gpuid_to_nodeid(event_data[i].memory_exception_data.gpu_id,
						 &Events[i]->EventData.EventData.MemoryAccessFault.NodeId);
			if (result != HSAKMT_STATUS_SUCCESS)
				goto out;

			Events[i]->EventData.EventData.MemoryAccessFault.Failure.NotPresent =
				event_data[i].memory_exception_data.failure.NotPresent;
			Events[i]->EventData.EventData.MemoryAccessFault.Failure.ReadOnly =
				event_data[i].memory_exception_data.failure.ReadOnly;
			Events[i]->EventData.EventData.MemoryAccessFault.Failure.NoExecute =
				event_data[i].memory_exception_data.failure.NoExecute;
			Events[i]->EventData.EventData.MemoryAccessFault.Failure.Imprecise =
				event_data[i].memory_exception_data.failure.imprecise;
			Events[i]->EventData.EventData.MemoryAccessFault.Failure.ErrorType =
				event_data[i].memory_exception_data.ErrorType;
			/* ErrorType values 1 and 2 are reported as ECC errors. */
			Events[i]->EventData.EventData.MemoryAccessFault.Failure.ECC =
				((event_data[i].memory_exception_data.ErrorType == 1) ||
				 (event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0;
			Events[i]->EventData.EventData.MemoryAccessFault.Flags =
				HSA_EVENTID_MEMORY_FATAL_PROCESS;

			analysis_memory_exception(&event_data[i].memory_exception_data);
		}
	}

out:
	free(event_data);

	return result;
}