/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2024 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2009 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2006      Voltaire. All rights reserved.
 * Copyright (c) 2007      Mellanox Technologies. All rights reserved.
 * Copyright (c) 2010      IBM Corporation.  All rights reserved.
 * Copyright (c) 2012-2015 NVIDIA Corporation.  All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 *
 * Copyright (c) 2022      Amazon.com, Inc. or its affiliates.  All Rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file:
 *
 * This file implements a simple memory pool that is used by the GPU
 * buffer on the sending side.  It just gets a memory handle and event
 * handle that can be sent to the remote side which can then use the
 * handles to get access to the memory and the event to determine when
 * it can start accessing the memory.  There is no caching of the
 * memory handles as getting new ones is fast.  The event handles are
 * cached by the cuda_common code.
 */

#include "opal_config.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/rcache/gpusm/rcache_gpusm.h"
#include "opal/include/opal/opal_cuda.h"

#include <assert.h>
#include <cuda.h>

/**
 * Called when the registration free list is created.  An event is created
 * for each entry and its interprocess handle is stored next to it.
 *
 * If the event cannot be created, item->event is set to 0 and no handle
 * is requested, so the destructor can tell that there is nothing to destroy.
 *
 * @param item  registration being constructed
 */
static void mca_rcache_gpusm_registration_constructor(mca_rcache_gpusm_registration_t *item)
{
    void *handle = (void *) &item->evtHandle;
    CUevent event = NULL;
    CUresult result;

    result = cuEventCreate(&event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        opal_output(0, "cuEventCreate failed\n");
        /* Do not ask for an IPC handle of an event that was never created. */
        item->event = 0;
        return;
    }
    item->event = (uintptr_t) event;

    result = cuIpcGetEventHandle((CUipcEventHandle *) handle, event);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        opal_output(0, "cuIpcGetEventHandle failed\n");
    }
}

/**
 * Called when a registration is destructed (see
 * mca_rcache_gpusm_finalize()).  This destroys the event created by the
 * constructor, if there is one.
 *
 * @param item  registration being destructed
 */
static void mca_rcache_gpusm_registration_destructor(mca_rcache_gpusm_registration_t *item)
{
    CUresult result;

    /* The constructor leaves 0 here when event creation failed. */
    if (0 == item->event) {
        return;
    }

    result = cuEventDestroy((CUevent) item->event);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        opal_output(0, "cuEventDestroy failed");
    }
    item->event = 0;
}

OBJ_CLASS_INSTANCE(mca_rcache_gpusm_registration_t, mca_rcache_base_registration_t,
                   mca_rcache_gpusm_registration_constructor,
                   mca_rcache_gpusm_registration_destructor);

/*
 * Initializes the rcache module: sets up the base module, installs the
 * gpusm entry points and constructs the registration free list.
 */
void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t *rcache)
{
    mca_rcache_base_module_init(&rcache->super);

    rcache->super.rcache_component = &mca_rcache_gpusm_component.super;
    rcache->super.rcache_register = mca_rcache_gpusm_register;
    rcache->super.rcache_find = mca_rcache_gpusm_find;
    rcache->super.rcache_deregister = mca_rcache_gpusm_deregister;
    rcache->super.rcache_finalize = mca_rcache_gpusm_finalize;

    OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t);

    /* Start with 0 entries in the free list since CUDA may not have
     * been initialized when this free list is created and there is
     * some CUDA specific activities that need to be done. */
    opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t),
                        opal_cache_line_size, OBJ_CLASS(mca_rcache_gpusm_registration_t), 0,
                        opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, NULL, NULL);
}

/**
 * Just go ahead and get a new registration.  The find and register
 * functions are the same thing for this memory pool: this simply calls
 * mca_rcache_gpusm_register() with flags and access_flags set to 0.
 */
int mca_rcache_gpusm_find(mca_rcache_base_module_t *rcache, void *addr, size_t size,
                          mca_rcache_base_registration_t **reg)
{
    return mca_rcache_gpusm_register(rcache, addr, size, 0, 0, reg);
}

/*
 * Get the memory handle of a local section of memory that can be sent
 * to the remote side so it can access the memory.  This is the
 * registration function for the sending side of a message transfer.
 *
 * @param base    address inside a CUDA device allocation
 * @param size    length of the region (not used by this function)
 * @param newreg  registration that receives the handle and address range
 *
 * @return OPAL_SUCCESS, or OPAL_ERROR if any CUDA call fails.
 */
static int mca_rcache_gpusm_get_mem_handle(void *base, size_t size,
                                           mca_rcache_base_registration_t *newreg)
{
    CUmemorytype memType;
    CUresult result;
    CUipcMemHandle *memHandle;
    CUdeviceptr pbase;
    size_t psize;
#if OPAL_CUDA_SYNC_MEMOPS
    unsigned int sync_memops = 1;
#endif

    mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *) newreg;
    memHandle = (CUipcMemHandle *) cuda_reg->data.memHandle;

    /* We should only be there if this is a CUDA device pointer */
    result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                   (CUdeviceptr) base);
    assert(CUDA_SUCCESS == result);
    assert(CU_MEMORYTYPE_DEVICE == memType);
    /* The asserts vanish when NDEBUG is defined; still refuse to continue
     * when the attribute query itself failed. */
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        return OPAL_ERROR;
    }

    /* Get the memory handle so we can send it to the remote process. */
    result = cuIpcGetMemHandle(memHandle, (CUdeviceptr) base);
    if (CUDA_SUCCESS != result) {
        return OPAL_ERROR;
    }

    /* Need to get the real base and size of the memory handle.  This is
     * how the remote side saves the handles in a cache. */
    result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr) base);
    if (CUDA_SUCCESS != result) {
        return OPAL_ERROR;
    }

    /* Store all the information in the registration */
    cuda_reg->base.base = (void *) pbase;
    cuda_reg->base.bound = (unsigned char *) pbase + psize - 1;
    cuda_reg->data.memh_seg_addr.pval = (void *) pbase;
    cuda_reg->data.memh_seg_len = psize;

#if OPAL_CUDA_SYNC_MEMOPS
    /* With CUDA 6.0, we can set an attribute on the memory pointer that will
     * ensure any synchronous copies are completed prior to any other access
     * of the memory region.  This means we do not need to record an event
     * and send to the remote side.
     */
    result = cuPointerSetAttribute(&sync_memops, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                   (CUdeviceptr) base);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        return OPAL_ERROR;
    }
#else
    /* Need to record the event to ensure that any memcopies into the
     * device memory have completed.  The event handle associated with
     * this event is sent to the remote process so that it will wait
     * on this event prior to copying data out of the device memory.
     * Note that this needs to be the NULL stream because it is
     * unknown what stream any copies into the device memory were done
     * with. */
    result = cuEventRecord((CUevent) cuda_reg->data.event, 0);
    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
        return OPAL_ERROR;
    }
#endif /* OPAL_CUDA_SYNC_MEMOPS */

    return OPAL_SUCCESS;
}

/*
 * This is the one function that does all the work.  It takes a
 * registration from the free list, fills it in and calls
 * mca_rcache_gpusm_get_mem_handle() to get the memory handle for the
 * sending buffer.  If that fails the registration goes straight back to
 * the free list; otherwise its ref_count is incremented and it is
 * returned through reg.
 *
 * @return OPAL_SUCCESS, OPAL_ERR_OUT_OF_RESOURCE when the free list has
 *         no item to give, or the error from the memory handle lookup.
 *         *reg is NULL on every failure path.
 */
int mca_rcache_gpusm_register(mca_rcache_base_module_t *rcache, void *addr, size_t size,
                              uint32_t flags, int32_t access_flags,
                              mca_rcache_base_registration_t **reg)
{
    mca_rcache_gpusm_module_t *rcache_gpusm = (mca_rcache_gpusm_module_t *) rcache;
    mca_rcache_base_registration_t *gpusm_reg;
    opal_free_list_item_t *item;
    unsigned char *base, *bound;
    int rc;

    /* In spite of the fact we return an error code, the existing code
     * checks the registration for a NULL value rather than looking at
     * the return code.  So, initialize the registration to NULL in
     * case we run into a failure. */
    *reg = NULL;

    base = addr;
    bound = (unsigned char *) addr + size - 1;

    item = opal_free_list_get(&rcache_gpusm->reg_list);
    if (NULL == item) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    gpusm_reg = (mca_rcache_base_registration_t *) item;

    gpusm_reg->rcache = rcache;
    gpusm_reg->base = base;
    gpusm_reg->bound = bound;
    gpusm_reg->flags = flags;
    gpusm_reg->access_flags = access_flags;

    rc = mca_rcache_gpusm_get_mem_handle(base, size, gpusm_reg);

    if (rc != OPAL_SUCCESS) {
        opal_free_list_return(&rcache_gpusm->reg_list, item);
        return rc;
    }

    *reg = gpusm_reg;
    (*reg)->ref_count++;
    return OPAL_SUCCESS;
}

/*
 * Return the registration to the free list.  Always reports OPAL_SUCCESS.
 */
int mca_rcache_gpusm_deregister(struct mca_rcache_base_module_t *rcache,
                                mca_rcache_base_registration_t *reg)
{
    mca_rcache_gpusm_module_t *rcache_gpusm = (mca_rcache_gpusm_module_t *) rcache;

    opal_free_list_return(&rcache_gpusm->reg_list, (opal_free_list_item_t *) reg);
    return OPAL_SUCCESS;
}

/**
 * Free up the resources held by the module: destruct every registration
 * still on the free list, then the free list itself, then the base module.
 */
void mca_rcache_gpusm_finalize(struct mca_rcache_base_module_t *rcache)
{
    opal_free_list_item_t *item;
    mca_rcache_gpusm_module_t *rcache_gpusm = (mca_rcache_gpusm_module_t *) rcache;

    /* Need to run the destructor on each item in the free list explicitly.
     * The destruction of the free list only runs the destructor on the
     * main free list, not each item. */
    while (NULL
           != (item = (opal_free_list_item_t *) opal_lifo_pop(&(rcache_gpusm->reg_list.super)))) {
        OBJ_DESTRUCT(item);
    }

    OBJ_DESTRUCT(&rcache_gpusm->reg_list);
    mca_rcache_base_module_fini(rcache);
}