/*
 * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
 * Copyright (c) 2022 IBM Corporation. All rights reserved.
 * Copyright (c) 2023 Triad National Security, LLC. All rights
 * reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal/mca/btl/btl.h"
#include "ompi/constants.h"

#include "btl_smcuda.h"
#include "btl_smcuda_accelerator.h"
#include "opal/mca/accelerator/base/base.h"
#include "opal/mca/accelerator/accelerator.h"

/* Held by the progress and memcpy routines while they touch the state below. */
static opal_mutex_t btl_smcuda_accelerator_ipc_lock;
/* Stream on which the asynchronous copies and their events are issued. */
static opal_accelerator_stream_t *ipc_stream = NULL;
/* Circular buffer of reusable events. */
opal_accelerator_event_t **accelerator_event_ipc_array = NULL;
/* Fragment associated with the event stored at the same index. */
static struct mca_btl_base_descriptor_t **accelerator_event_ipc_frag_array = NULL;
/* First free/available location in accelerator_event_ipc_array */
static int accelerator_event_ipc_first_avail;
/* First currently-being used location in accelerator_event_ipc_array */
static int accelerator_event_ipc_first_used;
/* Number of status items currently in use */
static volatile int accelerator_event_ipc_num_used;
/* Size of array holding events */
static int accelerator_event_max = 400;
/* Largest in-use count observed so far (reported in verbose output only). */
static int accelerator_event_ipc_most = 0;
/* True only between a successful init and the matching fini. */
static bool smcuda_accelerator_initialized = false;

void mca_btl_smcuda_accelerator_fini(void);

/*
 * Release every resource mca_btl_smcuda_accelerator_init() may have acquired
 * and reset the corresponding globals.
 *
 * Each resource is released only if it is non-NULL, so this may be called on
 * a partially initialized state.  The array pointers are set back to NULL
 * after being freed; otherwise a later init whose error path runs before the
 * arrays are re-allocated would walk and free the stale pointers again.
 */
static void smcuda_accelerator_release_resources(void)
{
    int i;

    if (NULL != accelerator_event_ipc_array) {
        for (i = 0; i < accelerator_event_max; i++) {
            if (NULL != accelerator_event_ipc_array[i]) {
                OBJ_RELEASE(accelerator_event_ipc_array[i]);
            }
        }
        free(accelerator_event_ipc_array);
        accelerator_event_ipc_array = NULL;
    }

    free(accelerator_event_ipc_frag_array);
    accelerator_event_ipc_frag_array = NULL;

    if (NULL != ipc_stream) {
        OBJ_RELEASE(ipc_stream);
        ipc_stream = NULL;
    }

    /* No event can be outstanding once the arrays are gone. */
    accelerator_event_ipc_num_used = 0;

    OBJ_DESTRUCT(&btl_smcuda_accelerator_ipc_lock);
}

/*
 * Set up the state used to track asynchronous IPC copies: the lock, the
 * stream, accelerator_event_max pre-created events and the parallel fragment
 * array.
 *
 * Returns OPAL_SUCCESS on success.  On failure everything acquired so far is
 * released and either the create_stream error code or OPAL_ERROR is returned.
 */
int mca_btl_smcuda_accelerator_init(void)
{
    int rc = OPAL_SUCCESS;
    int i;

    OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t);

    /* Stream used for all asynchronous IPC copies. */
    rc = opal_accelerator.create_stream(MCA_ACCELERATOR_NO_DEVICE_ID, &ipc_stream);
    if (OPAL_SUCCESS != rc) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output,
                            "Failed to create accelerator ipc_stream stream.");
        goto cleanup_and_error;
    }

    /* Nothing is outstanding yet, so the first available index is 0. */
    accelerator_event_ipc_num_used = 0;
    accelerator_event_ipc_first_avail = 0;
    accelerator_event_ipc_first_used = 0;

    /* calloc so that the cleanup path can tell created events from empty slots. */
    accelerator_event_ipc_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
    if (NULL == accelerator_event_ipc_array) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
        rc = OPAL_ERROR;
        goto cleanup_and_error;
    }

    /* Create the events since they can be reused. */
    for (i = 0; i < accelerator_event_max; i++) {
        rc = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID,
                                           &accelerator_event_ipc_array[i]);
        if (OPAL_SUCCESS != rc) {
            opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output,
                                "Accelerator create event failed.");
            rc = OPAL_ERROR;
            goto cleanup_and_error;
        }
    }

    /* Make an empty frag array, one slot per event. */
    accelerator_event_ipc_frag_array = calloc(accelerator_event_max,
                                              sizeof(struct mca_btl_base_descriptor_t *));
    if (NULL == accelerator_event_ipc_frag_array) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
        rc = OPAL_ERROR;
        goto cleanup_and_error;
    }

    smcuda_accelerator_initialized = true;
    return OPAL_SUCCESS;

cleanup_and_error:
    /* Only reached with rc != OPAL_SUCCESS. */
    smcuda_accelerator_release_resources();
    return rc;
}

/*
 * Tear down the state created by mca_btl_smcuda_accelerator_init().
 *
 * Does nothing when the selected accelerator component is named "null" or
 * when init has not completed successfully.
 */
void mca_btl_smcuda_accelerator_fini(void)
{
    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name,
                    "null")
        || false == smcuda_accelerator_initialized) {
        return;
    }

    smcuda_accelerator_release_resources();
    smcuda_accelerator_initialized = false;
}

/*
 * Function is called every time progress is called with the sm BTL. If there
 * are outstanding events, check to see if one has completed. If so, hand
 * back the fragment for further processing.
*/
/*
 * Return values: 1 when the oldest outstanding event has completed and its
 * fragment was stored in *frag; 0 when nothing is outstanding or that event
 * is not ready yet; OPAL_ERROR when the event query fails.  *frag is set to
 * NULL on the not-ready and error paths and is left untouched when nothing
 * is outstanding.
 */
int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **frag)
{
    int status = 0;
    int query_rc;

    /* Checked without the lock: the common case is that nothing is pending. */
    if (OPAL_LIKELY(0 == accelerator_event_ipc_num_used)) {
        return 0;
    }

    OPAL_THREAD_LOCK(&btl_smcuda_accelerator_ipc_lock);

    /* Look at the count again now that the lock is held. */
    if (accelerator_event_ipc_num_used <= 0) {
        goto unlock;
    }

    opal_output_verbose(20, mca_btl_smcuda_component.cuda_ipc_output,
                        "smcuda: progress_one_accelerator_ipc_event, outstanding_events=%d",
                        accelerator_event_ipc_num_used);

    /* Only the oldest in-use slot is queried. */
    query_rc = opal_accelerator.query_event(MCA_ACCELERATOR_NO_DEVICE_ID,
                                            accelerator_event_ipc_array[accelerator_event_ipc_first_used]);

    if (OPAL_ERR_RESOURCE_BUSY == query_rc) {
        /* The event is still pending; report "nothing completed". */
        opal_output_verbose(20, mca_btl_smcuda_component.cuda_ipc_output,
                            "smcuda: event query returned not ready");
        *frag = NULL;
    } else if (OPAL_SUCCESS != query_rc) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output,
                            "smcuda: event query failed: %d", query_rc);
        *frag = NULL;
        status = OPAL_ERROR;
    } else {
        *frag = accelerator_event_ipc_frag_array[accelerator_event_ipc_first_used];
        opal_output_verbose(10, mca_btl_smcuda_component.cuda_ipc_output,
                            "smcuda: event query returned %d", query_rc);

        /* Retire the slot and wrap around the circular buffer if needed. */
        --accelerator_event_ipc_num_used;
        if (++accelerator_event_ipc_first_used >= accelerator_event_max) {
            accelerator_event_ipc_first_used = 0;
        }

        /* 1 tells the caller an event completed and a frag was returned. */
        status = 1;
    }

unlock:
    OPAL_THREAD_UNLOCK(&btl_smcuda_accelerator_ipc_lock);
    return status;
}

/*
 * Start the asynchronous copy. Then record and save away an event that will
 * be queried to indicate the copy has completed.
*/
/*
 * dst, src and amount describe the copy; frag is stored next to the recorded
 * event so the progress routine can hand it back once the event completes.
 * msg is not used by this implementation.
 *
 * Returns OPAL_SUCCESS when the copy was started and its event recorded,
 * OPAL_ERR_OUT_OF_RESOURCE when every event slot is already in use, and
 * OPAL_ERROR when starting the copy or recording the event fails.
 */
int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
                          struct mca_btl_base_descriptor_t *frag)
{
    int result;
    int rc = OPAL_SUCCESS;

    (void) msg;

    OPAL_THREAD_LOCK(&btl_smcuda_accelerator_ipc_lock);

    /* First make sure there is room to store the event.  If not, then
     * return an error.  ">=" rather than "==" so that an unexpected count
     * can never lead to indexing past the end of the arrays. */
    if (accelerator_event_ipc_num_used >= accelerator_event_max) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output,
                            "smcuda: Out of event handles");
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto unlock;
    }

    /* Track the largest in-use count seen on entry (before this copy is added). */
    if (accelerator_event_ipc_num_used > accelerator_event_ipc_most) {
        accelerator_event_ipc_most = accelerator_event_ipc_num_used;
        /* Just print multiples of 10 */
        if (0 == (accelerator_event_ipc_most % 10)) {
            opal_output_verbose(20, mca_btl_smcuda_component.cuda_ipc_output,
                                "smcuda: Maximum ipc events used is now %d",
                                accelerator_event_ipc_most);
        }
    }

    result = opal_accelerator.mem_copy_async(MCA_ACCELERATOR_NO_DEVICE_ID,
                                             MCA_ACCELERATOR_NO_DEVICE_ID, dst, src, amount,
                                             ipc_stream, MCA_ACCELERATOR_TRANSFER_UNSPEC);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output,
                            "smcuda: memcpy async failed: %d", result);
        rc = OPAL_ERROR;
        goto unlock;
    }
    /* %zu: amount is a size_t; casting it to int misreported large copies. */
    opal_output_verbose(20, mca_btl_smcuda_component.cuda_ipc_output,
                        "smcuda: cuMemcpyAsync passed: dst=%p, src=%p, size=%zu", dst, src,
                        amount);

    /* Record an event behind the copy on the same stream. */
    result = opal_accelerator.record_event(MCA_ACCELERATOR_NO_DEVICE_ID,
                                           accelerator_event_ipc_array[accelerator_event_ipc_first_avail],
                                           ipc_stream);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output,
                            "Event Record failed.");
        rc = OPAL_ERROR;
        goto unlock;
    }

    accelerator_event_ipc_frag_array[accelerator_event_ipc_first_avail] = frag;

    /* Bump up the first available slot (wrapping around the circular
     * buffer) and number used by 1 */
    accelerator_event_ipc_first_avail++;
    if (accelerator_event_ipc_first_avail >= accelerator_event_max) {
        accelerator_event_ipc_first_avail = 0;
    }
    accelerator_event_ipc_num_used++;

unlock:
    OPAL_THREAD_UNLOCK(&btl_smcuda_accelerator_ipc_lock);
    return rc;
}