/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /* * The ugni btl is implemented with native Cray Gemini. * * Known issues with ugni: * - */ #ifndef MCA_BTL_UGNI_H #define MCA_BTL_UGNI_H #include "opal_config.h" #include "opal/class/opal_free_list.h" #include "opal/class/opal_hash_table.h" #include "opal/class/opal_pointer_array.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/btl/btl.h" #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/mpool.h" #include "opal/mca/pmix/pmix-internal.h" #include "opal/mca/rcache/base/base.h" #include "opal/mca/rcache/udreg/rcache_udreg.h" #include "opal/util/output.h" #include "opal_stdint.h" #include #include #include #include #include #include /* datagram message ids */ #define MCA_BTL_UGNI_CONNECT_WILDCARD_ID 0x0000000000000000ull #define MCA_BTL_UGNI_CONNECT_DIRECTED_ID 0x8000000000000000ull #define MCA_BTL_UGNI_DATAGRAM_MASK 0x8000000000000000ull /** maximum number of supported virtual devices */ #define MCA_BTL_UGNI_MAX_DEV_HANDLES 128 /** number of rdma completion queue items to remove per progress loop */ #define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 32 /** how often to check for connection requests */ #define MCA_BTL_UGNI_CONNECT_USEC 10 /** * Modex data */ struct mca_btl_ugni_modex_t { /** GNI NIC address */ uint32_t addr; /** CDM identifier (base) */ int id; }; typedef struct mca_btl_ugni_modex_t mca_btl_ugni_modex_t; /* ompi and smsg endpoint attributes */ typedef struct mca_btl_ugni_endpoint_attr_t { opal_process_name_t proc_name; uint32_t index; gni_smsg_attr_t smsg_attr; gni_mem_handle_t rmt_irq_mem_hndl; } mca_btl_ugni_endpoint_attr_t; enum { MCA_BTL_UGNI_RCACHE_UDREG, MCA_BTL_UGNI_RCACHE_GRDMA }; enum mca_btl_ugni_free_list_id_t { /* eager fragment list (registered) */ MCA_BTL_UGNI_LIST_EAGER_SEND, MCA_BTL_UGNI_LIST_EAGER_RECV, /* SMSG fragment list (unregistered) */ MCA_BTL_UGNI_LIST_SMSG, /* RDMA fragment list */ MCA_BTL_UGNI_LIST_RDMA, MCA_BTL_UGNI_LIST_RDMA_INT, MCA_BTL_UGNI_LIST_MAX, }; struct mca_btl_ugni_cq_t { /** ugni CQ handle */ gni_cq_handle_t gni_handle; /** number of completions expected on the CQ */ volatile int32_t active_operations; }; typedef struct mca_btl_ugni_cq_t mca_btl_ugni_cq_t; /** * GNI virtual device */ struct mca_btl_ugni_device_t { /** Communication domain handle */ gni_cdm_handle_t dev_cd_handle; /** protection for ugni access */ volatile int32_t lock; /** Index of device in module devices array */ int dev_index; /** number of SMSG connections */ opal_atomic_int32_t smsg_connections; /** boolean indicating that the device was recently flushed */ volatile bool flushed; /** uGNI device handle */ gni_nic_handle_t dev_handle; /** uGNI rdma completion queue */ mca_btl_ugni_cq_t dev_rdma_local_cq; /** local rdma completion queue (async) */ mca_btl_ugni_cq_t dev_rdma_local_irq_cq; /** local SMSG completion queue */ mca_btl_ugni_cq_t dev_smsg_local_cq; /** IRQ memory handle for this device */ gni_mem_handle_t smsg_irq_mhndl; /** RDMA endpoint free list */ opal_free_list_t rdma_descs; }; typedef struct mca_btl_ugni_device_t mca_btl_ugni_device_t; typedef intptr_t (*mca_btl_ugni_device_serialize_fn_t)(mca_btl_ugni_device_t *device, void *arg); typedef struct mca_btl_ugni_module_t { mca_btl_base_module_t super; bool initialized; mca_btl_ugni_device_t devices[MCA_BTL_UGNI_MAX_DEV_HANDLES]; opal_mutex_t endpoint_lock; size_t endpoint_count; opal_pointer_array_t endpoints; opal_hash_table_t id_to_endpoint; /* lock for this list */ opal_mutex_t failed_frags_lock; /** rdma frags waiting to be reposted */ opal_list_t failed_frags; /** lock for the eager_get_pending list */ opal_mutex_t eager_get_pending_lock; opal_list_t eager_get_pending; mca_mpool_base_module_t *mpool; opal_free_list_t smsg_mboxes; gni_ep_handle_t wildcard_ep; struct mca_btl_base_endpoint_t *local_ep; opal_atomic_int32_t active_datagrams; opal_event_t connection_event; struct mca_btl_ugni_endpoint_attr_t wc_remote_attr, wc_local_attr; gni_cq_handle_t smsg_remote_cq; gni_cq_handle_t smsg_remote_irq_cq; /** fragment free lists (see enum mca_btl_ugni_free_list_id_t) */ opal_free_list_t frags_lists[MCA_BTL_UGNI_LIST_MAX]; /* lock for this list */ opal_mutex_t ep_wait_list_lock; /* endpoints waiting on credits */ opal_list_t ep_wait_list; /* fragment id bounce buffer (smsg msg ids are only 32 bits) */ opal_pointer_array_t pending_smsg_frags_bb; int32_t reg_max; opal_atomic_int32_t reg_count; /* used to calculate the fraction of registered memory resources * this rank should be limited too */ int nlocal_procs; opal_atomic_int32_t active_rdma_count; mca_rcache_base_module_t *rcache; } mca_btl_ugni_module_t; typedef struct mca_btl_ugni_component_t { /* base BTL component */ mca_btl_base_component_3_0_0_t super; /* maximum supported btls. hardcoded to 1 for now */ uint32_t ugni_max_btls; /* Maximum number of entries a completion queue can hold */ uint32_t remote_cq_size; uint32_t local_cq_size; uint32_t local_rdma_cq_size; /* There is a hardware limitation that hurts BTE performance * if we submit too many BTE requests. This acts as a throttle. */ int32_t active_rdma_threshold; /* number of ugni modules */ uint32_t ugni_num_btls; /* ugni modules */ mca_btl_ugni_module_t *modules; size_t smsg_max_data; /* After this message size switch to BTE protocols */ long int ugni_fma_limit; /** FMA switchover for get */ long int ugni_fma_get_limit; /** FMA switchover for put */ long int ugni_fma_put_limit; #if OPAL_C_HAVE__THREAD_LOCAL bool bind_threads_to_devices; #endif /* Switch to get when sending above this size */ size_t ugni_smsg_limit; /* RDMA/SMSG free list settings */ int ugni_free_list_num; int ugni_free_list_max; int ugni_free_list_inc; /* eager free list settings */ int ugni_eager_num; int ugni_eager_max; int ugni_eager_inc; int smsg_max_retries; /* number of times to retry a post */ int rdma_max_retries; /* Maximum number of outstanding eager messages */ int smsg_max_credits; /* mailbox size (computed) */ int smsg_mbox_size; /* Maximum number of memory registrations per process */ int max_mem_reg; /* Page size to use for SMSG allocations (udreg mpool) */ unsigned int smsg_page_size; /* rcache type (grdma or udreg) */ int rcache_type; /* memory pool hints */ char *mpool_hints; /* Number of mailboxes to allocate in each block */ unsigned int mbox_increment; /* Indicate whether progress thread requested */ bool progress_thread_requested; /* Indicate whether progress thread allowed */ bool progress_thread_enabled; /** Number of ugni device contexts to create per GNI device */ int virtual_device_count; /** Protection tag */ uint8_t ptag; /** Unique id for this process assigned by the system */ uint32_t cookie; /** Starting value of communication identifier */ uint32_t cdm_id_base; /** GNI CDM flags */ uint32_t cdm_flags; /** NIC address */ uint32_t dev_addr; /** MCA variable identifier for the cdm_flags variable */ int cdm_flags_id; } mca_btl_ugni_component_t; /* Global structures */ OPAL_DECLSPEC extern mca_btl_ugni_component_t mca_btl_ugni_component; OPAL_DECLSPEC extern mca_btl_ugni_module_t mca_btl_ugni_module; static inline uint32_t mca_btl_ugni_ep_get_device_index(mca_btl_ugni_module_t *ugni_module) { static opal_atomic_int64_t device_index = 0; /* don't really care if the device index is atomically updated */ return (uint32_t)(opal_atomic_fetch_add_64(&device_index, 1) % mca_btl_ugni_component.virtual_device_count); } /** * Get a virtual device for communication */ static inline mca_btl_ugni_device_t *mca_btl_ugni_ep_get_device(mca_btl_ugni_module_t *ugni_module) { return ugni_module->devices + mca_btl_ugni_ep_get_device_index(ugni_module); } static inline int mca_btl_rc_ugni_to_opal(gni_return_t rc) { static int codes[] = {OPAL_SUCCESS, OPAL_ERR_RESOURCE_BUSY, OPAL_ERR_BAD_PARAM, OPAL_ERR_OUT_OF_RESOURCE, OPAL_ERR_TIMEOUT, OPAL_ERR_PERM, OPAL_ERROR, OPAL_ERR_BAD_PARAM, OPAL_ERR_BAD_PARAM, OPAL_ERR_NOT_FOUND, OPAL_ERR_VALUE_OUT_OF_BOUNDS, OPAL_ERROR, OPAL_ERR_NOT_SUPPORTED, OPAL_ERR_OUT_OF_RESOURCE}; return codes[rc]; } int mca_btl_ugni_flush(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint); /** * BML->BTL notification of change in the process list. * * location: btl_ugni_add_procs.c * * @param btl (IN) BTL module * @param nprocs (IN) Number of processes * @param procs (IN) Array of processes * @param endpoint (OUT) Array of mca_btl_base_endpoint_t structures by BTL. * @param reachable (OUT) Bitmask indicating set of peer processes that are reachable by this * BTL. * @return OPAL_SUCCESS or error status on failure. */ int mca_btl_ugni_add_procs(struct mca_btl_base_module_t *btl, size_t nprocs, struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers, opal_bitmap_t *reachable); /** * Notification of change to the process list. * * location: btl_ugni_add_procs.c * * @param btl (IN) BTL module * @param nprocs (IN) Number of processes * @param proc (IN) Set of processes * @param peer (IN) Set of peer addressing information. * @return Status indicating if cleanup was successful */ int mca_btl_ugni_del_procs(struct mca_btl_base_module_t *btl, size_t nprocs, struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers); struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep(struct mca_btl_base_module_t *module, opal_proc_t *proc); /** * Initiate an asynchronous send. * * location: btl_ugni_send.c * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * @param descriptor (IN) Description of the data to be transferred * @param tag (IN) The tag value used to notify the peer. */ int mca_btl_ugni_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *btl_peer, struct mca_btl_base_descriptor_t *descriptor, mca_btl_base_tag_t tag); /** * Initiate an immediate blocking send. * * location: btl_ugni_sendi.c * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * @param convertor (IN) Data type convertor * @param header (IN) Pointer to header. * @param header_size (IN) Size of header. * @param payload_size (IN) Size of payload (from convertor). * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) * @param flags (IN) Flags. * @param tag (IN) The tag value used to notify the peer. * @param descriptor (OUT) The descriptor to be returned unable to be sent immediately */ int mca_btl_ugni_sendi(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor, void *header, size_t header_size, size_t payload_size, uint8_t order, uint32_t flags, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor); int mca_btl_ugni_get(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_ugni_put(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_ugni_aop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_ugni_afop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_ugni_acswap(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_ugni_progress_send_wait_list(struct mca_btl_base_endpoint_t *endpoint); int mca_btl_ugni_progress_datagram(mca_btl_ugni_device_t *device); mca_btl_base_descriptor_t *mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint8_t order, size_t size, uint32_t flags); struct mca_btl_base_registration_handle_t { /** uGNI memory handle */ gni_mem_handle_t gni_handle; }; typedef struct mca_btl_ugni_reg_t { mca_rcache_base_registration_t base; mca_btl_base_registration_handle_t handle; } mca_btl_ugni_reg_t; /** * Initialize uGNI support. */ int mca_btl_ugni_init(void); /** * Finalize uGNI support. */ int mca_btl_ugni_fini(void); int mca_btl_ugni_module_init(mca_btl_ugni_module_t *ugni_module); /** * Initialize a virtual device for device index 0. * * @param[inout] device Device to initialize * @param[in] virtual_device_id Virtual device identified (up to max handles) */ int mca_btl_ugni_device_init(mca_btl_ugni_device_t *device, int virtual_device_id); /** * Finalize a virtual device. * * @param[in] device Device to finalize */ int mca_btl_ugni_device_fini(mca_btl_ugni_device_t *dev); /* Get a unique 64-bit id for the process name */ static inline uint64_t mca_btl_ugni_proc_name_to_id(opal_process_name_t name) { /* Throw away the top bit of the jobid for the datagram type */ return ((uint64_t)(name.jobid & 0x7fffffff) << 32 | name.vpid); } int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t *btl); int mca_btl_ugni_kill_progress_thread(void); struct mca_btl_ugni_post_descriptor_t; void btl_ugni_dump_post_desc(struct mca_btl_ugni_post_descriptor_t *desc); struct mca_btl_ugni_post_descriptor_t; void mca_btl_ugni_handle_rdma_completions(mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device, struct mca_btl_ugni_post_descriptor_t *post_desc, const int count); /** * Try to lock a uGNI device for exclusive access */ static inline int mca_btl_ugni_device_trylock(mca_btl_ugni_device_t *device) { /* checking the lock non-atomically first can reduce the number of * unnecessary atomic operations. */ return (device->lock || opal_atomic_swap_32(&device->lock, 1)); } /** * Lock a uGNI device for exclusive access */ static inline void mca_btl_ugni_device_lock(mca_btl_ugni_device_t *device) { while (mca_btl_ugni_device_trylock(device)) ; } /** * Release exclusive access to the device */ static inline void mca_btl_ugni_device_unlock(mca_btl_ugni_device_t *device) { opal_atomic_wmb(); device->lock = 0; } /** * Serialize an operation on a uGNI device * * @params[in] device ugni device * @params[in] fn function to serialize * @params[in] arg function argument */ static inline intptr_t mca_btl_ugni_device_serialize(mca_btl_ugni_device_t *device, mca_btl_ugni_device_serialize_fn_t fn, void *arg) { intptr_t rc; if (!opal_using_threads()) { return fn(device, arg); } /* NTH: for now the device is just protected by a spin lock but this will change in the future */ mca_btl_ugni_device_lock(device); rc = fn(device, arg); mca_btl_ugni_device_unlock(device); return rc; } static inline intptr_t mca_btl_ugni_device_serialize_any(mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_serialize_fn_t fn, void *arg) { mca_btl_ugni_device_t *device; intptr_t rc; if (!opal_using_threads()) { return fn(ugni_module->devices, arg); } #if OPAL_C_HAVE__THREAD_LOCAL if (mca_btl_ugni_component.bind_threads_to_devices) { /* NTH: if we have C11 _Thread_local just go ahead and assign the devices round-robin to * each thread. in testing this give much better performance than just picking any device */ static _Thread_local mca_btl_ugni_device_t *device_local = NULL; device = device_local; if (OPAL_UNLIKELY(NULL == device)) { /* assign device contexts round-robin */ device_local = device = mca_btl_ugni_ep_get_device(ugni_module); } mca_btl_ugni_device_lock(device); } else { #endif /* get the next starting index */ uint32_t device_index = mca_btl_ugni_ep_get_device_index(ugni_module); const int device_count = mca_btl_ugni_component.virtual_device_count; for (int i = 0; i < device_count; ++i) { device = ugni_module->devices + ((device_index + i) % device_count); if (!mca_btl_ugni_device_trylock(device)) { break; } device = NULL; } if (NULL == device) { device = mca_btl_ugni_ep_get_device(ugni_module); mca_btl_ugni_device_lock(device); } #if OPAL_C_HAVE__THREAD_LOCAL } #endif rc = fn(device, arg); mca_btl_ugni_device_unlock(device); return rc; } /** Number of times the progress thread has woken up */ extern unsigned int mca_btl_ugni_progress_thread_wakeups; #endif