/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2009 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2019 Google, LLC. All rights reserved. * Copyright (c) 2019 Intel, Inc. All rights reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /** * @file */ #ifndef MCA_BTL_UCT_H #define MCA_BTL_UCT_H #include "opal_config.h" #include #include /* Open MPI includes */ #include "opal/class/opal_fifo.h" #include "opal/class/opal_hash_table.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/mpool/mpool.h" #include "opal/mca/pmix/pmix-internal.h" #include "opal/mca/rcache/base/base.h" #include "opal/mca/threads/tsd.h" #include "opal/util/event.h" #include #include "btl_uct_types.h" BEGIN_C_DECLS /* detection for old vs new atomic flags */ #if defined(UCT_IFACE_FLAG_ATOMIC_ADD32) # define OPAL_HAVE_UCT_EP_ATOMIC64_POST 0 #else # define OPAL_HAVE_UCT_EP_ATOMIC64_POST 1 #endif /** * @brief UCT BTL module */ struct mca_btl_uct_module_t { /** base BTL interface */ mca_btl_base_module_t super; /** whether the module has been fully initialized or not */ bool initialized; /** lock for the hash table */ opal_mutex_t endpoint_lock; /** endpoint hash table */ opal_hash_table_t id_to_endpoint; /** mutex to protect the module */ opal_recursive_mutex_t lock; /** async context */ ucs_async_context_t *ucs_async; /** transport for active messaging */ mca_btl_uct_tl_t *am_tl; /** transport for RDMA/AMOs */ mca_btl_uct_tl_t *rdma_tl; /** transport for forming connections (if needed) */ mca_btl_uct_tl_t *conn_tl; /** array containing the am_tl and rdma_tl */ mca_btl_uct_tl_t *comm_tls[2]; #if UCT_API >= UCT_VERSION(1, 7) uct_component_h uct_component; #endif /** registration cache */ mca_rcache_base_module_t *rcache; /** name of the memory domain backing this module */ char *md_name; /** am and rdma share endpoints */ bool shared_endpoints; /** memory domain */ mca_btl_uct_md_t *md; /** un-registered frags that will be used with uct_ep_am_short() */ opal_free_list_t short_frags; /** registered frags that will be used with uct_ep_am_zcopy() */ opal_free_list_t eager_frags; /** large registered frags for packing non-contiguous data */ opal_free_list_t max_frags; /** frags that were waiting on connections that are now ready to send */ opal_list_t pending_frags; /** pending connection requests */ opal_fifo_t pending_connection_reqs; }; typedef struct mca_btl_uct_module_t mca_btl_uct_module_t; extern mca_btl_uct_module_t mca_btl_uct_module_template; /** * @brief UCT BTL component */ struct mca_btl_uct_component_t { /** base BTL component */ mca_btl_base_component_3_0_0_t super; /** number of TL modules */ int module_count; /** All BTL UCT modules (1 per memory domain) */ mca_btl_uct_module_t *modules[MCA_BTL_UCT_MAX_MODULES]; /** allowed UCT memory domains */ char *memory_domains; /** allowed transports */ char *allowed_transports; /** number of worker contexts to create */ int num_contexts_per_module; #if OPAL_C_HAVE__THREAD_LOCAL /** bind threads to contexts */ bool bind_threads_to_contexts; #endif /** disable UCX memory hooks */ bool disable_ucx_memory_hooks; }; typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; OPAL_DECLSPEC extern mca_btl_uct_component_t mca_btl_uct_component; struct mca_btl_base_registration_handle_t { /** The packed memory handle. The size of this field is defined by UCT. */ uint8_t packed_handle[1]; }; struct mca_btl_uct_reg_t { mca_rcache_base_registration_t base; /** UCT memory handle */ uct_mem_h uct_memh; /** remote handle */ mca_btl_base_registration_handle_t handle; }; typedef struct mca_btl_uct_reg_t mca_btl_uct_reg_t; OBJ_CLASS_DECLARATION(mca_btl_uct_reg_t); #define MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(reg) \ ((mca_btl_uct_reg_t *) ((intptr_t)(reg) -offsetof(mca_btl_uct_reg_t, handle))) /** * Initiate an asynchronous put. * Completion Semantics: if this function returns a 1 then the operation * is complete. a return of OPAL_SUCCESS indicates * the put operation has been queued with the * network. the local_handle can not be deregistered * until all outstanding operations on that handle * have been completed. * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * @param local_address (IN) Local address to put from (registered) * @param remote_address (IN) Remote address to put to (registered remotely) * @param local_handle (IN) Registration handle for region containing * (local_address, local_address + size) * @param remote_handle (IN) Remote registration handle for region containing * (remote_address, remote_address + size) * @param size (IN) Number of bytes to put * @param flags (IN) Flags for this put operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback * @param cbdata (IN) Data for callback * * @retval OPAL_SUCCESS The descriptor was successfully queued for a put * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put * operation. Try again later * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or * alignment restrictions. */ int mca_btl_uct_put(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate an asynchronous get. * Completion Semantics: if this function returns a 1 then the operation * is complete. a return of OPAL_SUCCESS indicates * the get operation has been queued with the * network. the local_handle can not be deregistered * until all outstanding operations on that handle * have been completed. * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information * @param local_address (IN) Local address to put from (registered) * @param remote_address (IN) Remote address to put to (registered remotely) * @param local_handle (IN) Registration handle for region containing * (local_address, local_address + size) * @param remote_handle (IN) Remote registration handle for region containing * (remote_address, remote_address + size) * @param size (IN) Number of bytes to put * @param flags (IN) Flags for this put operation * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback * @param cbdata (IN) Data for callback * * @retval OPAL_SUCCESS The descriptor was successfully queued for a put * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put * operation. Try again later * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or * alignment restrictions. */ int mca_btl_uct_get(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_uct_aop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_uct_afop(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_uct_acswap(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_uct_flush(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint); int mca_btl_uct_flush_thread(mca_btl_base_module_t *btl); int mca_btl_uct_finalize(mca_btl_base_module_t *btl); int mca_btl_uct_reg_mem(void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg); int mca_btl_uct_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg); ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsigned flags); struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t *module, opal_proc_t *proc); int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count); int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module, mca_btl_uct_conn_req_t *req); /** * @brief Checks if a tl is suitable for using for RDMA * * @param[in] tl btl/uct tl pointer */ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl) { return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY)) == (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY); } /** * @brief Checks if a tl is suitable for using for active messaging */ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl) { return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY)); } /** * @brief Checks if a tl can be used for passing data to connect endpoints * * @param[in] tl btl/uct tl pointer */ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl) { return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) == (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE); } /** * @brief Check if tl endpoints need to be connected via a connection tl * * @param[in] tl btl/uct tl pointer */ static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl) { return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); } END_C_DECLS #endif