/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2017      Intel, Inc. All rights reserved.
 * Copyright (c) 2018      Amazon.com, Inc. or its affiliates. All Rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "btl_ugni.h"
#include "btl_ugni_frag.h"
#include "btl_ugni_rdma.h"
#include "btl_ugni_smsg.h"

#include "opal/util/printf.h"
#include "opal/util/sys_limits.h"
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#include "opal/memoryhooks/memory.h"
#include "opal/runtime/opal_params.h"

#include "opal/mca/base/mca_base_pvar.h"

static int btl_ugni_component_register(void);
static int btl_ugni_component_open(void);
static int btl_ugni_component_close(void);
static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool);
static int mca_btl_ugni_component_progress(void);

static unsigned long mca_btl_ugni_ugni_page_size = 0;

mca_btl_ugni_component_t mca_btl_ugni_component = {
    .super = {
        /* First, the mca_base_component_t struct containing meta information
           about the component itself */
        .btl_version =
            {
                MCA_BTL_DEFAULT_VERSION("ugni"),
                .mca_open_component = btl_ugni_component_open,
                .mca_close_component = btl_ugni_component_close,
                .mca_register_component_params = btl_ugni_component_register,
            },
        .btl_data = {.param_field = MCA_BASE_METADATA_PARAM_CHECKPOINT},

        .btl_init = mca_btl_ugni_component_init,
        .btl_progress = mca_btl_ugni_component_progress,
    }};

mca_base_var_enum_value_t rcache_values[] = {{MCA_BTL_UGNI_RCACHE_UDREG, "udreg"},
                                             {MCA_BTL_UGNI_RCACHE_GRDMA, "grdma"},
                                             {-1, NULL} /* sentinel */};

mca_base_var_enum_value_flag_t cdm_flags[] = {
    {.flag = GNI_CDM_MODE_FORK_NOCOPY,
     .string = "fork-no-copy",
     .conflicting_flag = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
    {.flag = GNI_CDM_MODE_FORK_FULLCOPY,
     .string = "fork-full-copy",
     .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_PARTCOPY},
    {.flag = GNI_CDM_MODE_FORK_PARTCOPY,
     .string = "fork-part-copy",
     .conflicting_flag = GNI_CDM_MODE_FORK_NOCOPY | GNI_CDM_MODE_FORK_FULLCOPY},
    {.flag = GNI_CDM_MODE_ERR_NO_KILL,
     .string = "err-no-kill",
     .conflicting_flag = GNI_CDM_MODE_ERR_ALL_KILL},
    {.flag = GNI_CDM_MODE_ERR_ALL_KILL,
     .string = "err-all-kill",
     .conflicting_flag = GNI_CDM_MODE_ERR_NO_KILL},
    {.flag = GNI_CDM_MODE_FAST_DATAGRAM_POLL,
     .string = "fast-datagram-poll",
     .conflicting_flag = 0},
    {.flag = GNI_CDM_MODE_BTE_SINGLE_CHANNEL,
     .string = "bte-single-channel",
     .conflicting_flag = 0},
    {.flag = GNI_CDM_MODE_USE_PCI_IOMMU,
     .string = "use-pci-iommu",
     .conflicting_flag = 0},
    {.flag = GNI_CDM_MODE_MDD_DEDICATED,
     .string = "mdd-dedicated",
     .conflicting_flag = GNI_CDM_MODE_MDD_SHARED},
    {.flag = GNI_CDM_MODE_MDD_SHARED,
     .string = "mdd-shared",
     .conflicting_flag = GNI_CDM_MODE_MDD_DEDICATED},
    {.flag = GNI_CDM_MODE_FMA_DEDICATED,
     .string = "fma-dedicated",
     .conflicting_flag = GNI_CDM_MODE_FMA_SHARED},
    {.flag = GNI_CDM_MODE_FMA_SHARED,
     .string = "fma-shared",
     .conflicting_flag = GNI_CDM_MODE_FMA_DEDICATED},
    {.flag = GNI_CDM_MODE_CACHED_AMO_ENABLED,
     .string = "cached-amo-enabled",
     .conflicting_flag = 0},
    {.flag = GNI_CDM_MODE_CQ_NIC_LOCAL_PLACEMENT,
     .string = "cq-nic-placement",
     .conflicting_flag = 0},
    {.flag = GNI_CDM_MODE_FMA_SMALL_WINDOW,
     .string = "fma-small-window",
     .conflicting_flag = 0},
    {.string = NULL}};

/* pvar read callback: the GNI statistic to query is stashed in the pvar context by the
 * registration loop in btl_ugni_component_register(); one value is reported per virtual
 * device. */
static inline int mca_btl_ugni_get_stat(const mca_base_pvar_t *pvar, void *value, void *obj)
{
    gni_statistic_t statistic =
(gni_statistic_t)(intptr_t) pvar->ctx; gni_return_t rc = GNI_RC_SUCCESS; for (int i = 0; i < mca_btl_ugni_component.virtual_device_count; ++i) { rc = GNI_GetNicStat(mca_btl_ugni_component.modules[0].devices[i].dev_handle, statistic, ((unsigned int *) value) + i); } return mca_btl_rc_ugni_to_opal(rc); } static inline int mca_btl_ugni_notify_stat(mca_base_pvar_t *pvar, mca_base_pvar_event_t event, void *obj, int *count) { if (MCA_BASE_PVAR_HANDLE_BIND == event) { /* one value for each virtual device handle */ *count = mca_btl_ugni_component.virtual_device_count; } return OPAL_SUCCESS; } static int btl_ugni_component_register(void) { mca_base_var_enum_t *new_enum; gni_nic_device_t device_type; char *mpool_hints_tmp = NULL; int rc; (void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version, "uGNI byte transport layer"); mca_btl_ugni_component.ugni_free_list_num = 8; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "free_list_num", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.ugni_free_list_num); mca_btl_ugni_component.ugni_free_list_max = 4096; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "free_list_max", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.ugni_free_list_max); mca_btl_ugni_component.ugni_free_list_inc = 64; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "free_list_inc", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.ugni_free_list_inc); mca_btl_ugni_component.ugni_eager_num = 16; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "eager_num", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.ugni_eager_num); mca_btl_ugni_component.ugni_eager_max = 128; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "eager_max", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.ugni_eager_max); mca_btl_ugni_component.ugni_eager_inc = 16; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "eager_inc", NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.ugni_eager_inc); mca_btl_ugni_component.remote_cq_size = 40000; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "remote_cq_size", "Remote SMSG completion queue " "size (default 40000)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.remote_cq_size); mca_btl_ugni_component.local_cq_size = 8192; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "local_cq_size", "Local SMSG completion queue size " "(default 8k)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.local_cq_size); mca_btl_ugni_component.local_rdma_cq_size = 1024; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "local_rdma_cq_size", "Local FMA/RDMA completion queue size " "(default: 1024)", MCA_BASE_VAR_TYPE_INT, 
                                           NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.local_rdma_cq_size);

    mca_btl_ugni_component.ugni_smsg_limit = 0;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "smsg_limit",
                                           "Maximum size message that "
                                           "will be sent using the SMSG/MSGQ protocol "
                                           "(0 - autoselect(default), 16k max)",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_smsg_limit);

    mca_btl_ugni_component.smsg_max_credits = 32;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "smsg_max_credits",
                                           "Maximum number of "
                                           "outstanding SMSG/MSGQ messages (default 32)",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.smsg_max_credits);

#if OPAL_C_HAVE__THREAD_LOCAL
    mca_btl_ugni_component.bind_threads_to_devices = true;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "bind_devices",
                                           "Bind threads to virtual "
                                           "devices. In general this should improve "
                                           "RDMA performance (default: true)",
                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.bind_threads_to_devices);
#endif

    mca_btl_ugni_component.ugni_fma_limit = -1;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "fma_limit",
                                           "Default maximum size message that "
                                           "will be sent using the FMA (Fast Memory "
                                           "Access) protocol (default: -1 (don't use), 64k max)",
                                           MCA_BASE_VAR_TYPE_LONG, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DEPRECATED,
                                           OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_fma_limit);

    mca_btl_ugni_component.ugni_fma_get_limit = 2048;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "fma_get_limit",
                                           "Maximum size message that "
                                           "will be sent using the FMA (Fast Memory "
                                           "Access) protocol for get (default 2k, "
                                           "64k max)",
                                           MCA_BASE_VAR_TYPE_LONG, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_fma_get_limit);

    mca_btl_ugni_component.ugni_fma_put_limit = 4096;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "fma_put_limit",
                                           "Maximum size message that "
                                           "will be sent using the FMA (Fast Memory "
                                           "Access) protocol for put (default: 4k, "
                                           "64k max)",
                                           MCA_BASE_VAR_TYPE_LONG, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.ugni_fma_put_limit);

    mca_btl_ugni_component.rdma_max_retries = 16;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "rdma_max_retries", NULL, MCA_BASE_VAR_TYPE_INT, NULL,
                                           0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.rdma_max_retries);

    mca_btl_ugni_component.smsg_max_retries = 16;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "smsg_max_retries", NULL, MCA_BASE_VAR_TYPE_INT, NULL,
                                           0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.smsg_max_retries);

    mca_btl_ugni_component.max_mem_reg = 0;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "max_mem_reg",
                                           "Maximum number of "
                                           "memory registrations a process can "
                                           "hold (0 - autoselect, -1 - unlimited)"
                                           " (default 0)",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.max_mem_reg);

    mca_btl_ugni_component.mbox_increment = 0;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "mbox_inc",
                                           "Number of SMSG mailboxes to "
                                           "allocate in each block (0 - autoselect(default))",
                                           MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.mbox_increment);

    /* communication domain flags */
    rc = mca_base_var_enum_create_flag("btl_ugni_cdm_flags", cdm_flags,
                                       (mca_base_var_enum_flag_t **) &new_enum);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    mca_btl_ugni_component.cdm_flags = GNI_CDM_MODE_FORK_PARTCOPY | GNI_CDM_MODE_ERR_NO_KILL
                                       | GNI_CDM_MODE_FAST_DATAGRAM_POLL | GNI_CDM_MODE_MDD_SHARED
                                       | GNI_CDM_MODE_FMA_SHARED | GNI_CDM_MODE_FMA_SMALL_WINDOW;
    mca_btl_ugni_component.cdm_flags_id = mca_base_component_var_register(
        &mca_btl_ugni_component.super.btl_version, "cdm_flags",
        "Flags to set when creating a communication domain "
        "(default: fork-part-copy,err-no-kill,fast-datagram-poll,"
        "mdd-shared,fma-shared,fma-small-window)",
        MCA_BASE_VAR_TYPE_UNSIGNED_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
        MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.cdm_flags);
    OBJ_RELEASE(new_enum);

    mca_btl_ugni_component.virtual_device_count = 0;
    (void) mca_base_component_var_register(
        &mca_btl_ugni_component.super.btl_version, "virtual_device_count",
        "Number of virtual devices to create. Higher numbers may "
        "result in better performance when using threads. (default: 0 (auto), max: 128)",
        MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
        MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.virtual_device_count);

    /* determine if there are get alignment restrictions */
    GNI_GetDeviceType(&device_type);

    mca_btl_ugni_component.smsg_page_size = 2 << 20;
    if (GNI_DEVICE_GEMINI == device_type) {
        /* access() returns 0 when the MRT sysfs entry is readable */
        if (0 == access("/sys/class/gemini/ghal0/mrt", R_OK)) {
            int fd = open("/sys/class/gemini/ghal0/mrt", O_RDONLY);
            char buffer[10];

            if (0 <= fd) {
                memset(buffer, 0, sizeof(buffer));
                read(fd, buffer, sizeof(buffer) - 1);
                close(fd);
                mca_btl_ugni_ugni_page_size = strtol(buffer, NULL, 10) * 1024;
                mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size;
            }
        }
    }

    (void) mca_base_component_var_register(
        &mca_btl_ugni_component.super.btl_version, "smsg_page_size",
        "Page size to use for SMSG mailbox allocation (default: detect)", MCA_BASE_VAR_TYPE_INT,
        NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
        &mca_btl_ugni_component.smsg_page_size);

    mca_btl_ugni_component.progress_thread_requested = 0;
    (void) mca_base_component_var_register(
        &mca_btl_ugni_component.super.btl_version, "request_progress_thread",
        "Enable to request ugni btl progress thread - requires MPI_THREAD_MULTIPLE support",
        MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
        MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.progress_thread_requested);

    /* performance variables */
    mca_btl_ugni_progress_thread_wakeups = 0;
    (void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version,
                                            "progress_thread_wakeups",
                                            "Number of times the progress thread "
                                            "has been woken",
                                            OPAL_INFO_LVL_9, MCA_BASE_PVAR_CLASS_COUNTER,
                                            MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL,
                                            MCA_BASE_VAR_BIND_NO_OBJECT,
                                            MCA_BASE_PVAR_FLAG_READONLY
                                                | MCA_BASE_PVAR_FLAG_CONTINUOUS,
                                            NULL, NULL, NULL,
                                            &mca_btl_ugni_progress_thread_wakeups);
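
    /*
     * The loop below exposes each GNI NIC statistic as an MPI_T performance
     * variable (counter class, one value per virtual device; see
     * mca_btl_ugni_get_stat and mca_btl_ugni_notify_stat above).  Illustrative
     * sketch of how a tool might read one of these counters through the
     * standard MPI_T interface -- the variable name used here is hypothetical;
     * the real names are the lowercased strings from gni_statistic_str:
     *
     *   int idx, count, provided;
     *   MPI_T_pvar_session session;
     *   MPI_T_pvar_handle handle;
     *   unsigned int values[128];
     *
     *   MPI_T_init_thread(MPI_THREAD_SINGLE, &provided);
     *   MPI_T_pvar_get_index("btl_ugni_smsg_tx", MPI_T_PVAR_CLASS_COUNTER, &idx);
     *   MPI_T_pvar_session_create(&session);
     *   MPI_T_pvar_handle_alloc(session, idx, NULL, &handle, &count);
     *   MPI_T_pvar_read(session, handle, values);
     */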

    /* register network statistics as performance variables */
    for (int i = 0; i < GNI_NUM_STATS; ++i) {
        char name[128], desc[128];
        size_t str_len = strlen(gni_statistic_str[i]);

        assert(str_len < sizeof(name));

        /* we can get an all-caps string for the variable from gni_statistic_str. need to make it
         * lowercase to match ompi standards */
        for (size_t j = 0; j < str_len; ++j) {
            name[j] = tolower(gni_statistic_str[i][j]);
            desc[j] = ('_' == name[j]) ? ' ' : name[j];
        }

        name[str_len] = '\0';
        desc[str_len] = '\0';

        (void) mca_base_component_pvar_register(&mca_btl_ugni_component.super.btl_version, name,
                                                desc, OPAL_INFO_LVL_4, MCA_BASE_PVAR_CLASS_COUNTER,
                                                MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL,
                                                MCA_BASE_VAR_BIND_NO_OBJECT,
                                                MCA_BASE_PVAR_FLAG_READONLY
                                                    | MCA_BASE_PVAR_FLAG_CONTINUOUS,
                                                mca_btl_ugni_get_stat, NULL,
                                                mca_btl_ugni_notify_stat, (void *) (intptr_t) i);
    }

    /* btl/ugni can only support a fixed set of rcache components (these rcache components have
     * compatible resource structures) */
    rc = mca_base_var_enum_create("btl_ugni_rcache", rcache_values, &new_enum);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    /* NTH: there are known *serious* performance issues with udreg. if they are ever resolved it
     * is the preferred rcache */
    mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_GRDMA;
    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "rcache",
                                           "registration cache to use (default: grdma)",
                                           MCA_BASE_VAR_TYPE_INT, new_enum, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.rcache_type);
    OBJ_RELEASE(new_enum);

    if (mca_btl_ugni_ugni_page_size) {
        rc = opal_asprintf(&mpool_hints_tmp, "page_size=%lu", mca_btl_ugni_ugni_page_size);
        if (rc < 0) {
            return OPAL_ERR_OUT_OF_RESOURCE;
        }
        mca_btl_ugni_component.mpool_hints = mpool_hints_tmp;
    } else {
        mca_btl_ugni_component.mpool_hints = "page_size=2M";
    }

    (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version,
                                           "mpool_hints",
                                           "hints to use when selecting a memory pool (default: "
                                           "\"page_size=2M\")",
                                           MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                           MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_btl_ugni_component.mpool_hints);
    free(mpool_hints_tmp);

    /* ensure we lose send exclusivity to sm and vader if they are enabled */
    mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 2;

    /* smsg threshold */
    mca_btl_ugni_module.super.btl_eager_limit = 8 * 1024;
    mca_btl_ugni_module.super.btl_rndv_eager_limit = 8 * 1024;
    mca_btl_ugni_module.super.btl_rdma_pipeline_frag_size = 4 * 1024 * 1024;
    mca_btl_ugni_module.super.btl_max_send_size = 8 * 1024;
    mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = 8 * 1024;

    mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024;

    /* see def. of ALIGNMENT_MASK to figure this one out */
    /* both gemini and aries have a 4-byte alignment requirement on remote addresses */
    mca_btl_ugni_module.super.btl_get_alignment = 4;

    /* threshold for put */
    mca_btl_ugni_module.super.btl_min_rdma_pipeline_size = 8 * 1024;

    mca_btl_ugni_module.super.btl_flags = MCA_BTL_FLAGS_SEND | MCA_BTL_FLAGS_RDMA
                                          | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_ATOMIC_OPS
                                          | MCA_BTL_FLAGS_ATOMIC_FOPS;
    mca_btl_ugni_module.super.btl_flags |= MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION;

    mca_btl_ugni_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD
                                                 | MCA_BTL_ATOMIC_SUPPORTS_AND
                                                 | MCA_BTL_ATOMIC_SUPPORTS_OR
                                                 | MCA_BTL_ATOMIC_SUPPORTS_XOR
                                                 | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;

    if (GNI_DEVICE_ARIES == device_type) {
        /* aries supports additional atomic operations */
        mca_btl_ugni_module.super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_MIN
                                                      | MCA_BTL_ATOMIC_SUPPORTS_MAX
                                                      | MCA_BTL_ATOMIC_SUPPORTS_LAND
                                                      | MCA_BTL_ATOMIC_SUPPORTS_LOR
                                                      | MCA_BTL_ATOMIC_SUPPORTS_LXOR
                                                      | MCA_BTL_ATOMIC_SUPPORTS_32BIT
                                                      | MCA_BTL_ATOMIC_SUPPORTS_FLOAT;
    }

    mca_btl_ugni_module.super.btl_registration_handle_size = sizeof(
        mca_btl_base_registration_handle_t);

    mca_btl_ugni_module.super.btl_bandwidth = 40000; /* Mbs */
    mca_btl_ugni_module.super.btl_latency = 2;       /* Microsecs */

    mca_btl_ugni_module.super.btl_get_local_registration_threshold = 0;
    mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component
                                                                         .ugni_fma_put_limit;

    /* Call the BTL base to register its MCA params */
    mca_btl_base_param_register(&mca_btl_ugni_component.super.btl_version,
                                &mca_btl_ugni_module.super);

    return OPAL_SUCCESS;
}

static int btl_ugni_component_open(void)
{
    mca_btl_ugni_component.ugni_num_btls = 0;
    mca_btl_ugni_component.modules = NULL;

    return OPAL_SUCCESS;
}

/*
 * component cleanup - sanity checking of queue lengths
 */
static int btl_ugni_component_close(void)
{
    mca_btl_ugni_fini();

    free(mca_btl_ugni_component.modules);
    mca_btl_ugni_component.modules = NULL;

    return OPAL_SUCCESS;
}

static mca_btl_base_module_t **mca_btl_ugni_component_init(int *num_btl_modules,
                                                           bool enable_progress_threads,
                                                           bool enable_mpi_threads)
{
    struct mca_btl_base_module_t **base_modules;
    mca_btl_ugni_module_t *ugni_modules;
    int rc;

    if (16384 < mca_btl_ugni_component.ugni_smsg_limit) {
        mca_btl_ugni_component.ugni_smsg_limit = 16384;
    }

    if (65536 < mca_btl_ugni_component.ugni_fma_limit) {
        mca_btl_ugni_component.ugni_fma_limit = 65536;
    }

    if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
        mca_btl_ugni_component.ugni_fma_get_limit = mca_btl_ugni_component.ugni_fma_limit;
    } else if (65536 < mca_btl_ugni_component.ugni_fma_get_limit) {
        mca_btl_ugni_component.ugni_fma_get_limit = 65536;
    }

    if (-1 != mca_btl_ugni_component.ugni_fma_limit) {
        mca_btl_ugni_component.ugni_fma_put_limit = mca_btl_ugni_component.ugni_fma_limit;
    } else if (65536 < mca_btl_ugni_component.ugni_fma_put_limit) {
        mca_btl_ugni_component.ugni_fma_put_limit = 65536;
    }

    mca_btl_ugni_module.super.btl_put_local_registration_threshold = mca_btl_ugni_component
                                                                         .ugni_fma_put_limit;

    /* limit the number of outstanding RDMA operations over all devices */
    mca_btl_ugni_component.active_rdma_threshold = mca_btl_ugni_component.local_rdma_cq_size;

    if (enable_mpi_threads && mca_btl_ugni_component.progress_thread_requested) {
        mca_btl_ugni_component.progress_thread_enabled = 1;
    }

    /* Initialize ugni library and create communication domain */
    rc = mca_btl_ugni_init();
    if (OPAL_SUCCESS != rc) {
        return NULL;
    }

    /* For now only create a single BTL module */
    mca_btl_ugni_component.ugni_num_btls = 1;
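
    /* Allocate the module array (currently a single module) and the array of base module
     * pointers that is handed back to the BTL framework.  Any failure below returns NULL,
     * which removes this component from consideration for the run. */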
BTL_VERBOSE(("btl/ugni initializing")); ugni_modules = mca_btl_ugni_component.modules = (mca_btl_ugni_module_t *) calloc(mca_btl_ugni_component.ugni_num_btls, sizeof(mca_btl_ugni_module_t)); if (OPAL_UNLIKELY(NULL == mca_btl_ugni_component.modules)) { BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); return NULL; } base_modules = (struct mca_btl_base_module_t **) calloc(mca_btl_ugni_component.ugni_num_btls, sizeof(struct mca_btl_base_module_t *)); if (OPAL_UNLIKELY(NULL == base_modules)) { BTL_ERROR(("Malloc failed : %s:%d", __FILE__, __LINE__)); return NULL; } if (mca_btl_ugni_component.smsg_page_size != (unsigned long) opal_getpagesize()) { if (mca_btl_ugni_ugni_page_size > mca_btl_ugni_component.smsg_page_size) { mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size; } } mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super .btl_eager_limit; rc = mca_btl_ugni_module_init(ugni_modules); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("Failed to initialize uGNI module @ %s:%d", __FILE__, __LINE__)); return NULL; } *base_modules = (mca_btl_base_module_t *) ugni_modules; *num_btl_modules = mca_btl_ugni_component.ugni_num_btls; BTL_VERBOSE(("btl/ugni done initializing %d module(s)", *num_btl_modules)); return base_modules; } int mca_btl_ugni_progress_datagram(mca_btl_ugni_device_t *device) { mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules; mca_btl_base_endpoint_t *ep = NULL; gni_ep_handle_t handle; int count = 0, rc; rc = mca_btl_ugni_get_datagram(ugni_module, device, &handle, &ep); if (1 != rc) { return rc; } BTL_VERBOSE(("remote datagram completion on handle %p", (void *) handle)); /* if this is a wildcard endpoint lookup the remote peer by the proc id we received */ if (handle == ugni_module->wildcard_ep) { struct opal_proc_t *remote_proc = opal_proc_for_name(ugni_module->wc_remote_attr.proc_name); BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s", OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name))); ep = mca_btl_ugni_get_ep(&ugni_module->super, remote_proc); if (OPAL_UNLIKELY(NULL == ep)) { /* there is no way to recover from this error so just abort() */ BTL_ERROR(("could not find/allocate a btl endpoint for peer %s", OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name))); abort(); return OPAL_ERR_NOT_FOUND; } } /* should not have gotten a NULL endpoint */ assert(NULL != ep); BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) ep, handle == ugni_module->wildcard_ep)); /* NTH: TODO -- error handling */ opal_mutex_lock(&ep->lock); if (handle != ugni_module->wildcard_ep) { /* directed post complete */ BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep)); ep->dg_posted = false; (void) opal_atomic_add_fetch_32(&ugni_module->active_datagrams, -1); } (void) mca_btl_ugni_ep_connect_progress(ep); opal_mutex_unlock(&ep->lock); if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) { /* process messages waiting in the endpoint's smsg mailbox */ count = mca_btl_ugni_smsg_process(ep); } /* repost the wildcard datagram */ if (handle == ugni_module->wildcard_ep) { mca_btl_ugni_wildcard_ep_post(ugni_module); } return count; } void mca_btl_ugni_handle_rdma_completions(mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device, struct mca_btl_ugni_post_descriptor_t *post_desc, const int count) { int bte_complete = 0; for (int i = 0; i < count; ++i) { BTL_VERBOSE(("post descriptor complete. 
status: %d", post_desc[i].rc)); if (OPAL_UNLIKELY(OPAL_SUCCESS != post_desc[i].rc)) { /* dump the post descriptor if in a debug build */ btl_ugni_dump_post_desc(post_desc + i); } bte_complete += post_desc[i].use_bte == true; mca_btl_ugni_post_desc_complete(ugni_module, post_desc + i, post_desc[i].rc); } if (bte_complete > 0) { (void) OPAL_THREAD_FETCH_ADD32(&ugni_module->active_rdma_count, -bte_complete); } } static inline int mca_btl_ugni_progress_rdma(mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device, mca_btl_ugni_cq_t *cq) { mca_btl_ugni_post_descriptor_t post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP]; int rc; rc = mca_btl_ugni_cq_get_completed_desc(device, cq, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP); if (0 >= rc) { return rc; } BTL_VERBOSE(("got %d completed rdma descriptors", rc)); mca_btl_ugni_handle_rdma_completions(ugni_module, device, post_desc, rc); return rc; } static inline int mca_btl_ugni_progress_wait_list(mca_btl_ugni_module_t *ugni_module) { int rc = OPAL_SUCCESS; opal_list_t tmplist; opal_list_t *waitlist = &ugni_module->ep_wait_list; mca_btl_base_endpoint_t *endpoint = NULL; int count; /* check the count before taking the lock to avoid unnecessary locking */ count = opal_list_get_size(waitlist); if (0 == count) { return 0; } /* Don't hold the wait-list lock while processing the list as that may lead * to a deadlock. * Instead, move the wait_list elements into a temporary list and work on that.*/ OBJ_CONSTRUCT(&tmplist, opal_list_t); OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock); opal_list_join(&tmplist, opal_list_get_end(&tmplist), waitlist); OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock); count = opal_list_get_size(&tmplist); do { endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first(&tmplist); if (endpoint != NULL) { rc = mca_btl_ugni_progress_send_wait_list(endpoint); if (OPAL_SUCCESS != rc) { opal_list_append(&tmplist, &endpoint->super); } else { endpoint->wait_listed = false; } } } while (endpoint != NULL && --count > 0); /* reinsert unfinished elements into the wait-list */ count = opal_list_get_size(&tmplist); if (0 < count) { OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock); opal_list_join(waitlist, opal_list_get_end(waitlist), &tmplist); OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock); } OBJ_DESTRUCT(&tmplist); return rc; } static int mca_btl_ugni_component_progress(void) { mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules; int count = 0; count += mca_btl_ugni_progress_remote_smsg(ugni_module); if (ugni_module->active_datagrams) { count += mca_btl_ugni_progress_datagram(ugni_module->devices); } for (int i = 0; i < mca_btl_ugni_component.virtual_device_count; ++i) { mca_btl_ugni_device_t *device = ugni_module->devices + i; if (device->smsg_connections) { count += mca_btl_ugni_progress_local_smsg(ugni_module, device); mca_btl_ugni_progress_wait_list(ugni_module); } if (device->dev_rdma_local_cq.active_operations) { count += mca_btl_ugni_progress_rdma(ugni_module, device, &device->dev_rdma_local_cq); } if (mca_btl_ugni_component.progress_thread_enabled && device->dev_rdma_local_irq_cq.active_operations) { count += mca_btl_ugni_progress_rdma(ugni_module, device, &device->dev_rdma_local_irq_cq); } } return count; } int mca_btl_ugni_flush(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint) { mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules; for (int i = 0; i < mca_btl_ugni_component.virtual_device_count; ++i) { mca_btl_ugni_device_t *device = ugni_module->devices + i; /* 
spin on progress until all active operations are complete. it is tempting to * take an initial count then wait until that many operations have been completed * but it is impossible to tell if those are the operations the caller is waiting * on. */ while (device->dev_rdma_local_cq.active_operations) { (void) mca_btl_ugni_progress_rdma(ugni_module, device, &device->dev_rdma_local_cq); } /* mark that the device was recently flushed */ device->flushed = true; } return OPAL_SUCCESS; } void btl_ugni_dump_post_desc(mca_btl_ugni_post_descriptor_t *desc) { fprintf(stderr, "desc->gni_desc.post_id = %" PRIx64 "\n", desc->gni_desc.post_id); fprintf(stderr, "desc->gni_desc.status = %" PRIx64 "\n", desc->gni_desc.status); fprintf(stderr, "desc->gni_desc.cq_mode_complete = %hu\n", desc->gni_desc.cq_mode_complete); fprintf(stderr, "desc->gni_desc.type = %d\n", desc->gni_desc.type); fprintf(stderr, "desc->gni_desc.cq_mode = %hu\n", desc->gni_desc.cq_mode); fprintf(stderr, "desc->gni_desc.dlvr_mode = %hu\n", desc->gni_desc.dlvr_mode); fprintf(stderr, "desc->gni_desc.local_addr = %" PRIx64 "\n", desc->gni_desc.local_addr); fprintf(stderr, "desc->gni_desc.local_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.local_mem_hndl.qword1, desc->gni_desc.local_mem_hndl.qword2); fprintf(stderr, "desc->gni_desc.remote_addr = %" PRIx64 "\n", desc->gni_desc.remote_addr); fprintf(stderr, "desc->gni_desc.remote_mem_hndl = {%" PRIx64 ", %" PRIx64 "}\n", desc->gni_desc.remote_mem_hndl.qword1, desc->gni_desc.remote_mem_hndl.qword2); fprintf(stderr, "desc->gni_desc.length = %" PRIu64 "\n", desc->gni_desc.length); fprintf(stderr, "desc->gni_desc.rdma_mode = %hu\n", desc->gni_desc.rdma_mode); fprintf(stderr, "desc->gni_desc.amo_cmd = %d\n", desc->gni_desc.amo_cmd); }
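
/*
 * Usage note (illustrative, not compiled): the control variables registered in
 * btl_ugni_component_register() are exposed as MCA parameters under a
 * "btl_ugni_" prefix and can be set at launch time, for example:
 *
 *   mpirun --mca btl_ugni_smsg_limit 4096 \
 *          --mca btl_ugni_virtual_device_count 4 \
 *          --mca btl_ugni_rcache grdma ./app
 *
 * The exact option syntax depends on the launcher in use; the parameter names
 * come from the register calls above.
 */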