/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * $COPYRIGHT$
 * Copyright (c) 2018 Intel Inc. All rights reserved
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "btl_ofi.h"
#include "btl_ofi_frag.h"
#include "btl_ofi_rdma.h"

#if OPAL_HAVE_THREAD_LOCAL
opal_thread_local mca_btl_ofi_context_t *my_context = NULL;
#endif /* OPAL_HAVE_THREAD_LOCAL */

/* Initialize the per-context free lists: RDMA completions and, in two-sided
 * mode, fragment completions and the fragment pool. */
int init_context_freelists(mca_btl_ofi_context_t *context)
{
    int rc;

    OBJ_CONSTRUCT(&context->rdma_comp_list, opal_free_list_t);
    rc = opal_free_list_init(&context->rdma_comp_list, sizeof(mca_btl_ofi_rdma_completion_t),
                             opal_cache_line_size, OBJ_CLASS(mca_btl_ofi_rdma_completion_t),
                             0, 0, 512, -1, 512, NULL, 0, NULL, NULL, NULL);
    if (rc != OPAL_SUCCESS) {
        BTL_VERBOSE(("cannot allocate RDMA completion freelist"));
        return rc;
    }

    if (TWO_SIDED_ENABLED) {
        OBJ_CONSTRUCT(&context->frag_comp_list, opal_free_list_t);
        rc = opal_free_list_init(&context->frag_comp_list,
                                 sizeof(mca_btl_ofi_frag_completion_t), opal_cache_line_size,
                                 OBJ_CLASS(mca_btl_ofi_frag_completion_t),
                                 0, 0, 512, -1, 512, NULL, 0, NULL, NULL, NULL);
        if (rc != OPAL_SUCCESS) {
            BTL_VERBOSE(("cannot allocate fragment completion freelist"));
            return rc;
        }

        /* Initialize the frag pool. */
        OBJ_CONSTRUCT(&context->frag_list, opal_free_list_t);
        rc = opal_free_list_init(&context->frag_list,
                                 sizeof(mca_btl_ofi_base_frag_t) + MCA_BTL_OFI_FRAG_SIZE,
                                 opal_cache_line_size, OBJ_CLASS(mca_btl_ofi_base_frag_t),
                                 0, 0, 1024, -1, 1024, NULL, 0, NULL, NULL, NULL);
        if (OPAL_SUCCESS != rc) {
            BTL_VERBOSE(("failed to init frag pool (free_list)"));
        }
    }

    return rc;
}

/* mca_btl_ofi_context_alloc_normal()
 *
 * This function allocates an ofi_context, maps the endpoint to the tx/rx
 * context, binds the CQ and AV to the endpoint, and initializes all the
 * structures.
 * USE WITH NORMAL ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info,
                                                        struct fid_domain *domain,
                                                        struct fid_ep *ep, struct fid_av *av)
{
    int rc;
    uint32_t cq_flags = FI_TRANSMIT | FI_SEND | FI_RECV;
    char *linux_device_name = info->domain_attr->name;

    struct fi_cq_attr cq_attr = {0};

    mca_btl_ofi_context_t *context;

    context = (mca_btl_ofi_context_t *) calloc(1, sizeof(*context));
    if (NULL == context) {
        BTL_VERBOSE(("cannot allocate context"));
        return NULL;
    }

    /* No real need to check here; this only avoids a compiler warning,
     * because BTL_VERBOSE is a no-op in a performance build and the compiler
     * would complain about an unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto single_fail;
    }

    cq_attr.format = FI_CQ_FORMAT_CONTEXT;
    cq_attr.wait_obj = FI_WAIT_NONE;
    rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_cq_open with err=%s", linux_device_name, fi_strerror(-rc)));
        goto single_fail;
    }

    rc = fi_ep_bind(ep, (fid_t) av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", linux_device_name, fi_strerror(-rc)));
        goto single_fail;
    }

    rc = fi_ep_bind(ep, (fid_t) context->cq, cq_flags);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", linux_device_name, fi_strerror(-rc)));
        goto single_fail;
    }

    rc = init_context_freelists(context);
    if (rc != OPAL_SUCCESS) {
        goto single_fail;
    }

    /* A normal endpoint serves as both its own tx and rx context. */
    context->tx_ctx = ep;
    context->rx_ctx = ep;
    context->context_id = 0;

    return context;

single_fail:
    mca_btl_ofi_context_finalize(context, false);
    return NULL;
}
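/* Usage sketch (illustrative only; the real caller lives in the OFI BTL
 * component setup code): with a normal endpoint, a single context is
 * allocated and later torn down with scalable_ep = false. The variable
 * names below are hypothetical and error handling is elided.
 *
 *     mca_btl_ofi_context_t *ctx =
 *         mca_btl_ofi_context_alloc_normal(info, domain, ep, av);
 *     if (NULL == ctx) {
 *         return OPAL_ERROR;
 *     }
 *     ... use ctx->tx_ctx / ctx->cq for communication ...
 *     mca_btl_ofi_context_finalize(ctx, false);
 *     free(ctx);
 */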
/* mca_btl_ofi_context_alloc_scalable()
 *
 * This function allocates num_contexts communication contexts and returns a
 * pointer to the first btl context. It also takes care of all the bindings
 * needed.
 * USE WITH SCALABLE ENDPOINT ONLY */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
                                                          struct fid_domain *domain,
                                                          struct fid_ep *sep, struct fid_av *av,
                                                          size_t num_contexts)
{
    BTL_VERBOSE(("creating %zu contexts", num_contexts));

    int rc;
    size_t i;
    char *linux_device_name = info->domain_attr->name;

    struct fi_cq_attr cq_attr = {0};
    struct fi_tx_attr tx_attr = {0};
    struct fi_rx_attr rx_attr = {0};

    mca_btl_ofi_context_t *contexts;
    tx_attr.op_flags = FI_DELIVERY_COMPLETE;

    contexts = (mca_btl_ofi_context_t *) calloc(num_contexts, sizeof(*contexts));
    if (NULL == contexts) {
        BTL_VERBOSE(("cannot allocate communication contexts."));
        return NULL;
    }

    /* No real need to check here; this only avoids a compiler warning,
     * because BTL_VERBOSE is a no-op in a performance build and the compiler
     * would complain about an unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto scalable_fail;
    }

    /* bind the AV to the endpoint */
    rc = fi_scalable_ep_bind(sep, (fid_t) av, 0);
    if (0 != rc) {
        BTL_VERBOSE(
            ("%s failed fi_scalable_ep_bind with err=%s", linux_device_name, fi_strerror(-rc)));
        goto scalable_fail;
    }

    for (i = 0; i < num_contexts; i++) {
        rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(
                ("%s failed fi_tx_context with err=%s", linux_device_name, fi_strerror(-rc)));
            goto scalable_fail;
        }

        /* We don't actually need a receiving context, since we only do
         * one-sided communication. However, the sockets provider will hang if
         * we don't have one. It is also nice to have an equal number of tx/rx
         * contexts. */
        rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(
                ("%s failed fi_rx_context with err=%s", linux_device_name, fi_strerror(-rc)));
            goto scalable_fail;
        }

        /* create the CQ */
        cq_attr.format = FI_CQ_FORMAT_CONTEXT;
        cq_attr.wait_obj = FI_WAIT_NONE;
        rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_cq_open with err=%s", linux_device_name,
                         fi_strerror(-rc)));
            goto scalable_fail;
        }

        /* bind the CQ to the transmit context */
        rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t) contexts[i].cq, FI_TRANSMIT);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", linux_device_name,
                         fi_strerror(-rc)));
            goto scalable_fail;
        }

        /* bind the CQ to the receiving context */
        if (TWO_SIDED_ENABLED) {
            rc = fi_ep_bind(contexts[i].rx_ctx, (fid_t) contexts[i].cq, FI_RECV);
            if (0 != rc) {
                BTL_VERBOSE(
                    ("%s failed fi_ep_bind with err=%s", linux_device_name, fi_strerror(-rc)));
                goto scalable_fail;
            }
        }

        /* enable the contexts */
        rc = fi_enable(contexts[i].tx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s", linux_device_name,
                         fi_strerror(-rc)));
            goto scalable_fail;
        }

        rc = fi_enable(contexts[i].rx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s", linux_device_name,
                         fi_strerror(-rc)));
            goto scalable_fail;
        }

        /* initialize the freelists */
        rc = init_context_freelists(&contexts[i]);
        if (rc != OPAL_SUCCESS) {
            goto scalable_fail;
        }

        /* assign the id */
        contexts[i].context_id = i;
    }

    return contexts;

scalable_fail:
    /* close and free everything created so far */
    for (i = 0; i < num_contexts; i++) {
        mca_btl_ofi_context_finalize(&contexts[i], true);
    }
    free(contexts);

    return NULL;
}
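/* Usage sketch (illustrative only; the real caller lives in the OFI BTL
 * component setup code): with a scalable endpoint, the whole array of
 * contexts is allocated in one call and must be finalized per-context and
 * freed by the caller. The variable names below are hypothetical.
 *
 *     mca_btl_ofi_context_t *ctxs =
 *         mca_btl_ofi_context_alloc_scalable(info, domain, sep, av, num_contexts);
 *     if (NULL == ctxs) {
 *         return OPAL_ERROR;
 *     }
 *     ...
 *     for (size_t i = 0; i < num_contexts; i++) {
 *         mca_btl_ofi_context_finalize(&ctxs[i], true);
 *     }
 *     free(ctxs);
 */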
void mca_btl_ofi_context_finalize(mca_btl_ofi_context_t *context, bool scalable_ep)
{
    /* If this is a scalable endpoint, the tx/rx contexts are separate fids
     * and we have to close them ourselves. For a normal endpoint they alias
     * the endpoint itself and are closed with it. */
    if (scalable_ep) {
        if (NULL != context->tx_ctx) {
            fi_close(&context->tx_ctx->fid);
        }

        if (NULL != context->rx_ctx) {
            fi_close(&context->rx_ctx->fid);
        }
    }

    if (NULL != context->cq) {
        fi_close(&context->cq->fid);
    }

    /* We must not destruct a freelist that was never constructed (e.g. on the
     * scalable allocation failure path); fl_num_allocated is 0 in that case. */
    if (context->rdma_comp_list.fl_num_allocated != 0) {
        OBJ_DESTRUCT(&context->rdma_comp_list);
    }

    if (TWO_SIDED_ENABLED) {
        if (context->frag_comp_list.fl_num_allocated != 0) {
            OBJ_DESTRUCT(&context->frag_comp_list);
        }
        if (context->frag_list.fl_num_allocated != 0) {
            OBJ_DESTRUCT(&context->frag_list);
        }
    }
}

/* Get a context to use for communication.
 * If TLS is supported, the thread keeps using its cached context.
 * If not, fall back to the normal round-robin assignment. */
mca_btl_ofi_context_t *get_ofi_context(mca_btl_ofi_module_t *btl)
{
#if OPAL_HAVE_THREAD_LOCAL
    /* With TLS, we cache the context assigned to this thread. */
    static volatile int64_t cur_num = 0;

    if (OPAL_UNLIKELY(my_context == NULL)) {
        OPAL_THREAD_LOCK(&btl->module_lock);

        my_context = &btl->contexts[cur_num];
        cur_num = (cur_num + 1) % btl->num_contexts;

        OPAL_THREAD_UNLOCK(&btl->module_lock);
    }

    assert(my_context);
    return my_context;
#else
    return get_ofi_context_rr(btl);
#endif
}

/* Return the contexts in round-robin order.
 * No atomics here: strict round-robin ordering is not required, and atomics
 * might hurt performance. */
mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl)
{
    static volatile uint64_t rr_num = 0;

    return &btl->contexts[rr_num++ % btl->num_contexts];
}

int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context)
{
    int ret = 0;
    int events_read;
    int events = 0;
    struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
    struct fi_cq_err_entry cqerr = {0};

    mca_btl_ofi_completion_context_t *c_ctx;
    mca_btl_ofi_base_completion_t *comp;
    mca_btl_ofi_rdma_completion_t *rdma_comp;
    mca_btl_ofi_frag_completion_t *frag_comp;

    ret = fi_cq_read(context->cq, cq_entry, mca_btl_ofi_component.num_cqe_read);

    if (0 < ret) {
        events_read = ret;
        for (int i = 0; i < events_read; i++) {
            if (NULL != cq_entry[i].op_context) {
                ++events;
                c_ctx = (mca_btl_ofi_completion_context_t *) cq_entry[i].op_context;

                /* We cast comp to every completion type here just for
                 * simplicity; only the pointer matching comp->type is used. */
                comp = (mca_btl_ofi_base_completion_t *) c_ctx->comp;
                frag_comp = (mca_btl_ofi_frag_completion_t *) c_ctx->comp;
                rdma_comp = (mca_btl_ofi_rdma_completion_t *) c_ctx->comp;

                switch (comp->type) {
                case MCA_BTL_OFI_TYPE_GET:
                case MCA_BTL_OFI_TYPE_PUT:
                case MCA_BTL_OFI_TYPE_AOP:
                case MCA_BTL_OFI_TYPE_AFOP:
                case MCA_BTL_OFI_TYPE_CSWAP:
                    /* call the callback */
                    if (rdma_comp->cbfunc) {
                        rdma_comp->cbfunc(comp->btl, comp->endpoint, rdma_comp->local_address,
                                          rdma_comp->local_handle, rdma_comp->cbcontext,
                                          rdma_comp->cbdata, OPAL_SUCCESS);
                    }

                    MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t *) comp->btl);
                    break;

                case MCA_BTL_OFI_TYPE_RECV:
                    mca_btl_ofi_recv_frag((mca_btl_ofi_module_t *) comp->btl,
                                          (mca_btl_ofi_endpoint_t *) comp->endpoint, context,
                                          frag_comp->frag);
                    break;

                case MCA_BTL_OFI_TYPE_SEND:
                    MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t *) comp->btl);
                    mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
                    break;

                default:
                    /* catastrophic */
                    BTL_ERROR(("unknown completion type"));
                    MCA_BTL_OFI_ABORT();
                }

                /* return the completion handler to its freelist */
                opal_free_list_return(comp->my_list, (opal_free_list_item_t *) comp);
            }
        }
    } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
        ret = fi_cq_readerr(context->cq, &cqerr, 0);
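        /* According to the fi_cq(3) man page, fi_cq_readerr() returns the
         * number of error entries read (at most 1 here) on success, or a
         * negative error code on failure. A negative value therefore means we
         * could not even retrieve the details of the original error. */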
        /* fi_cq_readerr itself failed!? */
        if (0 > ret) {
            BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", __FILE__, __LINE__,
                       fi_strerror(-ret), ret));
        } else {
            BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", cqerr.prov_errno));
        }
        MCA_BTL_OFI_ABORT();
    }
#ifdef FI_EINTR
    /* Sometimes the sockets provider complains about an interrupt.
     * We do nothing here. */
    else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {
    }
#endif
    /* If the error is not FI_EAGAIN, report the error and abort. */
    else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
        BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
        MCA_BTL_OFI_ABORT();
    }

    return events;
}
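/* Usage sketch (illustrative only; the real driver is the component progress
 * function): each module's contexts are typically polled in turn and the
 * completion counts accumulated. The function name below is hypothetical.
 *
 *     static int example_progress_all(mca_btl_ofi_module_t *btl)
 *     {
 *         int events = 0;
 *         for (size_t i = 0; i < btl->num_contexts; i++) {
 *             events += mca_btl_ofi_context_progress(&btl->contexts[i]);
 *         }
 *         return events;
 *     }
 */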