/*
 * Copyright (c) 2008      UT-Battelle, LLC. All rights reserved.
 * Copyright (c) 2010      Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2020      Intel, Inc. All rights reserved.
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation. All rights reserved.
 * Copyright (c) 2004-2022 The University of Tennessee and The University
 *                         of Tennessee Research Foundation. All rights
 *                         reserved.
 * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2006 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
 * Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
 * Copyright (c) 2022      IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "opal/prefetch.h"
#include "opal/runtime/opal_params.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/mpool/mpool.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
#include "pml_ob1.h"
#include "pml_ob1_hdr.h"
#include "pml_ob1_rdmafrag.h"
#include "pml_ob1_recvreq.h"
#include "pml_ob1_sendreq.h"
#include "ompi/mca/bml/base/base.h"
#include "opal/types.h"
#include "pml_ob1_accelerator.h"
#include "opal/mca/accelerator/base/base.h"

static opal_accelerator_stream_t *dtoh_stream = NULL;
static opal_accelerator_stream_t *htod_stream = NULL;
static opal_mutex_t pml_ob1_accelerator_htod_lock;
static opal_mutex_t pml_ob1_accelerator_dtoh_lock;

/* Arrays of accelerator events to be queried on the sending side (dtoh)
 * and the receiving side (htod). */
static opal_accelerator_event_t **accelerator_event_dtoh_array = NULL;
static opal_accelerator_event_t **accelerator_event_htod_array = NULL;

/* Arrays of fragments currently being moved by accelerator async non-blocking
 * operations. */
static struct mca_btl_base_descriptor_t **accelerator_event_dtoh_frag_array = NULL;
static struct mca_btl_base_descriptor_t **accelerator_event_htod_frag_array = NULL;

/* First free/available slot in the event arrays */
static int accelerator_event_dtoh_first_avail, accelerator_event_htod_first_avail;

/* First currently-in-use slot in the event arrays */
static int accelerator_event_dtoh_first_used, accelerator_event_htod_first_used;

/* Number of events currently in use */
static volatile int accelerator_event_dtoh_num_used, accelerator_event_htod_num_used;

/* High-water mark of HtoD events in use at any one time */
static int accelerator_event_htod_most = 0;
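/*
 * The *_first_avail / *_first_used / *_num_used triplets above implement a
 * fixed-size circular buffer of mca_pml_ob1_accelerator_events_max slots:
 * new events are recorded at first_avail and completed events are retired
 * from first_used. A minimal sketch of the invariant (illustrative only,
 * not compiled code from this file):
 *
 *     record:    slot = first_avail;
 *                first_avail = (first_avail + 1) % events_max;
 *                num_used++;
 *
 *     complete:  slot = first_used;
 *                first_used = (first_used + 1) % events_max;
 *                num_used--;
 *
 * num_used == events_max means the buffer is full, and recording fails with
 * OPAL_ERR_OUT_OF_RESOURCE until some outstanding events complete.
 */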
int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
{
    int result;

    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
        return 0;
    }

    /* First make sure there is room to store the event. If not, return an
     * error. The error message tells the user to rerun with a larger array
     * for storing events. */
    OPAL_THREAD_LOCK(&pml_ob1_accelerator_htod_lock);
    if (accelerator_event_htod_num_used == mca_pml_ob1_accelerator_events_max) {
        opal_output_verbose(1, mca_pml_ob1_output,
                            "Out of event handles. Max: %d. Suggest rerunning with a larger value: "
                            "--mca pml_ob1_accelerator_events_max %d.",
                            mca_pml_ob1_accelerator_events_max,
                            mca_pml_ob1_accelerator_events_max + 100);
        OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (accelerator_event_htod_num_used > accelerator_event_htod_most) {
        accelerator_event_htod_most = accelerator_event_htod_num_used;
        /* Only print at multiples of 10 */
        if (0 == (accelerator_event_htod_most % 10)) {
            opal_output_verbose(20, mca_pml_ob1_output, "Maximum HtoD events used is now %d",
                                accelerator_event_htod_most);
        }
    }

    result = opal_accelerator.record_event(MCA_ACCELERATOR_NO_DEVICE_ID,
                                           accelerator_event_htod_array[accelerator_event_htod_first_avail],
                                           htod_stream);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
        opal_output_verbose(1, mca_pml_ob1_output, "Event Record failed.");
        OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
        return OPAL_ERROR;
    }
    accelerator_event_htod_frag_array[accelerator_event_htod_first_avail] = frag;

    /* Bump the first available slot and the number used by 1 */
    accelerator_event_htod_first_avail++;
    if (accelerator_event_htod_first_avail >= mca_pml_ob1_accelerator_events_max) {
        accelerator_event_htod_first_avail = 0;
    }
    accelerator_event_htod_num_used++;

    OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
    return OPAL_SUCCESS;
}

opal_accelerator_stream_t *mca_pml_ob1_get_dtoh_stream(void)
{
    return dtoh_stream;
}

opal_accelerator_stream_t *mca_pml_ob1_get_htod_stream(void)
{
    return htod_stream;
}
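/*
 * Hedged usage sketch (hypothetical caller, not part of this file): a
 * receive path that stages host-to-device copies asynchronously would
 * typically launch the copy on the htod stream and then record an event,
 * which is later polled by mca_pml_ob1_progress_one_htod_event(), e.g.:
 *
 *     // assumes an async-copy entry point in the accelerator framework;
 *     // the exact name and signature are not defined in this file
 *     launch_async_htod_copy(dst, src, size, mca_pml_ob1_get_htod_stream());
 *     mca_pml_ob1_record_htod_event("rget frag", frag);
 *
 * The frag pointer handed to the record call is returned by the progress
 * call once the copy has completed.
 */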
/**
 * Progress any htod event completions.
 */
int mca_pml_ob1_progress_one_htod_event(struct mca_btl_base_descriptor_t **frag)
{
    int result;

    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
        return 0;
    }

    OPAL_THREAD_LOCK(&pml_ob1_accelerator_htod_lock);
    if (accelerator_event_htod_num_used > 0) {
        opal_output_verbose(30, mca_pml_ob1_output,
                            "mca_pml_ob1_progress_one_htod_event, outstanding_events=%d",
                            accelerator_event_htod_num_used);

        result = opal_accelerator.query_event(MCA_ACCELERATOR_NO_DEVICE_ID,
                                              accelerator_event_htod_array[accelerator_event_htod_first_used]);

        /* The oldest outstanding event is not ready yet, so return. */
        if (OPAL_ERR_RESOURCE_BUSY == result) {
            opal_output_verbose(30, mca_pml_ob1_output,
                                "Accelerator event query returned OPAL_ERR_RESOURCE_BUSY");
            *frag = NULL;
            OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
            return 0;
        } else if (OPAL_SUCCESS != result) {
            opal_output_verbose(1, mca_pml_ob1_output, "Accelerator event query failed: %d.", result);
            *frag = NULL;
            OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
            return OPAL_ERROR;
        }

        *frag = accelerator_event_htod_frag_array[accelerator_event_htod_first_used];

        /* Bump counters, wrapping around the circular buffer if necessary */
        --accelerator_event_htod_num_used;
        ++accelerator_event_htod_first_used;
        if (accelerator_event_htod_first_used >= mca_pml_ob1_accelerator_events_max) {
            accelerator_event_htod_first_used = 0;
        }

        /* A return value of 1 indicates an event completed and a frag was returned */
        OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
        return 1;
    }
    OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
    return 0;
}
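/*
 * Hedged usage sketch (hypothetical caller, not part of this file): the PML
 * progress path can drain completed copies one at a time, e.g.:
 *
 *     struct mca_btl_base_descriptor_t *frag = NULL;
 *     while (1 == mca_pml_ob1_progress_one_htod_event(&frag)) {
 *         // hand the completed fragment back to the receive protocol
 *         handle_completed_htod_frag(frag);   // hypothetical helper
 *     }
 *
 * A return of 0 means either no events are outstanding or the oldest one has
 * not completed; a negative value indicates a query error.
 */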
int mca_pml_ob1_accelerator_init(void)
{
    int rc = OPAL_SUCCESS;
    int result = OPAL_SUCCESS;
    int i;

    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
        return 0;
    }

    OBJ_CONSTRUCT(&pml_ob1_accelerator_htod_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&pml_ob1_accelerator_dtoh_lock, opal_mutex_t);

    /* Create streams */
    result = opal_accelerator.create_stream(MCA_ACCELERATOR_NO_DEVICE_ID, &dtoh_stream);
    if (OPAL_SUCCESS != result) {
        opal_output_verbose(1, mca_pml_ob1_output, "Failed to create accelerator dtoh stream.");
        rc = result;
        goto cleanup_and_error;
    }

    result = opal_accelerator.create_stream(MCA_ACCELERATOR_NO_DEVICE_ID, &htod_stream);
    if (OPAL_SUCCESS != result) {
        opal_output_verbose(1, mca_pml_ob1_output, "Failed to create accelerator htod stream.");
        rc = result;
        goto cleanup_and_error;
    }

    /* Set up an array of pointers to store outstanding async dtoh events.
     * Used on the sending side for asynchronous copies. */
    accelerator_event_dtoh_num_used = 0;
    accelerator_event_dtoh_first_avail = 0;
    accelerator_event_dtoh_first_used = 0;

    accelerator_event_dtoh_array = calloc(mca_pml_ob1_accelerator_events_max,
                                          sizeof(opal_accelerator_event_t *));
    if (NULL == accelerator_event_dtoh_array) {
        opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
        rc = OPAL_ERROR;
        goto cleanup_and_error;
    }

    /* Create the events up front since they can be reused. */
    for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
        result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID,
                                               &accelerator_event_dtoh_array[i]);
        if (OPAL_SUCCESS != result) {
            opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
            rc = OPAL_ERROR;
            goto cleanup_and_error;
        }
    }

    /* The first available status index is 0. Make an empty frag array. */
    accelerator_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
        sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
    if (NULL == accelerator_event_dtoh_frag_array) {
        opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
        rc = OPAL_ERROR;
        goto cleanup_and_error;
    }

    /* Set up an array of pointers to store outstanding async htod events.
     * Used on the receiving side for asynchronous copies. */
    accelerator_event_htod_num_used = 0;
    accelerator_event_htod_first_avail = 0;
    accelerator_event_htod_first_used = 0;

    accelerator_event_htod_array = calloc(mca_pml_ob1_accelerator_events_max,
                                          sizeof(opal_accelerator_event_t *));
    if (NULL == accelerator_event_htod_array) {
        opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
        rc = OPAL_ERROR;
        goto cleanup_and_error;
    }

    /* Create the events up front since they can be reused. */
    for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
        result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID,
                                               &accelerator_event_htod_array[i]);
        if (OPAL_SUCCESS != result) {
            opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
            rc = OPAL_ERROR;
            goto cleanup_and_error;
        }
    }

    /* The first available status index is 0. Make an empty frag array. */
    accelerator_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
        sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
    if (NULL == accelerator_event_htod_frag_array) {
        opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
        rc = OPAL_ERROR;
        goto cleanup_and_error;
    }

cleanup_and_error:
    if (OPAL_SUCCESS != rc) {
        if (NULL != accelerator_event_dtoh_array) {
            free(accelerator_event_dtoh_array);
        }
        if (NULL != accelerator_event_dtoh_frag_array) {
            free(accelerator_event_dtoh_frag_array);
        }
        if (NULL != accelerator_event_htod_array) {
            free(accelerator_event_htod_array);
        }
        if (NULL != accelerator_event_htod_frag_array) {
            free(accelerator_event_htod_frag_array);
        }
        OBJ_DESTRUCT(&pml_ob1_accelerator_htod_lock);
        OBJ_DESTRUCT(&pml_ob1_accelerator_dtoh_lock);
    }

    return rc;
}

void mca_pml_ob1_accelerator_fini(void)
{
    int i;

    if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
        return;
    }

    if (NULL != accelerator_event_htod_array) {
        for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
            if (NULL != accelerator_event_htod_array[i]) {
                OBJ_RELEASE(accelerator_event_htod_array[i]);
            }
        }
        free(accelerator_event_htod_array);
    }

    if (NULL != accelerator_event_dtoh_array) {
        for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
            if (NULL != accelerator_event_dtoh_array[i]) {
                OBJ_RELEASE(accelerator_event_dtoh_array[i]);
            }
        }
        free(accelerator_event_dtoh_array);
    }

    if (NULL != accelerator_event_dtoh_frag_array) {
        free(accelerator_event_dtoh_frag_array);
    }
    if (NULL != accelerator_event_htod_frag_array) {
        free(accelerator_event_htod_frag_array);
    }

    OBJ_RELEASE(htod_stream);
    OBJ_RELEASE(dtoh_stream);
    OBJ_DESTRUCT(&pml_ob1_accelerator_htod_lock);
    OBJ_DESTRUCT(&pml_ob1_accelerator_dtoh_lock);
}
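/*
 * Hedged lifecycle sketch (an assumption about the calling context, not
 * verified in this file): init is expected to run once when the ob1 PML is
 * set up with a non-null accelerator component selected, and fini once at
 * teardown, e.g.:
 *
 *     if (OPAL_SUCCESS != (rc = mca_pml_ob1_accelerator_init())) {
 *         return rc;                      // streams/events unavailable
 *     }
 *     ...
 *     mca_pml_ob1_accelerator_fini();     // releases events, streams, locks
 */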
size_t mca_pml_ob1_rdma_cuda_btls(mca_bml_base_endpoint_t *bml_endpoint, unsigned char *base,
                                  size_t size, mca_pml_ob1_com_btl_t *rdma_btls);

int mca_pml_ob1_accelerator_need_buffers(void *rreq, mca_btl_base_module_t *btl);

void mca_pml_ob1_accelerator_add_ipc_support(struct mca_btl_base_module_t *btl, int32_t flags,
                                             ompi_proc_t *errproc, char *btlinfo);

/**
 * Handle the accelerator buffer.
 */
int mca_pml_ob1_send_request_start_accelerator(mca_pml_ob1_send_request_t *sendreq,
                                               mca_bml_base_btl_t *bml_btl, size_t size)
{
    int rc;
#if OPAL_CUDA_GDR_SUPPORT
    /* With some BTLs, switch from RGET to RNDV for large messages */
    if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_ACCELERATOR)
        && (sendreq->req_send.req_bytes_packed
            > (bml_btl->btl->btl_accelerator_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
        return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
    }
#endif /* OPAL_CUDA_GDR_SUPPORT */

    if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
        unsigned char *base;
        opal_convertor_get_current_pointer(&sendreq->req_send.req_base.req_convertor, (void **) &base);

        if (0 != (sendreq->req_rdma_cnt = (uint32_t) mca_pml_ob1_rdma_cuda_btls(
                      sendreq->req_endpoint, base, sendreq->req_send.req_bytes_packed,
                      sendreq->req_rdma))) {
            rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
                                                     sendreq->req_send.req_bytes_packed);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
                mca_pml_ob1_free_rdma_resources(sendreq);
            }
        } else {
            if (bml_btl->btl_flags & MCA_BTL_FLAGS_ACCELERATOR_PUT) {
                rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, size,
                                                         MCA_PML_OB1_HDR_FLAGS_CONTIG);
            } else {
                rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
            }
        }
    } else {
        /* Do not send anything with the first rendezvous message, as copying GPU
         * memory into the RNDV message is expensive. */
        rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
    }
    return rc;
}

size_t mca_pml_ob1_rdma_cuda_btls(mca_bml_base_endpoint_t *bml_endpoint, unsigned char *base,
                                  size_t size, mca_pml_ob1_com_btl_t *rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
    double weight_total = 0;
    int num_btls_used = 0, n;

    /* Shortcut when there are no RDMA-capable BTLs */
    if (num_btls == 0) {
        return 0;
    }

    /* Check to see if the memory is registered */
    for (n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);

        if (bml_btl->btl_flags & MCA_BTL_FLAGS_ACCELERATOR_GET) {
            mca_btl_base_registration_handle_t *handle = NULL;

            if (NULL != bml_btl->btl->btl_register_mem) {
                /* Register the memory */
                handle = bml_btl->btl->btl_register_mem(bml_btl->btl, bml_btl->btl_endpoint, base, size,
#if OPAL_CUDA_GDR_SUPPORT
                                                        MCA_BTL_REG_FLAG_CUDA_GPU_MEM |
#endif
                                                        MCA_BTL_REG_FLAG_REMOTE_READ);
            }

            if (NULL == handle) {
                continue;
            }

            rdma_btls[num_btls_used].bml_btl = bml_btl;
            rdma_btls[num_btls_used].btl_reg = handle;
            weight_total += bml_btl->btl_weight;
            num_btls_used++;
        }
    }

    /* If we don't use leave_pinned and the BTLs that already have this memory
     * registered amount to less than half of the available bandwidth, fall back
     * to the pipeline protocol. */
    if (0 == num_btls_used || (!opal_leave_pinned && weight_total < 0.5)) {
        return 0;
    }

    mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size, weight_total);

    return num_btls_used;
}
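/*
 * Illustrative note (the numbers are hypothetical): if two accelerator-capable
 * BTLs register the buffer with weights 0.3 and 0.1, weight_total is 0.4;
 * without leave_pinned that falls below the 0.5 cutoff, so the function
 * returns 0 and ob1 falls back to the pipelined protocol. With weights 0.4
 * and 0.3 (total 0.7), mca_pml_ob1_calc_weighted_length() splits the message
 * so each BTL GETs a share proportional to its weight.
 */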
int mca_pml_ob1_accelerator_need_buffers(void *rreq, mca_btl_base_module_t *btl)
{
    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) rreq;
    mca_bml_base_endpoint_t *bml_endpoint = mca_bml_base_get_endpoint(recvreq->req_recv.req_base.req_proc);
    mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_send, btl);

    /* A BTL could be in the rdma list but not in the send list, so check there as well */
    if (NULL == bml_btl) {
        bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, btl);
    }

    /* We should always be able to find the bml_btl based on the btl */
    assert(NULL != bml_btl);

    if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_ACCELERATOR)
        && (bml_btl->btl_flags & MCA_BTL_FLAGS_ACCELERATOR_GET)) {
        return opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor);
    }
    return true;
}

/*
 * This function enables us to start using the RDMA get protocol with GPU buffers.
 * We do this by adjusting the flags in the BML structure. This is not the
 * best thing, but this may go away if CUDA IPC is supported everywhere in the
 * future.
 */
void mca_pml_ob1_accelerator_add_ipc_support(struct mca_btl_base_module_t *btl, int32_t flags,
                                             ompi_proc_t *errproc, char *btlinfo)
{
    mca_bml_base_endpoint_t *ep;
    int btl_verbose_stream = 0;
    int i;

    assert(NULL != errproc);
    assert(NULL != errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
    if (NULL != btlinfo) {
        btl_verbose_stream = *(int *) btlinfo;
    }
    ep = (mca_bml_base_endpoint_t *) errproc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];

    /* Find the corresponding bml entry and adjust the flags to support accelerator get */
    for (i = 0; i < (int) ep->btl_send.arr_size; i++) {
        if (ep->btl_send.bml_btls[i].btl == btl) {
            if (4 < opal_output_get_verbosity(btl_verbose_stream)) {
                char *errhost = opal_get_proc_hostname(&errproc->super);
                opal_output(0,
                            "BTL %s: rank=%d enabling accelerator IPC "
                            "to rank=%d on node=%s \n",
                            btl->btl_component->btl_version.mca_component_name,
                            OMPI_PROC_MY_NAME->vpid,
                            ((ompi_process_name_t *) &errproc->super.proc_name)->vpid, errhost);
                free(errhost);
            }
            ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_ACCELERATOR_GET;
        }
    }
}
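/*
 * Hedged note on the caller: the signature above mirrors the BTL module error
 * callback (btl, flags, errproc, btlinfo), so the expected flow -- an
 * assumption about the surrounding code, not verified here -- is roughly:
 *
 *     // inside the PML error handler, when a BTL reports that IPC has become
 *     // available to a peer (flag name below is hypothetical):
 *     if (flags & MCA_BTL_ERROR_FLAGS_ADD_ACCELERATOR_IPC) {
 *         mca_pml_ob1_accelerator_add_ipc_support(btl, flags, errproc, btlinfo);
 *     }
 *
 * Once the flag is set, mca_pml_ob1_rdma_cuda_btls() will start selecting this
 * BTL for the GET protocol on GPU buffers.
 */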