/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2022 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007-2016 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2014-2021 Cisco Systems, Inc.  All rights reserved
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2022      IBM Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "pml_ob1.h"
#include "pml_ob1_sendreq.h"
#include "pml_ob1_recvreq.h"
#include "ompi/peruse/peruse-internal.h"
#include "ompi/runtime/ompi_spc.h"
#if MPI_VERSION >= 4
#include "ompi/mca/pml/base/pml_base_sendreq.h"
#endif

/**
 * Single usage request. As we allow recursive calls (for example from
 * the request completion callback), we cannot rely on using a global
 * request. Thus, once a send acquires ownership of this global request,
 * it should set it to NULL to prevent its reuse until the first user
 * completes.
 */
mca_pml_ob1_send_request_t *mca_pml_ob1_sendreq = NULL;

int mca_pml_ob1_isend_init(const void *buf,
                           size_t count,
                           ompi_datatype_t * datatype,
                           int dst,
                           int tag,
                           mca_pml_base_send_mode_t sendmode,
                           ompi_communicator_t * comm,
                           ompi_request_t ** request)
{
    mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst);
    mca_pml_ob1_send_request_t *sendreq = NULL;

    MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
    if (NULL == sendreq)
        return OMPI_ERR_OUT_OF_RESOURCE;

    MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                  buf,
                                  count,
                                  datatype,
                                  dst, tag,
                                  comm, sendmode, true, ob1_proc);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(sendreq)->req_send.req_base,
                             PERUSE_SEND);

    /* Work around a leak in start by marking this request as complete. The
     * problem occurs because we do not have a way to differentiate between an
     * initial request and an incomplete pml request in start. This line
     * allows us to detect this state. */
    sendreq->req_send.req_base.req_pml_complete = true;

    *request = (ompi_request_t *) sendreq;
    return OMPI_SUCCESS;
}

/* try to get a small message out onto the wire quickly */
static inline int mca_pml_ob1_send_inline (const void *buf, size_t count,
                                           ompi_datatype_t * datatype,
                                           int dst, int tag, int16_t seqn,
                                           ompi_proc_t *dst_proc,
                                           mca_pml_ob1_comm_proc_t *ob1_proc,
                                           mca_bml_base_endpoint_t* endpoint,
                                           ompi_communicator_t * comm)
{
    mca_pml_ob1_match_hdr_t match;
    mca_bml_base_btl_t *bml_btl;
    opal_convertor_t convertor;
    size_t size;
    int rc;

    bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
    if( NULL == bml_btl || NULL == bml_btl->btl->btl_sendi)
        return OMPI_ERR_NOT_AVAILABLE;

    ompi_datatype_type_size (datatype, &size);
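    /* The cutoff test below uses the datatype size times the count; the
     * convertor-packed size (which is what actually goes on the wire, and may
     * differ for heterogeneous peers) is only computed once the message is
     * known to be small enough for the inline path. */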
    /* the size used here was picked based on performance on a Cray XE-6. it should probably
     * be provided by the btl module */
    if ((size * count) > 256 || -1 == ob1_proc->comm_index) {
        return OMPI_ERR_NOT_AVAILABLE;
    }

    if (count > 0) {
        /* initialize just enough of the convertor to avoid a SEGV in opal_convertor_cleanup */
        OBJ_CONSTRUCT(&convertor, opal_convertor_t);

        /* We will create a convertor specialized for the        */
        /* remote architecture and prepared with the datatype.   */
        opal_convertor_copy_and_prepare_for_send (dst_proc->super.proc_convertor,
                                                  (const struct opal_datatype_t *) datatype,
                                                  count, buf, 0, &convertor);
        opal_convertor_get_packed_size (&convertor, &size);
    } else {
        size = 0;
    }

    mca_pml_ob1_match_hdr_prepare (&match, MCA_PML_OB1_HDR_TYPE_MATCH, 0,
                                   ob1_proc->comm_index, comm->c_my_rank,
                                   tag, seqn);

    ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, dst_proc);

    /* try to send immediately */
    rc = mca_bml_base_sendi (bml_btl, &convertor, &match, OMPI_PML_OB1_MATCH_HDR_LEN,
                             size, MCA_BTL_NO_ORDER,
                             MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
                             MCA_PML_OB1_HDR_TYPE_MATCH, NULL);

    /* This #if is required due to an issue that arises with the IBM CI (XL Compiler).
     * The compiler doesn't seem to like having a compiler hint attached to an if
     * statement that only has a no-op inside, and fails with the following error:
     *
     * 1586-494 (U) INTERNAL COMPILER ERROR: Signal 11.
     */
#if SPC_ENABLE == 1
    if(OPAL_LIKELY(rc == OPAL_SUCCESS)) {
        SPC_USER_OR_MPI(tag, (ompi_spc_value_t)size, OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI);
    }
#endif

    if (count > 0) {
        opal_convertor_cleanup (&convertor);
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    return (int) size;
}

int mca_pml_ob1_isend(const void *buf,
                      size_t count,
                      ompi_datatype_t * datatype,
                      int dst,
                      int tag,
                      mca_pml_base_send_mode_t sendmode,
                      ompi_communicator_t * comm,
                      ompi_request_t ** request)
{
    mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst);
    mca_pml_ob1_send_request_t *sendreq = NULL;
    ompi_proc_t *dst_proc = ob1_proc->ompi_proc;
    mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc);
    int16_t seqn = 0;
    int rc;

    if (OPAL_UNLIKELY(NULL == endpoint)) {
#if OPAL_ENABLE_FT_MPI
        if (!dst_proc->proc_active) {
            goto alloc_ft_req;
        }
#endif /* OPAL_ENABLE_FT_MPI */
        return OMPI_ERR_UNREACH;
    }

    if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm) || 0 > tag) {
        seqn = (uint16_t) OPAL_THREAD_ADD_FETCH32(&ob1_proc->send_sequence, 1);
    }

    if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) {
        rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc,
                                      ob1_proc, endpoint, comm);
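        /* A non-negative return value is the number of bytes handed to the
         * BTL (the message went out inline); a negative value is an OMPI
         * error code and simply means we fall through to the regular
         * send-request path below. */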
        if (OPAL_LIKELY(0 <= rc)) {
            /* NTH: it is legal to return ompi_request_empty since the only valid
             * field in a send completion status is whether or not the send was
             * cancelled (which it can't be at this point anyway). */
#if MPI_VERSION >= 4
            if (OPAL_UNLIKELY(OMPI_PML_BASE_WARN_DEP_CANCEL_SEND_NEVER != ompi_pml_base_warn_dep_cancel_send_level)) {
                *request = &ompi_request_empty_send;
                return OMPI_SUCCESS;
            }
#endif
            *request = &ompi_request_empty;
            return OMPI_SUCCESS;
        }
    }

    MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
    if (NULL == sendreq)
        return OMPI_ERR_OUT_OF_RESOURCE;

    MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                  buf,
                                  count,
                                  datatype,
                                  dst, tag,
                                  comm, sendmode, false, ob1_proc);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(sendreq)->req_send.req_base,
                             PERUSE_SEND);

    MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc);
    *request = (ompi_request_t *) sendreq;
    return rc;

#if OPAL_ENABLE_FT_MPI
alloc_ft_req:
    MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
    if (NULL == sendreq)
        return OMPI_ERR_OUT_OF_RESOURCE;

    MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                  buf,
                                  count,
                                  datatype,
                                  dst, tag,
                                  comm, sendmode, false, ob1_proc);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &(sendreq)->req_send.req_base,
                             PERUSE_SEND);

    /* No point in starting the request, it won't go through; mark it completed
     * in error for collection in a future wait */
    sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR =
        ompi_comm_is_revoked(comm) ? MPI_ERR_REVOKED : MPI_ERR_PROC_FAILED;
    MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, false);
    OPAL_OUTPUT_VERBOSE((2, ompi_ftmpi_output_handle,
                         "Allocating request in error %p (peer %d, seq %" PRIu64 ") with error code %d",
                         (void*) sendreq, dst,
                         sendreq->req_send.req_base.req_sequence,
                         sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR));
    *request = (ompi_request_t *) sendreq;
    return OMPI_SUCCESS;
#endif /* OPAL_ENABLE_FT_MPI */
}

int mca_pml_ob1_send(const void *buf,
                     size_t count,
                     ompi_datatype_t * datatype,
                     int dst,
                     int tag,
                     mca_pml_base_send_mode_t sendmode,
                     ompi_communicator_t * comm)
{
    mca_pml_ob1_comm_proc_t *ob1_proc = mca_pml_ob1_peer_lookup (comm, dst);
    ompi_proc_t *dst_proc = ob1_proc->ompi_proc;
    mca_bml_base_endpoint_t* endpoint = mca_bml_base_get_endpoint (dst_proc);
    mca_pml_ob1_send_request_t *sendreq = NULL;
    int16_t seqn = 0;
    int rc;

    if (OPAL_UNLIKELY(NULL == endpoint)) {
#if OPAL_ENABLE_FT_MPI
        if (!dst_proc->proc_active) {
            return MPI_ERR_PROC_FAILED;
        }
#endif /* OPAL_ENABLE_FT_MPI */
        return OMPI_ERR_UNREACH;
    }

    if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_BUFFERED == sendmode)) {
        /* large buffered sends *need* a real request so use isend instead */
        ompi_request_t *brequest;

        rc = mca_pml_ob1_isend (buf, count, datatype, dst, tag, sendmode, comm, &brequest);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }

        ompi_request_wait_completion (brequest);
        ompi_request_free (&brequest);
        return OMPI_SUCCESS;
    }

    if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm) || 0 > tag) {
        seqn = (uint16_t) OPAL_THREAD_ADD_FETCH32(&ob1_proc->send_sequence, 1);
    }
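    /* Synchronous sends cannot complete until the receiver has matched the
     * message, so they always need a real send request; any other send mode
     * that reaches this point is eligible for the inline fast path below. */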
    /**
     * Immediate sends do not carry a request, so they are invisible from
     * the point of view of any debugger attached to the parallel
     * application.
     */
    if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) {
        rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, seqn, dst_proc,
                                      ob1_proc, endpoint, comm);
        if (OPAL_LIKELY(0 <= rc)) {
            return OMPI_SUCCESS;
        }
    }

    if (OPAL_LIKELY(!ompi_mpi_thread_multiple)) {
        sendreq = mca_pml_ob1_sendreq;
        mca_pml_ob1_sendreq = NULL;
    }

    if( OPAL_UNLIKELY(NULL == sendreq) ) {
        MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
        if (NULL == sendreq)
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    sendreq->req_send.req_base.req_proc = dst_proc;
    sendreq->rdma_frag = NULL;

    MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                  buf,
                                  count,
                                  datatype,
                                  dst, tag,
                                  comm, sendmode, false, ob1_proc);

    PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
                             &sendreq->req_send.req_base,
                             PERUSE_SEND);

    MCA_PML_OB1_SEND_REQUEST_START_W_SEQ(sendreq, endpoint, seqn, rc);
    if (OPAL_LIKELY(rc == OMPI_SUCCESS)) {
        ompi_request_wait_completion(&sendreq->req_send.req_base.req_ompi);

        rc = sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR;
    }

    if (OPAL_UNLIKELY(ompi_mpi_thread_multiple || NULL != mca_pml_ob1_sendreq)) {
        MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq);
    } else {
        mca_pml_ob1_send_request_fini (sendreq);
        mca_pml_ob1_sendreq = sendreq;
    }

    return rc;
}