/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2021 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2008 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2012 Oracle and/or its affiliates.  All rights reserved.
 * Copyright (c) 2012      Oak Ridge National Labs.  All rights reserved.
 * Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2016      Mellanox Technologies. All rights reserved.
 * Copyright (c) 2016      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "ompi/constants.h"
#include "ompi/request/request.h"
#include "ompi/request/request_default.h"
#include "ompi/request/grequest.h"

int ompi_request_default_wait(
    ompi_request_t ** req_ptr,
    ompi_status_public_t * status)
{
    ompi_request_t *req = *req_ptr;

    ompi_request_wait_completion(req);

#if OPAL_ENABLE_FT_MPI
    /* Special case for MPI_ANY_SOURCE */
    if( MPI_ERR_PROC_FAILED_PENDING == req->req_status.MPI_ERROR ) {
        if( MPI_STATUS_IGNORE != status ) {
            OMPI_COPY_STATUS(status, req->req_status, false);
        }
        return MPI_ERR_PROC_FAILED_PENDING;
    }
#endif /* OPAL_ENABLE_FT_MPI */

    /* return status.  If it's a generalized request, we *have* to
       invoke the query_fn, even if the user provided STATUS_IGNORE.
       MPI-2:8.2. */
    if (OMPI_REQUEST_GEN == req->req_type) {
        ompi_grequest_invoke_query(req, &req->req_status);
    }
    if( MPI_STATUS_IGNORE != status ) {
        OMPI_COPY_STATUS(status, req->req_status, false);
    }

    if( req->req_persistent ) {
        if( req->req_state == OMPI_REQUEST_INACTIVE ) {
            if (MPI_STATUS_IGNORE != status) {
                OMPI_COPY_STATUS(status, ompi_status_empty, false);
            }
            return OMPI_SUCCESS;
        }
        req->req_state = OMPI_REQUEST_INACTIVE;
        return req->req_status.MPI_ERROR;
    }

    /* If there was an error, don't free the request -- just return
       the single error. */
    if (MPI_SUCCESS != req->req_status.MPI_ERROR) {
        return req->req_status.MPI_ERROR;
    }

    /* If there's an error while freeing the request, assume that the
       request is still there.  Otherwise, Bad Things will happen
       later! */
    return ompi_request_free(req_ptr);
}
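
/*
 * The multi-request waits below (any/all/some) share one completion
 * scheme: a local ompi_wait_sync_t is attached to every still-pending
 * request by atomically swapping its req_complete field from
 * REQUEST_PENDING to &sync, the caller blocks in SYNC_WAIT(), and the
 * sync is later detached by swapping req_complete back to
 * REQUEST_PENDING.  A detach CAS that fails means the request completed
 * in the meantime.
 */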
int ompi_request_default_wait_any(size_t count,
                                  ompi_request_t ** requests,
                                  int *index,
                                  ompi_status_public_t * status)
{
    size_t i, completed = count, num_requests_null_inactive = 0;
    int rc = OMPI_SUCCESS;
    ompi_request_t *request = NULL;
    ompi_wait_sync_t sync;

    if (OPAL_UNLIKELY(0 == count)) {
        *index = MPI_UNDEFINED;
        return OMPI_SUCCESS;
    }

  recheck:
    WAIT_SYNC_INIT(&sync, 1);

    num_requests_null_inactive = 0;
    for (i = 0; i < count; i++) {
        void *_tmp_ptr = REQUEST_PENDING;

        request = requests[i];

        /* Check for null or completed persistent request. For
         * MPI_REQUEST_NULL, the req_state is always OMPI_REQUEST_INACTIVE.
         */
        if( request->req_state == OMPI_REQUEST_INACTIVE ) {
            num_requests_null_inactive++;
            continue;
        }

        if( !OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, &sync) ) {
            if(OPAL_LIKELY( REQUEST_COMPLETE(request) )) {
                completed = i;
                *index = i;
                goto after_sync_wait;
            }
        }
#if OPAL_ENABLE_FT_MPI
        if(OPAL_UNLIKELY( ompi_request_is_failed(request) )) {
            completed = i;
            *index = i;
            goto after_sync_wait;
        }
#endif /* OPAL_ENABLE_FT_MPI */
    }

    if(num_requests_null_inactive == count) {
        *index = MPI_UNDEFINED;
        if (MPI_STATUS_IGNORE != status) {
            OMPI_COPY_STATUS(status, ompi_status_empty, false);
        }
        /* No signal can be in flight in this case */
        WAIT_SYNC_RELEASE_NOWAIT(&sync);
        return rc;
    }

    rc = SYNC_WAIT(&sync);

  after_sync_wait:
    /* Recheck the complete status and clean up the sync primitives.
     * Do it backward to return the earliest complete request to the
     * user. */
    for(i = completed-1; (i+1) > 0; i--) {
        void *tmp_ptr = &sync;

        request = requests[i];

        if( request->req_state == OMPI_REQUEST_INACTIVE ) {
            continue;
        }
        /* Atomically mark the request as pending. If this succeeds then
         * the request was not completed, and it is now marked as pending.
         * Otherwise, the request has been completed meanwhile, and it
         * has been atomically marked as REQUEST_COMPLETE.
         */
        if( !OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &tmp_ptr, REQUEST_PENDING) ) {
            *index = i;
        }
    }

    /* Error path: SYNC_WAIT was interrupted by an error.
     * We do this after the cleanup loop to make sure nobody is updating the
     * sync again while we are rearming it.
     */
    if(OPAL_UNLIKELY( OMPI_SUCCESS != rc )) {
        rc = OMPI_SUCCESS;
        WAIT_SYNC_RELEASE(&sync);
        goto recheck;
    }

    if( *index == (int)completed ) {
        /* Only one request has triggered. There were no in-flight
         * completions. Drop the signalled flag so we won't block
         * in WAIT_SYNC_RELEASE. */
        WAIT_SYNC_SIGNALLED(&sync);
    }

    request = requests[*index];
#if OPAL_ENABLE_FT_MPI
    /* Special case for MPI_ANY_SOURCE */
    if( MPI_ERR_PROC_FAILED == request->req_status.MPI_ERROR ) {
        WAIT_SYNC_RELEASE(&sync);
        return MPI_ERR_PROC_FAILED_PENDING;
    }
#endif /* OPAL_ENABLE_FT_MPI */
    assert( REQUEST_COMPLETE(request) );
    /* Per note above, we have to call the gen request query_fn even
       if STATUS_IGNORE was provided */
    if (OMPI_REQUEST_GEN == request->req_type) {
        rc = ompi_grequest_invoke_query(request, &request->req_status);
    }
    if (MPI_STATUS_IGNORE != status) {
        OMPI_COPY_STATUS(status, request->req_status, false);
    }
    rc = request->req_status.MPI_ERROR;
    if( request->req_persistent ) {
        request->req_state = OMPI_REQUEST_INACTIVE;
    } else if (MPI_SUCCESS == rc) {
        /* Only free the request if there is no error on it. */
        /* If there's an error while freeing the request,
           assume that the request is still there.
           Otherwise, Bad Things will happen later!
         */
        rc = ompi_request_free(&requests[*index]);
    }

    WAIT_SYNC_RELEASE(&sync);
    return rc;
}
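
/*
 * Default implementation behind MPI_Waitall: block until every active
 * request in the array has completed.  Completed non-persistent requests
 * are freed and persistent ones are marked inactive; if any request
 * completed in error the call reports MPI_ERR_IN_STATUS, and requests
 * that are neither failed nor completed are flagged MPI_ERR_PENDING in
 * their status entries.
 */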
int ompi_request_default_wait_all( size_t count,
                                   ompi_request_t ** requests,
                                   ompi_status_public_t * statuses )
{
    size_t i, completed = 0, failed = 0;
    ompi_request_t **rptr;
    ompi_request_t *request;
    int mpi_error = OMPI_SUCCESS;
    ompi_wait_sync_t sync;

    if (OPAL_UNLIKELY(0 == count)) {
        return OMPI_SUCCESS;
    }

  recheck:
    WAIT_SYNC_INIT(&sync, count);
    rptr = requests;
    for (i = 0; i < count; i++) {
        void *_tmp_ptr = REQUEST_PENDING;

        request = *rptr++;

        if( request->req_state == OMPI_REQUEST_INACTIVE ) {
            completed++;
            continue;
        }

        if (REQUEST_COMPLETE(request) ||
            !OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, &sync)) {
            if( OPAL_LIKELY( REQUEST_COMPLETE(request) ) ) {
                if( OPAL_UNLIKELY( MPI_SUCCESS != request->req_status.MPI_ERROR ) ) {
                    failed++;
                }
                completed++;
            }
        }
#if OPAL_ENABLE_FT_MPI
        if(OPAL_UNLIKELY( ompi_request_is_failed(request) )) {
            failed++;
            continue;
        }
#endif /* OPAL_ENABLE_FT_MPI */
    }

    if( failed > 0 ) {
        /* We are completing only one here, let's prevent blocking in the
         * SYNC_RELEASE by marking the sync as SIGNALED */
        WAIT_SYNC_SIGNALLED(&sync);
        goto finish;
    }

    if( 0 != completed ) {
        wait_sync_update(&sync, completed, OPAL_SUCCESS);
    }

    /* Wait until all requests complete or until an error is triggered. */
    mpi_error = SYNC_WAIT(&sync);
    if( OPAL_SUCCESS != mpi_error ) {
        /* The sync triggered because of an error. The error may be for us, but
         * it may be for some other pending wait, so we have to recheck
         * our request status.
         *
         * We are going to rearm the sync, but first make sure it is not
         * updated by any progress thread meanwhile by removing it from all
         * requests it has been attached to.
         */
        rptr = requests;
        for (i = 0; i < count; i++) {
            void *_tmp_ptr = &sync;

            request = *rptr++;

            if( request->req_state == OMPI_REQUEST_INACTIVE ) {
                continue;
            }
            OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, REQUEST_PENDING);
        }
        /* The sync is now ready for rearming */
        WAIT_SYNC_RELEASE(&sync);
        failed = completed = 0;
        goto recheck;
    }

 finish:
    rptr = requests;
    if (MPI_STATUSES_IGNORE != statuses) {
        /* Fill out the status and free the request if required */
        for( i = 0; i < count; i++, rptr++ ) {
            void *_tmp_ptr = &sync;

            request = *rptr;

            if( request->req_state == OMPI_REQUEST_INACTIVE ) {
                OMPI_COPY_STATUS(&statuses[i], ompi_status_empty, true);
                continue;
            }

            if( OPAL_UNLIKELY(0 < failed) ) {
                /* If we have failed requests we skipped waiting on the sync. Thus,
                 * some of the requests might not be properly completed, in which case
                 * we must detach all requests from the sync. However, if we can successfully
                 * mark the request as pending then it is neither failed nor complete, and
                 * we must stop altering it.
                 */
                if( OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, REQUEST_PENDING ) ) {
                    /*
                     * Per MPI 2.2 p 60:
                     * Allows requests to be marked as MPI_ERR_PENDING if they are
                     * "neither failed nor completed." Which can only happen if
                     * there was an error in one of the other requests.
                     */
                    statuses[i].MPI_ERROR = MPI_ERR_PENDING;
#if OPAL_ENABLE_FT_MPI
                    /* PROC_FAILED_PENDING errors are also not completed yet */
                    if( MPI_ERR_PROC_FAILED_PENDING == requests[i]->req_status.MPI_ERROR ) {
                        statuses[i].MPI_ERROR = MPI_ERR_PROC_FAILED_PENDING;
                    }
#endif /* OPAL_ENABLE_FT_MPI */
                    mpi_error = MPI_ERR_IN_STATUS;
                    continue;
                }
            }
            assert( REQUEST_COMPLETE(request) );

            if (OMPI_REQUEST_GEN == request->req_type) {
                ompi_grequest_invoke_query(request, &request->req_status);
            }

            OMPI_COPY_STATUS(&statuses[i], request->req_status, true);

            if( request->req_persistent ) {
                request->req_state = OMPI_REQUEST_INACTIVE;
                continue;
            }

            /* Only free the request if there is no error on it */
            if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
                /* If there's an error while freeing the request, assume
                   that the request is still there.  Otherwise, Bad Things
                   will happen later! */
                int tmp = ompi_request_free(rptr);
                if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) {
                    mpi_error = tmp;
                }
            }
            if( statuses[i].MPI_ERROR != OMPI_SUCCESS) {
                mpi_error = MPI_ERR_IN_STATUS;
            }
        }
    } else {
        int rc;
        /* Free the request if required */
        for( i = 0; i < count; i++, rptr++ ) {
            void *_tmp_ptr = &sync;

            request = *rptr;

            if( request->req_state == OMPI_REQUEST_INACTIVE ) {
                rc = ompi_status_empty.MPI_ERROR;
                goto absorb_error_and_continue;
            }
            /*
             * Assert only if no requests have failed, since some may
             * still be pending.
             */
            if( OPAL_UNLIKELY(0 < failed) ) {
                /* If the request is still pending due to a failed request
                 * then skip it in this loop.
                 */
                if( OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, REQUEST_PENDING ) ) {
                    /*
                     * Per MPI 2.2 p 60:
                     * Allows requests to be marked as MPI_ERR_PENDING if they are
                     * "neither failed nor completed." Which can only happen if
                     * there was an error in one of the other requests.
                     */
                    rc = MPI_ERR_PENDING;
#if OPAL_ENABLE_FT_MPI
                    /* PROC_FAILED_PENDING errors are also not completed yet */
                    if( MPI_ERR_PROC_FAILED_PENDING == requests[i]->req_status.MPI_ERROR ) {
                        rc = MPI_ERR_PROC_FAILED_PENDING;
                    }
#endif /* OPAL_ENABLE_FT_MPI */
                    goto absorb_error_and_continue;
                }
            }
            assert( REQUEST_COMPLETE(request) );
            /* Per note above, we have to call the gen request query_fn even
               if STATUSES_IGNORE was provided */
            if (OMPI_REQUEST_GEN == request->req_type) {
                rc = ompi_grequest_invoke_query(request, &request->req_status);
            }

            rc = request->req_status.MPI_ERROR;
            if( request->req_persistent ) {
                request->req_state = OMPI_REQUEST_INACTIVE;
            } else if (MPI_SUCCESS == rc) {
                /* Only free the request if there is no error on it */
                int tmp = ompi_request_free(rptr);
                if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) {
                    mpi_error = tmp;
                }
            }
  absorb_error_and_continue:
#if OPAL_ENABLE_FT_MPI
            if( (MPI_ERR_PROC_FAILED == rc) ||
                (MPI_ERR_REVOKED == rc) ) {
                mpi_error = rc;
            }
#endif /* OPAL_ENABLE_FT_MPI */
            /*
             * Per MPI 2.2 p34:
             * "It is possible for an MPI function to return MPI_ERR_IN_STATUS
             *  even when MPI_STATUS_IGNORE or MPI_STATUSES_IGNORE has been
             *  passed to that function."
             * So we should do so here as well.
             */
            if( OMPI_SUCCESS == mpi_error && rc != OMPI_SUCCESS) {
                mpi_error = MPI_ERR_IN_STATUS;
            }
        }
    }
    WAIT_SYNC_RELEASE(&sync);
    return mpi_error;
}
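
/*
 * Default implementation behind MPI_Waitsome: block until at least one
 * active request completes, then report every request found complete at
 * that point via *outcount and the indices array (statuses are filled in
 * unless MPI_STATUSES_IGNORE was given).  If all requests are
 * MPI_REQUEST_NULL or inactive, *outcount is set to MPI_UNDEFINED.
 */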
int ompi_request_default_wait_some(size_t count,
                                   ompi_request_t ** requests,
                                   int * outcount,
                                   int * indices,
                                   ompi_status_public_t * statuses)
{
    size_t num_requests_null_inactive, num_requests_done, num_active_reqs;
    int rc = MPI_SUCCESS;
    ompi_request_t **rptr = NULL;
    ompi_request_t *request = NULL;
    ompi_wait_sync_t sync;
    size_t sync_sets = 0, sync_unsets = 0;

    if (OPAL_UNLIKELY(0 == count)) {
        *outcount = MPI_UNDEFINED;
        return OMPI_SUCCESS;
    }

  recheck:
    WAIT_SYNC_INIT(&sync, 1);

    *outcount = 0;

    rptr = requests;
    num_requests_null_inactive = 0;
    num_requests_done = 0;
    num_active_reqs = 0;
    for (size_t i = 0; i < count; i++, rptr++) {
        void *_tmp_ptr = REQUEST_PENDING;

        request = *rptr;

        /*
         * Check for null or completed persistent request.
         * For MPI_REQUEST_NULL, the req_state is always OMPI_REQUEST_INACTIVE.
         */
        if( request->req_state == OMPI_REQUEST_INACTIVE ) {
            num_requests_null_inactive++;
            continue;
        }
        indices[num_active_reqs] = OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, &sync);
        if( !indices[num_active_reqs] ) {
            /* If the request is completed go ahead and mark it as such */
            if( REQUEST_COMPLETE(request) ) {
                num_requests_done++;
            }
        }
#if OPAL_ENABLE_FT_MPI
        if(OPAL_UNLIKELY( ompi_request_is_failed(request) )) {
            num_requests_done++;
            continue;
        }
#endif /* OPAL_ENABLE_FT_MPI */
        num_active_reqs++;
    }

    if(num_requests_null_inactive == count) {
        *outcount = MPI_UNDEFINED;
        /* Nobody will signal us */
        WAIT_SYNC_RELEASE_NOWAIT(&sync);
        return rc;
    }

    sync_sets = num_active_reqs - num_requests_done;
    if( 0 == num_requests_done ) {
        /* One completed request is enough to satisfy the "some" condition */
        SYNC_WAIT(&sync);
    }

    /* Do the final counting and clean up the synchronization primitives */
    rptr = requests;
    num_requests_done = 0;
    num_active_reqs = 0;
    for (size_t i = 0; i < count; i++, rptr++) {
        void *_tmp_ptr = &sync;

        request = *rptr;

        if( request->req_state == OMPI_REQUEST_INACTIVE ) {
            continue;
        }
        /* Here we have 3 possibilities:
         * a) request was found completed in the first loop
         *    => ( indices[i] == 0 )
         * b) request was completed between first loop and this check
         *    => ( indices[i] == 1 ) and we can NOT atomically mark the
         *    request as pending.
         * c) request wasn't finished yet
         *    => ( indices[i] == 1 ) and we CAN atomically mark the
         *    request as pending.
         * NOTE that in any case (i >= num_requests_done), as the latter grows
         * either slowly (in case of partial completion)
         * OR in parallel with `i` (in case of full set completion).
         */
        if( !indices[num_active_reqs] ) {
            indices[num_requests_done++] = i;
        } else if( !OPAL_ATOMIC_COMPARE_EXCHANGE_STRONG_PTR(&request->req_complete, &_tmp_ptr, REQUEST_PENDING) ) {
            indices[num_requests_done++] = i;
        }
#if OPAL_ENABLE_FT_MPI
        /* Special case for MPI_ANY_SOURCE - Error managed below */
        else if(OPAL_UNLIKELY( ompi_request_is_failed(request) &&
                               MPI_ERR_PROC_FAILED_PENDING == request->req_status.MPI_ERROR )) {
            indices[num_requests_done++] = i;
        }
#endif /* OPAL_ENABLE_FT_MPI */
        num_active_reqs++;
    }
    sync_unsets = num_active_reqs - num_requests_done;

    if( sync_sets == sync_unsets ) {
        /* Nobody knows about us,
         * set the signal-in-progress flag to false */
        WAIT_SYNC_SIGNALLED(&sync);
    }

    WAIT_SYNC_RELEASE(&sync);

    /* Error path: no requests are done because the sync got triggered.
     * We have nothing more to do here besides rearming the sync and trying
     * again. */
    if(OPAL_UNLIKELY( 0 == num_requests_done )) {
        assert(OMPI_SUCCESS != sync.status);
        goto recheck;
    }

    *outcount = num_requests_done;

    for (size_t i = 0; i < num_requests_done; i++) {
        request = requests[indices[i]];
#if OPAL_ENABLE_FT_MPI
        /* Special case for MPI_ANY_SOURCE */
        if( MPI_ERR_PROC_FAILED_PENDING == request->req_status.MPI_ERROR ) {
            rc = MPI_ERR_IN_STATUS;
            if (MPI_STATUSES_IGNORE != statuses) {
                OMPI_COPY_STATUS(&statuses[i], request->req_status, true);
                statuses[i].MPI_ERROR = MPI_ERR_PROC_FAILED_PENDING;
            } else {
                if( (MPI_ERR_PROC_FAILED == request->req_status.MPI_ERROR) ||
                    (MPI_ERR_REVOKED == request->req_status.MPI_ERROR) ) {
                    rc = request->req_status.MPI_ERROR;
                }
            }
            continue;
        }
#endif /* OPAL_ENABLE_FT_MPI */
        assert( REQUEST_COMPLETE(request) );
        /* Per note above, we have to call the gen request query_fn even
           if STATUS_IGNORE was provided */
        if (OMPI_REQUEST_GEN == request->req_type) {
            ompi_grequest_invoke_query(request, &request->req_status);
        }
        if (MPI_STATUSES_IGNORE != statuses) {
            OMPI_COPY_STATUS(&statuses[i], request->req_status, true);
        }
        if (MPI_SUCCESS != request->req_status.MPI_ERROR) {
            rc = MPI_ERR_IN_STATUS;
        }
        if( request->req_persistent ) {
            request->req_state = OMPI_REQUEST_INACTIVE;
        } else {
            /* Only free the request if there was no error */
            if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
                int tmp;
                tmp = ompi_request_free(&(requests[indices[i]]));
                if (OMPI_SUCCESS != tmp) {
                    return tmp;
                }
            }
        }
    }
    return rc;
}
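
/*
 * For reference, a minimal caller-side sketch (the identifiers buf0, buf1,
 * n, tag and comm below are illustrative only): the public MPI wait calls
 * normally reach the functions above through the ompi_request_functions
 * table, e.g. MPI_Waitall() -> ompi_request_wait_all ->
 * ompi_request_default_wait_all(), assuming no component has replaced the
 * defaults.
 *
 *     MPI_Request reqs[2];
 *     MPI_Status  stats[2];
 *     MPI_Irecv(buf0, n, MPI_INT, 0, tag, comm, &reqs[0]);
 *     MPI_Irecv(buf1, n, MPI_INT, 1, tag, comm, &reqs[1]);
 *     MPI_Waitall(2, reqs, stats);   // blocks until both receives complete
 */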