/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2018 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2015-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 *
 * Copyright (c) 2018-2020 Intel, Inc.  All rights reserved.
 * Copyright (c) 2020      Amazon.com, Inc. or its affiliates.
 *                         All Rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "opal_config.h"

#include "opal/constants.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/threads/threads.h"
#include "opal/mca/timer/base/base.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_params.h"
#include "opal/runtime/opal_progress.h"
#include "opal/util/event.h"
#include "opal/util/output.h"

#define OPAL_PROGRESS_USE_TIMERS (OPAL_TIMER_CYCLE_SUPPORTED || OPAL_TIMER_USEC_SUPPORTED)

#define OPAL_PROGRESS_ONLY_USEC_NATIVE (OPAL_TIMER_USEC_NATIVE && !OPAL_TIMER_CYCLE_NATIVE)

#if OPAL_ENABLE_DEBUG
bool opal_progress_debug = false;
#endif

/*
 * default parameters
 */
static int opal_progress_event_flag = OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK;
int opal_progress_spin_count = 10000;

/*
 * Local variables
 */
static opal_atomic_lock_t progress_lock;

/* callbacks to progress */
static volatile opal_progress_callback_t *callbacks = NULL;
static size_t callbacks_len = 0;
static size_t callbacks_size = 0;

/* low-priority callbacks to progress */
static volatile opal_progress_callback_t *callbacks_lp = NULL;
static size_t callbacks_lp_len = 0;
static size_t callbacks_lp_size = 0;

/* do we want to yield() if nothing happened? */
bool opal_progress_yield_when_idle = false;

#if OPAL_PROGRESS_USE_TIMERS
static opal_timer_t event_progress_last_time = 0;
static opal_timer_t event_progress_delta = 0;
#else
/* current countdown until we tick the event library */
static opal_atomic_int32_t event_progress_counter = 0;
/* reset value for the counter when it hits 0 */
static int32_t event_progress_delta = 0;
#endif

/* users of the event library from MPI cause the event loop to be ticked
 * on every call to opal_progress() */
static opal_atomic_int32_t num_event_users = 0;

#if OPAL_ENABLE_DEBUG
static int debug_output = -1;
#endif

/**
 * Fake callback used for threading purposes when one thread progresses
 * callbacks while another unregisters some.  The root of the problem is
 * that we allow modifications of the callback array directly from the
 * callbacks themselves.  Since writing a pointer is atomic, a concurrent
 * reader always sees either a real callback or this harmless placeholder,
 * never a torn or stale pointer.
 */
static int fake_cb(void)
{
    return 0;
}
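
/*
 * Illustration (a sketch, not compiled) of why spare slots hold fake_cb
 * rather than NULL.  The interleaving below is hypothetical: a reader is
 * part way through the array when another thread unregisters an entry.
 *
 *     // reader thread, inside opal_progress(), read callbacks_len == 4
 *     for (size_t i = 0; i < 4; ++i) {
 *         events += (callbacks[i])();  // meanwhile, unregister shrank
 *     }                                // len to 3 and wrote fake_cb into
 *                                      // slot 3: the overrun call just
 *                                      // returns 0
 *
 * The wasted call costs a few cycles; a NULL slot would crash instead.
 * This is the trade-off made to keep the progress fast path lock-free.
 */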

static int _opal_progress_unregister(opal_progress_callback_t cb,
                                     volatile opal_progress_callback_t *callback_array,
                                     size_t *callback_array_len);

static void opal_progress_finalize(void)
{
    /* free memory associated with the callbacks */
    opal_atomic_lock(&progress_lock);

    callbacks_len = 0;
    callbacks_size = 0;
    free((void *) callbacks);
    callbacks = NULL;

    callbacks_lp_len = 0;
    callbacks_lp_size = 0;
    free((void *) callbacks_lp);
    callbacks_lp = NULL;

    opal_atomic_unlock(&progress_lock);
}

/* init the progress engine - called from orte_init */
int opal_progress_init(void)
{
    /* reentrant issues */
    opal_atomic_lock_init(&progress_lock, OPAL_ATOMIC_LOCK_UNLOCKED);

    /* set the event tick rate */
    opal_progress_set_event_poll_rate(10000);

#if OPAL_ENABLE_DEBUG
    if (opal_progress_debug) {
        debug_output = opal_output_open(NULL);
    }
#endif

    callbacks_size = callbacks_lp_size = 8;
    callbacks = malloc(callbacks_size * sizeof(callbacks[0]));
    callbacks_lp = malloc(callbacks_lp_size * sizeof(callbacks_lp[0]));
    if (NULL == callbacks || NULL == callbacks_lp) {
        free((void *) callbacks);
        free((void *) callbacks_lp);
        callbacks_size = callbacks_lp_size = 0;
        callbacks = callbacks_lp = NULL;
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    for (size_t i = 0; i < callbacks_size; ++i) {
        callbacks[i] = fake_cb;
    }

    for (size_t i = 0; i < callbacks_lp_size; ++i) {
        callbacks_lp[i] = fake_cb;
    }

    OPAL_OUTPUT((debug_output, "progress: initialized event flag to: %x",
                 opal_progress_event_flag));
    OPAL_OUTPUT((debug_output, "progress: initialized yield_when_idle to: %s",
                 opal_progress_yield_when_idle ? "true" : "false"));
    OPAL_OUTPUT((debug_output, "progress: initialized num users to: %d", num_event_users));
    OPAL_OUTPUT((debug_output, "progress: initialized poll rate to: %ld",
                 (long) event_progress_delta));

    opal_finalize_register_cleanup(opal_progress_finalize);

    return OPAL_SUCCESS;
}

static int opal_progress_events(void)
{
    static opal_atomic_int32_t lock = 0;
    int events = 0;

    if (opal_progress_event_flag != 0 && !OPAL_THREAD_SWAP_32(&lock, 1)) {
#if OPAL_PROGRESS_USE_TIMERS
#    if OPAL_PROGRESS_ONLY_USEC_NATIVE
        opal_timer_t now = opal_timer_base_get_usec();
#    else
        opal_timer_t now = opal_timer_base_get_cycles();
#    endif /* OPAL_PROGRESS_ONLY_USEC_NATIVE */
        /* trip the event library if we've reached our tick rate and we are enabled */
        if (now - event_progress_last_time > event_progress_delta) {
            event_progress_last_time = (num_event_users > 0) ? now - event_progress_delta : now;

            events += opal_event_loop(opal_sync_event_base, opal_progress_event_flag);
        }
#else /* OPAL_PROGRESS_USE_TIMERS */
        /* trip the event library if we've reached our tick rate and we are enabled */
        if (OPAL_THREAD_ADD_FETCH32(&event_progress_counter, -1) <= 0) {
            event_progress_counter = (num_event_users > 0) ? 0 : event_progress_delta;
            events += opal_event_loop(opal_sync_event_base, opal_progress_event_flag);
        }
#endif /* OPAL_PROGRESS_USE_TIMERS */

        lock = 0;
    }

    return events;
}
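
/*
 * A worked sketch (illustration only, not compiled) of the timer-based
 * branch above, assuming a 32-bit unsigned timer so the wraparound is
 * easy to see.  Because the values are unsigned, the difference stays
 * correct even when the counter wraps:
 *
 *     uint32_t last = 0xFFFFFFF0u;    // sampled just before wrap
 *     uint32_t now  = 0x00000010u;    // sampled just after wrap
 *     uint32_t elapsed = now - last;  // 0x20: the true elapsed ticks
 *
 * In the counter-based branch there is no clock at all: each call to
 * opal_progress_events() decrements event_progress_counter, and the
 * event loop runs only when the counter reaches zero - which is every
 * call when num_event_users > 0, since the reset value is then 0.
 */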

/*
 * Progress the event library and any functions that have registered to
 * be called.  We don't propagate errors from the progress functions, so
 * no action is taken if they return failures.  The functions are expected
 * to return the number of events they progressed, which is used to decide
 * whether or not we should yield the CPU during MPI progress.  This count
 * is only loosely tracked, as an error return can cause the number of
 * progressed events to appear lower than it actually is.  We don't care,
 * as the cost of that happening is far outweighed by the cost of the
 * extra if checks (they were resulting in bad pipe stalling behavior).
 */
int opal_progress(void)
{
    static uint32_t num_calls = 0;
    size_t i;
    int events = 0;

    /* progress all registered callbacks */
    for (i = 0; i < callbacks_len; ++i) {
        events += (callbacks[i])();
    }

    /* Run low-priority callbacks and events once every 8 calls to
     * opal_progress().  Even though num_calls can be modified by multiple
     * threads, we do not use atomic operations here, for performance
     * reasons.  In case of a race the count may be inaccurate, but since
     * it is eventually incremented, that is not a problem. */
    if (((num_calls++) & 0x7) == 0) {
        for (i = 0; i < callbacks_lp_len; ++i) {
            events += (callbacks_lp[i])();
        }

        opal_progress_events();
    } else if (num_event_users > 0) {
        opal_progress_events();
    }

    if (opal_progress_yield_when_idle && events <= 0) {
        /* If there is nothing to do - yield the processor - otherwise
         * we could consume the processor for the entire time slice.  If
         * the processor is oversubscribed - this will result in a best-case
         * latency equivalent to the time-slice.
         * With some thread implementations, yielding might be required
         * to ensure correct scheduling of all communicating threads. */
        opal_thread_yield();
    }

    return events;
}

int opal_progress_set_event_flag(int flag)
{
    int tmp = opal_progress_event_flag;
    opal_progress_event_flag = flag;

    OPAL_OUTPUT((debug_output, "progress: set_event_flag setting to %d", flag));

    return tmp;
}

void opal_progress_event_users_increment(void)
{
#if OPAL_ENABLE_DEBUG
    int32_t val;
    val = opal_atomic_add_fetch_32(&num_event_users, 1);

    OPAL_OUTPUT((debug_output, "progress: event_users_increment setting count to %d", val));
#else
    (void) opal_atomic_add_fetch_32(&num_event_users, 1);
#endif

#if OPAL_PROGRESS_USE_TIMERS
    /* force an update next round (we'll be past the delta) */
    event_progress_last_time -= event_progress_delta;
#else
    /* always reset the tick rate - can't hurt */
    event_progress_counter = 0;
#endif
}

void opal_progress_event_users_decrement(void)
{
#if OPAL_ENABLE_DEBUG || !OPAL_PROGRESS_USE_TIMERS
    int32_t val;
    val = opal_atomic_sub_fetch_32(&num_event_users, 1);

    OPAL_OUTPUT((debug_output, "progress: event_users_decrement setting count to %d", val));
#else
    (void) opal_atomic_sub_fetch_32(&num_event_users, 1);
#endif

#if !OPAL_PROGRESS_USE_TIMERS
    /* start now in delaying if it's easy */
    if (val >= 0) {
        event_progress_counter = event_progress_delta;
    }
#endif
}

bool opal_progress_set_yield_when_idle(bool yieldopt)
{
    bool tmp = opal_progress_yield_when_idle;
    opal_progress_yield_when_idle = (yieldopt) ? 1 : 0;

    OPAL_OUTPUT((debug_output, "progress: progress_set_yield_when_idle to %s",
                 opal_progress_yield_when_idle ? "true" : "false"));

    return tmp;
}
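
/*
 * Usage sketch (illustration only, not compiled).  The request_complete
 * flag is hypothetical - assume some registered callback sets it - while
 * opal_progress() and opal_progress_set_yield_when_idle() are the real
 * entry points defined in this file:
 *
 *     volatile bool request_complete = false;
 *
 *     opal_progress_set_yield_when_idle(true);  // e.g. oversubscribed node
 *     while (!request_complete) {
 *         opal_progress();  // yields whenever a pass makes no progress
 *     }
 */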
"true" : "false")); return tmp; } void opal_progress_set_event_poll_rate(int polltime) { OPAL_OUTPUT((debug_output, "progress: progress_set_event_poll_rate(%d)", polltime)); #if OPAL_PROGRESS_USE_TIMERS event_progress_delta = 0; # if OPAL_PROGRESS_ONLY_USEC_NATIVE event_progress_last_time = opal_timer_base_get_usec(); # else event_progress_last_time = opal_timer_base_get_cycles(); # endif #else event_progress_counter = event_progress_delta = 0; #endif if (polltime == 0) { #if OPAL_PROGRESS_USE_TIMERS /* user specified as never tick - tick once per minute */ event_progress_delta = 60 * 1000000; #else /* user specified as never tick - don't count often */ event_progress_delta = INT_MAX; #endif } else { #if OPAL_PROGRESS_USE_TIMERS event_progress_delta = polltime; #else /* subtract one so that we can do post-fix subtraction in the inner loop and go faster */ event_progress_delta = polltime - 1; #endif } #if OPAL_PROGRESS_USE_TIMERS && !OPAL_PROGRESS_ONLY_USEC_NATIVE /* going to use cycles for counter. Adjust specified usec into cycles */ event_progress_delta = event_progress_delta * opal_timer_base_get_freq() / 1000000; #endif } static int opal_progress_find_cb(opal_progress_callback_t cb, volatile opal_progress_callback_t *cbs, size_t cbs_len) { for (size_t i = 0; i < cbs_len; ++i) { if (cbs[i] == cb) { return (int) i; } } return OPAL_ERR_NOT_FOUND; } static int _opal_progress_register(opal_progress_callback_t cb, volatile opal_progress_callback_t **cbs, size_t *cbs_size, size_t *cbs_len) { int ret = OPAL_SUCCESS; if (OPAL_ERR_NOT_FOUND != opal_progress_find_cb(cb, *cbs, *cbs_len)) { return OPAL_SUCCESS; } /* see if we need to allocate more space */ if (*cbs_len + 1 > *cbs_size) { opal_progress_callback_t *tmp, *old; tmp = (opal_progress_callback_t *) malloc(sizeof(tmp[0]) * 2 * *cbs_size); if (tmp == NULL) { return OPAL_ERR_TEMP_OUT_OF_RESOURCE; } if (*cbs) { /* copy old callbacks */ memcpy(tmp, (void *) *cbs, sizeof(tmp[0]) * *cbs_size); } for (size_t i = *cbs_len; i < 2 * *cbs_size; ++i) { tmp[i] = fake_cb; } opal_atomic_wmb(); /* swap out callback array */ old = (opal_progress_callback_t *) opal_atomic_swap_ptr((opal_atomic_intptr_t *) cbs, (intptr_t) tmp); opal_atomic_wmb(); free(old); *cbs_size *= 2; } cbs[0][*cbs_len] = cb; ++*cbs_len; opal_atomic_wmb(); return ret; } int opal_progress_register(opal_progress_callback_t cb) { int ret; opal_atomic_lock(&progress_lock); (void) _opal_progress_unregister(cb, callbacks_lp, &callbacks_lp_len); ret = _opal_progress_register(cb, &callbacks, &callbacks_size, &callbacks_len); opal_atomic_unlock(&progress_lock); return ret; } int opal_progress_register_lp(opal_progress_callback_t cb) { int ret; opal_atomic_lock(&progress_lock); (void) _opal_progress_unregister(cb, callbacks, &callbacks_len); ret = _opal_progress_register(cb, &callbacks_lp, &callbacks_lp_size, &callbacks_lp_len); opal_atomic_unlock(&progress_lock); return ret; } static int _opal_progress_unregister(opal_progress_callback_t cb, volatile opal_progress_callback_t *callback_array, size_t *callback_array_len) { int ret = opal_progress_find_cb(cb, callback_array, *callback_array_len); if (OPAL_ERR_NOT_FOUND == ret) { return ret; } /* If we found the function we're unregistering: If callbacks_len is 0, we're not goig to do anything interesting anyway, so skip. If callbacks_len is 1, it will soon be 0, so no need to do any repacking. */ for (size_t i = (size_t) ret; i < *callback_array_len - 1; ++i) { /* copy callbacks atomically since another thread may be in * opal_progress(). 

static int _opal_progress_unregister(opal_progress_callback_t cb,
                                     volatile opal_progress_callback_t *callback_array,
                                     size_t *callback_array_len)
{
    int ret = opal_progress_find_cb(cb, callback_array, *callback_array_len);
    if (OPAL_ERR_NOT_FOUND == ret) {
        return ret;
    }

    /* We found the function we are unregistering.  If callback_array_len
     * is 0 we are not going to do anything interesting anyway, so skip.
     * If callback_array_len is 1 it will soon be 0, so no repacking is
     * needed. */
    for (size_t i = (size_t) ret; i < *callback_array_len - 1; ++i) {
        /* copy callbacks atomically since another thread may be in
         * opal_progress(). */
        (void) opal_atomic_swap_ptr((opal_atomic_intptr_t *) (callback_array + i),
                                    (intptr_t) callback_array[i + 1]);
    }

    --*callback_array_len;
    callback_array[*callback_array_len] = fake_cb;

    return OPAL_SUCCESS;
}

int opal_progress_unregister(opal_progress_callback_t cb)
{
    int ret;

    opal_atomic_lock(&progress_lock);

    ret = _opal_progress_unregister(cb, callbacks, &callbacks_len);
    if (OPAL_SUCCESS != ret) {
        /* if not in the high-priority array, try to remove it from the
         * low-priority array.  A callback will never be in both. */
        ret = _opal_progress_unregister(cb, callbacks_lp, &callbacks_lp_len);
    }

    opal_atomic_unlock(&progress_lock);

    return ret;
}
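
/*
 * Worked example (illustration only) of the repacking performed in
 * _opal_progress_unregister(), using a hypothetical 4-entry array.
 * Removing the callback at index 1 shifts the tail down one slot at a
 * time via atomic pointer swaps, then backfills the last slot:
 *
 *     before:  [cb_a, cb_b, cb_c, cb_d]      len = 4, remove cb_b
 *     step 1:  [cb_a, cb_c, cb_c, cb_d]
 *     step 2:  [cb_a, cb_c, cb_d, cb_d]
 *     after:   [cb_a, cb_c, cb_d, fake_cb]   len = 3
 *
 * A thread concurrently inside opal_progress() may invoke one callback
 * twice during the shift, but it never reads a torn pointer.
 */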