/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2020 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2013      Los Alamos National Security, LLC. All Rights
 *                         reserved.
 * Copyright (c) 2015-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2017      IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#include "mpi.h"
#include "opal/util/bit_ops.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "coll_base_topo.h"
#include "coll_base_util.h"

/**
 * A quick version of the MPI_Sendreceive implemented for the barrier.
 * No actual data is moved across the wire, we use 0-byte messages to
 * signal a two peer synchronization.
 */
static inline int
ompi_coll_base_sendrecv_zero( int dest, int stag,
                              int source, int rtag,
                              MPI_Comm comm )
{
    int rc, line = 0;
    ompi_request_t *req = MPI_REQUEST_NULL;
    ompi_status_public_t status;

    /* post new irecv */
    rc = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
                             comm, &req ));
    if( MPI_SUCCESS != rc ) {
        line = __LINE__;
        goto error_handler;
    }

    /* send data to children */
    rc = MCA_PML_CALL(send( NULL, 0, MPI_BYTE, dest, stag,
                            MCA_PML_BASE_SEND_STANDARD, comm ));
    if( MPI_SUCCESS != rc ) {
        line = __LINE__;
        goto error_handler;
    }

    rc = ompi_request_wait( &req, &status );
    if( MPI_SUCCESS != rc ) {
        line = __LINE__;
        goto error_handler;
    }

    return (MPI_SUCCESS);

 error_handler:
    if( MPI_REQUEST_NULL != req ) {  /* cancel and complete the receive request */
#if OPAL_ENABLE_FT_MPI
        if( MPI_ERR_PROC_FAILED == req->req_status.MPI_ERROR
         || MPI_ERR_PROC_FAILED_PENDING == req->req_status.MPI_ERROR
         || MPI_ERR_REVOKED == req->req_status.MPI_ERROR ) {
            /* We cannot just 'free' and forget, as the PML/BTLS would still
             * be updating the request buffer after we return from the MPI
             * call!
             * For other errors that do not have a well defined post-error
             * behavior, calling the cancel/wait could deadlock, so we just
             * free, as this is the best that can be done in this case. */
            ompi_request_cancel(req);
            ompi_request_wait(&req, MPI_STATUS_IGNORE);
            if( MPI_ERR_PROC_FAILED_PENDING == rc ) {
                rc = MPI_ERR_PROC_FAILED;
            }
        }
        else /* this 'else' intentionally spills outside the ifdef */
#endif /* OPAL_ENABLE_FT_MPI */
        ompi_request_free(&req);
    }

    OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
                  __FILE__, line, rc));
    (void)line;  // silence compiler warning
    return rc;
}
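/*
 * Typical usage (a sketch of the pattern used by the algorithms below;
 * 'peer' stands for whatever partner rank an algorithm selects):
 *
 *     err = ompi_coll_base_sendrecv_zero(peer, MCA_COLL_BASE_TAG_BARRIER,
 *                                        peer, MCA_COLL_BASE_TAG_BARRIER,
 *                                        comm);
 *
 * Because the helper waits on its own zero-byte receive, it cannot return
 * before the matching message from the peer has arrived, i.e. before the
 * peer has reached its side of the synchronization.
 */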
/*
 * Barrier is meant to be a synchronous operation: some BTLs can mark a
 * request as done before it is passed to the NIC, and progress might not be
 * made elsewhere, so we cannot allow a process to exit the barrier until its
 * last [round of] sends are completed.
 *
 * It is the last round of sends rather than the 'last' individual send,
 * because each pair of peers can use different channels/devices/BTLs and the
 * receiver of one of these sends might be forced to wait if the sender
 * leaves the collective and does not make progress until the next MPI call.
 */

/*
 * Simple double ring version of barrier
 *
 * The synchronous guarantee is provided by the last ring of sends, which
 * are synchronous.
 */
int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
                                            mca_coll_base_module_t *module)
{
    int rank, size, err = 0, line = 0, left, right;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return OMPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_doublering rank %d", rank));

    left = ((size+rank-1)%size);
    right = ((rank+1)%size);

    if (rank > 0) { /* receive message from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Send message to the right */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_STANDARD, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    /* root needs to receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post receive from the left */
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    /* send message to the right one */
    err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, right,
                            MCA_COLL_BASE_TAG_BARRIER,
                            MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
    if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

    /* rank 0 post receive from the last node */
    if (rank == 0) {
        err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, left,
                                MCA_COLL_BASE_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE));
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}
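/*
 * Rough trace of the double ring for a hypothetical 4-process communicator
 * (ranks 0..3 arranged on a ring):
 *
 *   ring 1: 0 -> 1 -> 2 -> 3 -> 0   each rank forwards only after hearing
 *                                   from its left neighbor, so rank 0 learns
 *                                   that every rank has entered the barrier
 *   ring 2: 0 -> 1 -> 2 -> 3 -> 0   the second token releases each rank;
 *                                   the synchronous sends keep a rank inside
 *                                   the barrier until its final send has
 *                                   been matched by the receiver
 */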
/*
 * To make synchronous, uses sync sends and sync sendrecvs
 */
int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *comm,
                                                   mca_coll_base_module_t *module)
{
    int rank, size, adjsize, err, line, mask, remote;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return OMPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_recursivedoubling rank %d", rank));

    /* adjsize is the largest power of two not larger than size */
    adjsize = opal_next_poweroftwo(size);
    adjsize >>= 1;

    /* if size is not an exact power of two, perform an extra step */
    if (adjsize != size) {
        if (rank >= adjsize) {
            /* send message to lower ranked node */
            remote = rank - adjsize;
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                               remote, MCA_COLL_BASE_TAG_BARRIER,
                                               comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        } else if (rank < (size - adjsize)) {
            /* receive message from higher ranked node */
            err = MCA_PML_CALL(recv((void*)NULL, 0, MPI_BYTE, rank+adjsize,
                                    MCA_COLL_BASE_TAG_BARRIER, comm,
                                    MPI_STATUS_IGNORE));
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        }
    }

    /* exchange messages */
    if ( rank < adjsize ) {
        mask = 0x1;
        while ( mask < adjsize ) {
            remote = rank ^ mask;
            mask <<= 1;
            if (remote >= adjsize) continue;

            /* exchange zero-byte messages with the remote node */
            err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                               remote, MCA_COLL_BASE_TAG_BARRIER,
                                               comm);
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        }
    }

    /* non-power of 2 case */
    if (adjsize != size) {
        if (rank < (size - adjsize)) {
            /* send release message to higher ranked node */
            remote = rank + adjsize;
            err = MCA_PML_CALL(send((void*)NULL, 0, MPI_BYTE, remote,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_SYNCHRONOUS, comm));
            if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}
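/*
 * Example of the recursive doubling exchange for a hypothetical 6-process
 * communicator (adjsize = 4):
 *
 *   step 0: ranks 4 and 5 notify ranks 0 and 1 and then block inside
 *           ompi_coll_base_sendrecv_zero() waiting to be released;
 *   step 1: ranks 0-3 exchange zero-byte messages with partner rank^1,
 *           then with partner rank^2 (log2(adjsize) rounds in total);
 *   step 2: ranks 0 and 1 send a synchronous release message to ranks 4
 *           and 5, completing the barrier for all six processes.
 */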
/*
 * To make synchronous, uses sync sends and sync sendrecvs
 */
int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    int rank, size, distance, to, from, err, line = 0;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_bruck rank %d", rank));

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) {
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send to rank+distance, receive from rank-distance */
        err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
                                           from, MCA_COLL_BASE_TAG_BARRIER,
                                           comm);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
    }

    return MPI_SUCCESS;

 err_hndl:
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank));
    (void)line;  // silence compiler warning
    return err;
}

/*
 * To make synchronous, uses sync sends and sync sendrecvs
 */
/* special case for two processes */
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
{
    int remote, size, err;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    if( 2 != size ) {
        return MPI_ERR_UNSUPPORTED_OPERATION;
    }

    remote = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_two_procs rank %d", remote));

    remote = (remote + 1) & 0x1;

    err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
                                       remote, MCA_COLL_BASE_TAG_BARRIER,
                                       comm);
    return (err);
}


/*
 * Linear functions are copied from the BASIC coll module.
 * They do not segment the message and are simple implementations,
 * but for some small number of nodes and/or small data sizes they
 * are just as fast as base/tree based segmenting operations,
 * and as such may be selected by the decision functions.
 * These are copied into this module due to the way we select modules
 * in V1, i.e. in V2 we will handle this differently and so will not
 * have to duplicate code.
 * GEF Oct05 after asking Jeff.
 */

/* copied function (with appropriate renaming) starts here */

int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
                                              mca_coll_base_module_t *module)
{
    int i, err, rank, size, line;
    ompi_request_t** requests = NULL;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);

    /* All non-root ranks send & receive a zero-length message. */
    if (rank > 0) {
        err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) {
            line = __LINE__; goto err_hndl;
        }

        err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            line = __LINE__; goto err_hndl;
        }
    }

    /* The root collects and broadcasts the messages. */
    else {
        requests = ompi_coll_base_comm_get_reqs(module->base_data, size);
        if( NULL == requests ) {
            err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     MCA_COLL_BASE_TAG_BARRIER, comm,
                                     &(requests[i])));
            if (MPI_SUCCESS != err) {
                line = __LINE__; goto err_hndl;
            }
        }
        err = ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
        if (MPI_SUCCESS != err) {
            line = __LINE__; goto err_hndl;
        }
        requests = NULL;  /* we're done, the requests array is clean */

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) {
                line = __LINE__; goto err_hndl;
            }
        }
    }

    /* All done */
    return MPI_SUCCESS;

 err_hndl:
    if( NULL != requests ) {
        /* find a real error code */
        if (MPI_ERR_IN_STATUS == err) {
            for( i = 0; i < size; i++ ) {
                if (MPI_REQUEST_NULL == requests[i]) continue;
                if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue;
                if (requests[i]->req_status.MPI_ERROR != MPI_SUCCESS) {
                    err = requests[i]->req_status.MPI_ERROR;
                    break;
                }
            }
        }
        ompi_coll_base_free_reqs(requests, size);
#if OPAL_ENABLE_FT_MPI
        if( MPI_ERR_PROC_FAILED_PENDING == err ) {
            /* do not return any-source errors from a collective */
            err = MPI_ERR_PROC_FAILED;
        }
#endif /* OPAL_ENABLE_FT_MPI */
    }
    OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,
                  "%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank) );
    (void)line;  // silence compiler warning
    return err;
}
/* copied function (with appropriate renaming) ends here */

/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.
 */
int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module)
{
    int rank, size, depth, err, jump, partner;

    size = ompi_comm_size(comm);
    if( 1 == size )
        return MPI_SUCCESS;
    rank = ompi_comm_rank(comm);
    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
                 "ompi_coll_base_barrier_intra_tree %d", rank));

    /* Find the smallest power of 2 greater than or equal to the communicator size. */
    depth = opal_next_poweroftwo_inclusive(size);

    /* fan-in: each rank reports to a lower ranked partner */
    for (jump = 1; jump < depth; jump <<= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    /* fan-out: walk back down the tree to release the waiting ranks */
    depth >>= 1;
    for (jump = depth; jump > 0; jump >>= 1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER,
                                         MCA_PML_BASE_SEND_STANDARD, comm));
                if (MPI_SUCCESS != err)
                    return err;
            } else if (partner < rank) {
                err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, partner,
                                         MCA_COLL_BASE_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE));
                if (MPI_SUCCESS != err)
                    return err;
            }
        }
    }

    return MPI_SUCCESS;
}
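/*
 * Example of the tree barrier for a hypothetical 8-process communicator:
 *
 *   fan-in  (jump = 1, 2, 4): 1->0, 3->2, 5->4, 7->6;  2->0, 6->4;  4->0
 *   fan-out (jump = 4, 2, 1): 0->4;  0->2, 4->6;  0->1, 2->3, 4->5, 6->7
 *
 * The fan-out only starts once rank 0 has heard (directly or transitively)
 * from every other rank, so a process receiving its fan-out message knows
 * that all processes have entered the barrier.
 */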