/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2006 The Regents of the University of California. * All rights reserved. * Copyright (c) 2012 Sandia National Laboratories. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2017 Intel, Inc. All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ /** * @file * * Matching Transport Layer * * The Matching Transport Layer (MTL) provides device-layer support * for transfer of MPI point-to-point messages over devices that * support hardware / library message matching. This layer is used * with the MTL PML component to provide lowest latency and highest * bandwidth on given architectures. Features found in other PML * interfaces, such as message fragmenting, multi-device support, and * NIC failover are not provided by the upper layers. * * In general, this interface should not be used for transport layer * support. Instead, the BTL interface should be used. The BTL * interface allows for multiplexing between multiple users * (point-to-point, one-sided, etc.) and provides many features not * found in this interface (RDMA from arbitrary buffers, active * messaging, reasonable pinned memory caching, etc.) */ #ifndef OMPI_MTL_H #define OMPI_MTL_H #include "ompi_config.h" #include "mpi.h" /* needed for MPI_ANY_TAG */ #include "ompi/mca/mca.h" #include "ompi/mca/pml/pml_constants.h" /* for send_mode enum */ #include "ompi/request/request.h" BEGIN_C_DECLS struct ompi_request_t; struct opal_convertor_t; struct mca_mtl_base_module_t; struct mca_mtl_request_t { /** pointer to associated ompi_request_t */ struct ompi_request_t *ompi_req; void (*completion_callback)(struct mca_mtl_request_t* mtl_request); }; typedef struct mca_mtl_request_t mca_mtl_request_t; /** * MTL module flags */ #define MCA_MTL_BASE_FLAG_REQUIRE_WORLD 0x00000001 #define MCA_MTL_BASE_FLAG_ACCELERATOR_INIT_DISABLE 0x00000002 #define MCA_MTL_BASE_FLAG_SUPPORTS_EXT_CID 0x00000004 /** * Initialization routine for MTL component * * Initialization routine for MTL component. This function should * allocate resources for communication and try to do all local setup. * It should not attempt to contact it's peers, as that should be * done at add_procs time. Contact information should be published * during this initialization function. It will be made available * during add_procs(). * * @param enable_progress_threads (IN) Progress threads have been * enabled by the user and the component must be * capable of making asynchronous progress (either * with its own thread, with the kernel, or with * the event library. * @param enable_mpi_threads (IN) MPI threads have been enabled by the * user and the component must be capable of coping * with threads. If the component can cope with * MPI_THREAD_MULTIPLE, enable_mpi_thread_multiple * should be set to true. Otherwise, it is assumed * that only THREAD_FUNNELLED and THREAD_SERIALIZED * can be used. * @param enable_mpi_thread_multiple (OUT) Component does / does not * support MPI_THREAD_MULTIPLE. This variable only * needs to be set if enable_mpi_threads is true. * Otherwise, the return value will be ignored. * @param accelerator_support (OUT) Component does / does not support * direct transfers with an accelerator buffer. * * @retval NULL component can not operate on the current machine * @retval non-NULL component interface function */ typedef struct mca_mtl_base_module_t* (*mca_mtl_base_component_init_fn_t)(bool enable_progress_threads, bool enable_mpi_threads, bool *accelerator_support); struct mca_mtl_base_component_2_0_0_t { mca_base_component_t mtl_version; mca_base_component_data_t mtl_data; mca_mtl_base_component_init_fn_t mtl_init; bool accelerator_support; }; typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_2_0_0_t; typedef struct mca_mtl_base_component_2_0_0_t mca_mtl_base_component_t; /** * MCA->MTL Clean up any resources held by MTL module * * Opposite of module_init. Called when communication will no longer * be necessary. Usually this is during MPI_FINALIZE, but it can be * earlier if the component was not selected to run. Assuming * module_init was called, finalize will always be called before the * component_close function is called. * * @param mtl (IN) MTL module returned from call to initialize * * @retval OMPI_SUCCESS cleanup finished successfully * @retval other failure during cleanup * */ typedef int (*mca_mtl_base_module_finalize_fn_t)(struct mca_mtl_base_module_t* mtl); /** * PML->MTL notification of change in the process list. * * The mca_mtl_base_module_add_procs_fn_t() is used by the PML to * notify the MTL that new processes are connected to the current * process. Any addressing information exported by the peer via the * ompi_modex_send() function should be available during this * call via the corresponding ompi_modex_recv() function. The * MTL may utilize this information to determine reachability of each * peer process. * * It is an error for a proc to not be reachable by the given MTL, and * an error should be returned if that case is detected. If a MTL * requires per-endpoint data, it must handle storage, either using a * static endpoint tag (MTL is the default tag that should generally * be used) or a dynamic endpoint tag (although it should be noted * that OMPI can be built without dynamic endpoint tag support). * * @param mtl (IN) MTL module * @param nprocs (IN) Number of processes * @param procs (IN) Set of processes * * @retval OMPI_SUCCESS successfully connected to processes * @retval other failure during setup */ typedef int (*mca_mtl_base_module_add_procs_fn_t)( struct mca_mtl_base_module_t* mtl, size_t nprocs, struct ompi_proc_t** procs); /** * Notification of change to the process list. * * When the process list changes, the PML notifies the MTL of the * change, to provide the opportunity to cleanup or release any * resources associated with the peer. The MTL is responsible for * releasing any memory associated with the endpoint data it may have * stored during add_procs(). * * @param mtl (IN) MTL module * @param nprocs (IN) Number of processes * @param proc (IN) Set of processes * @param peer (IN) Set of peer addressing information. * * @return Status indicating if cleanup was successful */ typedef int (*mca_mtl_base_module_del_procs_fn_t)( struct mca_mtl_base_module_t* mtl, size_t nprocs, struct ompi_proc_t** procs); /** * Blocking send to peer * * Blocking send (Call should not return until the user buffer may be * used again). Standard MPI semantics must be met by this call, as * mandated in the mode argument. There is one special mode argument, * MCA_PML_BASE_SEND_COMPLETE, which requires local completion before * the function can return. This is an optimization for coillective * routines that can otherwise lead to degenerate performance for * broadcast-based collectives. * * @param comm (IN) Communicator used for operation * @param dest (IN) Destination rank for send (relative to comm) * @param tag (IN) MPI tag used for sending. See note below. * @param convertor (IN) Datatype convertor describing send datatype. * Already prepared for send. * @param mode (IN) Mode for send operation * * @return OMPI_SUCCESS or error value * * \note Open MPI is built around non-blocking operations. This * function is provided for networks where progressing events outside * of point-to-point (for example, collectives, I/O, one-sided) can * occur without a progress function regularly being triggered. * * \note While MPI does not allow users to specify negative tags, they * are used internally in Open MPI to provide a unique channel for * collective operations. Therefore, the MTL can *not* cause an error * if a negative tag is used. */ typedef int (*mca_mtl_base_module_send_fn_t)( struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t *comm, int dest, int tag, struct opal_convertor_t *convertor, mca_pml_base_send_mode_t mode); /** * Non-blocking send to peer * * Non-blocking send to peer. Standard MPI semantics must be met by * this call, as mandated in the mode argument. There is one special * mode argument, MCA_PML_BASE_SEND_COMPLETE, which requires local * completion before the request is marked as complete. * * The PML will handle creation of the request, leaving the number of * bytes requested in the module structure available for the MTL * directly after the ompi_request_t structure. The PML will handle * proper destruction of the request once it can safely be destructed * (it has been completed and freed by a call to REQUEST_FReE or * TEST/WAIT). The MTL should remove all resources associated with * the request when it is marked as completed. * * @param comm (IN) Communicator used for operation * @param dest (IN) Destination rank for send (relative to comm) * @param tag (IN) MPI tag used for sending. See note below. * @param convertor (IN) Datatype convertor describing send datatype. * Already prepared for send. * @param mode (IN) Mode for send operation (see pml.h) * @param blocking (IN) True if the call originated from a blocking * call, but the PML decided to use a * non-blocking operation, likely for * internal performance decisions This is an * optimization flag and is not needed for * correctness. * @param mtl_request (IN) Pointer to mtl_request. The ompi_req field * will be populated with an initialized * ompi_request_t before calling. * * @return OMPI_SUCCESS or error value * * \note While MPI does not allow users to specify negative tags, they * are used internally in Open MPI to provide a unique channel for * collective operations. Therefore, the MTL can *not* cause an error * if a negative tag is used. */ typedef int (*mca_mtl_base_module_isend_fn_t)( struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t *comm, int dest, int tag, struct opal_convertor_t *convertor, mca_pml_base_send_mode_t mode, bool blocking, mca_mtl_request_t *mtl_request); /** * Non-blocking receive * * Non-blocking receive function. Standard MPI semantics for * MPI_Irecv must be implemented by this call. * * The PML will handle creation of the request, leaving the number of * bytes requested in the module structure available for the MTL, * directly after the ompi_request_t structure. The PML will handle * proper destruction of the request once it can safely be destroyed * (it has been completed and free'ed by a call to REQUEST_FREE or * TEST/WAIT). The MTL should remove all resources associated with * the request when it is marked as completed. * * @param comm (IN) Communicator used for operation * @param src (IN) Source rank for send (relative to comm) * @param tag (IN) MPI tag used for sending. See note below. * @param convertor (IN) Datatype convertor describing receive datatype. * Already prepared for receive. * @param mtl_request (IN) Pointer to mtl_request. The ompi_req field * will be populated with an initialized * ompi_request_t before calling. * * @return OMPI_SUCCESS or error value * * \note While MPI does not allow users to specify negative tags, they * are used internally in Open MPI to provide a unique channel for * collective operations. Therefore, the MTL can *not* cause an error * if a negative tag is used. Further, MPI_ANY_TAG should *not* match * against negative tags. */ typedef int (*mca_mtl_base_module_irecv_fn_t)( struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t *comm, int src, int tag, struct opal_convertor_t *convertor, struct mca_mtl_request_t *mtl_request); /** * Non-blocking probe * * Non-blocking probe function. Standard MPI semantics for MPI_IPROBE * must be implemented by this call. * * @param comm (IN) Communicator used for operation * @param src (IN) Source rank for send (relative to comm) * @param tag (IN) MPI tag used for sending. See note below. * @param flag (OUT) true if message available, false otherwise * @param status (OUT) Status structure for information on * available message * * \note While MPI does not allow users to specify negative tags, they * are used internally in Open MPI to provide a unique channel for * collective operations. Therefore, the MTL can *not* cause an error * if a negative tag is used. Further, MPI_ANY_TAG should *not* match * against negative tags. */ typedef int (*mca_mtl_base_module_iprobe_fn_t)( struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t *comm, int src, int tag, int *flag, struct ompi_status_public_t *status); typedef int (*mca_mtl_base_module_imrecv_fn_t)(struct mca_mtl_base_module_t* mtl, struct opal_convertor_t *convertor, struct ompi_message_t **message, struct mca_mtl_request_t *mtl_request); typedef int (*mca_mtl_base_module_improbe_fn_t)(struct mca_mtl_base_module_t *mtl, struct ompi_communicator_t *comm, int src, int tag, int *matched, struct ompi_message_t **message, struct ompi_status_public_t *status); /** * Cancel an existing request * * Attempt to cancel an existing request. The (poorly defined) * semantics for MPI_CANCEL must be implemented by this call. This, * of course, allows the MTL module to do nothing at all. * Implementations of the MTL should make a good faith effort to * cancel receive requests that have not been started, as the "post a * receive for control messages" paradigm is a common one in loosely * coupled MPI applications. * * @param request(IN) Request that should be cancelled * @param flag Unknown exactly what this does. * */ typedef int (*mca_mtl_base_module_cancel_fn_t)( struct mca_mtl_base_module_t* mtl, mca_mtl_request_t *mtl_request, int flag); /** * Downcall from PML layer when a new communicator is created. * * @param comm Communicator * @return OMPI_SUCCESS or failure status. * * Provides the MTL the opportunity to initialize/cache a data structure * on the communicator. */ typedef int (*mca_mtl_base_module_add_comm_fn_t)( struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t* comm); /** * Downcall from PML layer when a communicator is destroyed. * * @param comm Communicator * @return OMPI_SUCCESS or failure status. * * Provides the MTL the opportunity to cleanup any datastructures * associated with the communicator. */ typedef int (*mca_mtl_base_module_del_comm_fn_t)( struct mca_mtl_base_module_t* mtl, struct ompi_communicator_t* comm); /** * MTL module interface functions and attributes. */ struct mca_mtl_base_module_t { int mtl_max_contextid; /**< maximum allowable contextid */ int mtl_max_tag; /**< maximum tag value. note that negative tags must be allowed */ size_t mtl_request_size; /**< number of bytes to reserve with request structure */ uint32_t mtl_flags; /**< flags (put/get...) */ /* MTL function table */ mca_mtl_base_module_add_procs_fn_t mtl_add_procs; mca_mtl_base_module_del_procs_fn_t mtl_del_procs; mca_mtl_base_module_finalize_fn_t mtl_finalize; mca_mtl_base_module_send_fn_t mtl_send; mca_mtl_base_module_isend_fn_t mtl_isend; mca_mtl_base_module_irecv_fn_t mtl_irecv; mca_mtl_base_module_iprobe_fn_t mtl_iprobe; mca_mtl_base_module_imrecv_fn_t mtl_imrecv; mca_mtl_base_module_improbe_fn_t mtl_improbe; /* Optional MTL functions */ mca_mtl_base_module_cancel_fn_t mtl_cancel; mca_mtl_base_module_add_comm_fn_t mtl_add_comm; mca_mtl_base_module_del_comm_fn_t mtl_del_comm; }; typedef struct mca_mtl_base_module_t mca_mtl_base_module_t; /* * Macro for use in modules that are of type mtl */ #define MCA_MTL_BASE_VERSION_2_0_0 \ OMPI_MCA_BASE_VERSION_2_1_0("mtl", 2, 0, 0) OMPI_DECLSPEC extern mca_mtl_base_module_t *ompi_mtl; /* * macro for doing direct call / call through struct */ #if MCA_ompi_mtl_DIRECT_CALL #define OMPI_MTL_CALL_STAMP(a, b) ompi_mtl_ ## a ## _ ## b #define OMPI_MTL_CALL_EXPANDER(a, b) OMPI_MTL_CALL_STAMP(a,b) #define OMPI_MTL_CALL(a) OMPI_MTL_CALL_EXPANDER(MCA_ompi_mtl_DIRECT_CALL_COMPONENT, a) #include MCA_ompi_mtl_DIRECT_CALL_HEADER #else #define OMPI_MTL_CALL(a) ompi_mtl->mtl_ ## a #endif END_C_DECLS #endif