#
# Copyright (c) 2020      The University of Tennessee and The University
#                         of Tennessee Research Foundation.  All rights
#                         reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# An Aggregate MCA Parameter Set to set up an environment that can support
# User-Level Failure Mitigation (ULFM) fault tolerance (must also be
# compiled in with --with-ft=mpi).
#
# Usage:
#   shell$ mpirun --tune ft-mpi ./app
#
# Format: one "key=value" assignment per line; lines starting with "#" are
# comments. Assignments that are commented out below are documented
# defaults or alternatives and are intentionally left inactive.
#

mpi_ft_enable=true

# Since failures are expected, reduce the verbosity of the transport errors
btl_base_warn_peer_error=false

#
# Performance tuning parameters (default shown)
# By default the PRTE failure detector is used (see README.ULFM.md)
#mpi_ft_detector=false
#mpi_ft_detector_thread=false
#mpi_ft_detector_rdma_heartbeat=false
#mpi_ft_detector_period=3.
#mpi_ft_detector_timeout=10.

#
#
# Select only ULFM ready components
# disabling non-tested and known broken components in FT-MPI builds
#

#
# The following frameworks/components are TESTED
# They handle faults and should be preferred when running with FT.
#   pml     ob1
#   btl     tcp, self, sm(+xpmem,+cma), ugni, uct
#   coll    base/basic, tuned, ftagree, libnbc
pml=ob1
threads=pthreads

#
# The following frameworks/components are UNTESTED, but **may** work.
# They should run without faults, and **may** work with faults.
# You may try and report if successful.
#   btl     ofi, portals4, smcuda, usnic, sm(+knem)
#   coll    inter, sm, sync, cuda, monitoring
#   pml     monitoring, v/vprotocol
# We will disable only the components for which good components are known to exist.
btl=^usnic
# older versions of xpmem generate bus errors when the other end is dead.
#btl_sm_single_copy_mechanism=cma

#
# The following frameworks/components are UNTESTED, and probably won't work.
# They should run without faults, and will probably crash/deadlock after a fault.
# You may try at your own risk.
#   coll    hcoll, portals4
#   topo    (all)
#   osc     (all)
#   io      (all)
#   fcoll   (all)
#   fbtl    (all)
# We will disable only the components for which good components are known to exist.
# Other untested components are selectable but will issue a runtime warning at
# initialization if FT is enabled.
coll=^hcoll,portals4

#
# The following frameworks/components are NOT WORKING. Do not enable these with FT.
#   mtl     (all)
#   pml     cm, crcpw, ucx
mtl=^ofi,portals4,psm2
# already enforced by pml=ob1 above
#pml=^cm,crcpw,ucx
# already enforced by threads=pthreads above
#threads=^argobots,qthreads

# There is a bug in libevent with the "select" backend that causes an infinite loop
# when an unplanned disconnect happens. Use something else, or bail.
opal_event_include=epoll,devpoll,kqueue,evport,poll