/*
 * Copyright (c) 2013-2016 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2013-2018 Inria.  All rights reserved.
 * Copyright (c) 2015      Bull SAS.  All rights reserved.
 * Copyright (c) 2016-2019 Research Organization for Information Science
 *                         and Technology (RIST).  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"
#include "common_monitoring.h"
#include "common_monitoring_coll.h"
#include "ompi/constants.h"
#include "ompi/communicator/communicator.h"
#include "opal/mca/base/mca_base_component_repository.h"
#include "opal/class/opal_hash_table.h"

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*** Monitoring specific variables ***/

/* Initial size handed to opal_hash_table_init() for the per-communicator table. */
enum { MCA_MONITORING_COLL_HASH_SIZE = 2048 };

/**
 * Per-communicator record of monitored collective traffic.
 *
 * The o2a / a2o / a2a prefixes presumably stand for one-to-all, all-to-one
 * and all-to-all -- NOTE(review): confirm against the callers.
 */
struct mca_monitoring_coll_data_t {
    opal_object_t super;
    char*procs;                    /* comma separated list of world ranks, NULL until cached */
    char*comm_name;                /* copy of the communicator name, NULL until cached */
    int world_rank;                /* -1 until it has been resolved */
    int is_released;               /* set by mca_common_monitoring_coll_release() */
    ompi_communicator_t*p_comm;    /* set at creation, cleared once the name has been cached */
    opal_atomic_size_t o2a_count;
    opal_atomic_size_t o2a_size;
    opal_atomic_size_t a2o_count;
    opal_atomic_size_t a2o_size;
    opal_atomic_size_t a2a_count;
    opal_atomic_size_t a2a_size;
};

/* Collectives operation monitoring: communicator address -> coll data */
static opal_hash_table_t *comm_data = NULL;

/**
 * Turn a communicator address into the 64-bit key used in comm_data.
 *
 * The conversion goes through uintptr_t rather than reading the pointer
 * object as a uint64_t, so it stays well defined whatever the pointer width.
 */
static inline uint64_t mca_common_monitoring_coll_key(const ompi_communicator_t*comm)
{
    return (uint64_t)(uintptr_t)comm;
}

/**
 * Find the record registered for a communicator.
 *
 * @param comm  communicator whose address is used as the key
 * @param data  receives the record on success, NULL otherwise
 * @return OMPI_ERROR when the table has not been created, otherwise the
 *         status returned by opal_hash_table_get_value_uint64().
 */
static int mca_common_monitoring_coll_lookup(ompi_communicator_t*comm,
                                             mca_monitoring_coll_data_t**data)
{
    void *found = NULL;
    int ret;

    *data = NULL;
    if( NULL == comm_data ) {
        return OMPI_ERROR;  /* No hashtable */
    }
    ret = opal_hash_table_get_value_uint64(comm_data,
                                           mca_common_monitoring_coll_key(comm),
                                           &found);
    if( OPAL_SUCCESS == ret ) {
        *data = (mca_monitoring_coll_data_t*)found;
    }
    return ret;
}

/**
 * Copy the communicator name into its record and drop the communicator
 * pointer, so that the record can still be printed without the communicator.
 *
 * @param comm  communicator to look up
 * @return status of the lookup; nothing is modified unless it succeeded.
 */
int mca_common_monitoring_coll_cache_name(ompi_communicator_t*comm)
{
    mca_monitoring_coll_data_t*data;
    int ret = mca_common_monitoring_coll_lookup(comm, &data);
    if( OPAL_SUCCESS == ret ) {
        char *name = strdup(comm->c_name);
        free(data->comm_name);  /* do not leak a name cached by an earlier call */
        data->comm_name = name;
        data->p_comm = NULL;
    }
    return ret;
}

/**
 * Resolve the world rank of this process and build the list of world ranks
 * of the communicator members, unless this was already done.
 *
 * Nothing is done once the communicator pointer has been cleared, because
 * every query below needs it.
 */
static inline void mca_common_monitoring_coll_cache(mca_monitoring_coll_data_t*data)
{
    int size, world_size, entry_len, i;
    size_t capacity, pos = 0;
    char *list, *shrunk;

    if( NULL == data->p_comm ) {
        return;
    }

    if( -1 == data->world_rank ) {
        /* Get current process world_rank */
        mca_common_monitoring_get_world_rank(ompi_comm_rank(data->p_comm),
                                             data->p_comm->c_remote_group,
                                             &data->world_rank);
    }

    /* Only list procs if the hashtable is already initialized, i.e.
       if the previous call worked */
    if( -1 == data->world_rank ) {
        return;
    }
    if( NULL != data->procs && '\0' != data->procs[0] ) {
        return;  /* list already built */
    }

    size = ompi_comm_size(data->p_comm);
    world_size = ompi_comm_size((ompi_communicator_t*)&ompi_mpi_comm_world);
    assert( 0 < size );
    if( size <= 0 ) {
        return;  /* same protection when assertions are compiled out */
    }

    /* Widest possible entry: the highest world rank followed by a comma */
    entry_len = snprintf(NULL, 0, "%d,", world_size - 1);
    if( entry_len <= 0 ) {
        return;
    }
    /* One extra byte keeps room for the terminator while the last comma is still there */
    capacity = (size_t)entry_len * (size_t)size + 1;

    list = malloc(capacity);
    if( NULL == list ) {
        OPAL_MONITORING_PRINT_ERR("Cannot allocate memory for caching proc list.");
        return;
    }
    list[0] = '\0';

    /* Build procs list */
    for( i = 0; i < size; ++i ) {
        int world_rank, written;
        if( OPAL_SUCCESS != mca_common_monitoring_get_world_rank(i, data->p_comm->c_remote_group,
                                                                 &world_rank) ) {
            continue;
        }
        written = snprintf(&list[pos], capacity - pos, "%d,", world_rank);
        if( written < 0 || (size_t)written >= capacity - pos ) {
            list[pos] = '\0';  /* entry does not fit: keep what has been written so far */
            break;
        }
        pos += (size_t)written;
    }

    if( 0 == pos ) {
        /* No rank could be resolved: there is no trailing comma to remove */
        free(list);
        return;
    }
    list[pos - 1] = '\0';  /* Remove final comma */

    /* Adjust to size required; keep the larger buffer if shrinking fails */
    shrunk = realloc(list, pos);
    free(data->procs);  /* drops a previous empty string, no-op on NULL */
    data->procs = (NULL != shrunk) ? shrunk : list;
}

/**
 * Create the record of a communicator and register it in comm_data.
 *
 * The table is created on first use.
 *
 * @param comm  communicator to monitor
 * @return the new record, or NULL if it could not be allocated. The record
 *         is returned unregistered if the table cannot be set up.
 */
mca_monitoring_coll_data_t*mca_common_monitoring_coll_new( ompi_communicator_t*comm )
{
    mca_monitoring_coll_data_t*data = OBJ_NEW(mca_monitoring_coll_data_t);
    if( NULL == data ) {
        OPAL_MONITORING_PRINT_ERR("coll: new: data structure cannot be allocated");
        return NULL;
    }

    data->p_comm = comm;

    /* Allocate hashtable */
    if( NULL == comm_data ) {
        comm_data = OBJ_NEW(opal_hash_table_t);
        if( NULL == comm_data ) {
            OPAL_MONITORING_PRINT_ERR("coll: new: failed to allocate hashtable");
            return data;
        }
        if( OPAL_SUCCESS != opal_hash_table_init(comm_data, MCA_MONITORING_COLL_HASH_SIZE) ) {
            OPAL_MONITORING_PRINT_ERR("coll: new: failed to initialize hashtable");
            OBJ_RELEASE(comm_data);
            comm_data = NULL;
            return data;
        }
    }

    /* Insert in hashtable */
    if( OPAL_SUCCESS != opal_hash_table_set_value_uint64(comm_data,
                                                         mca_common_monitoring_coll_key(comm),
                                                         (void*)data) ) {
        OPAL_MONITORING_PRINT_ERR("coll: new: failed to allocate memory or "
                                  "growing the hash table");
    }

    /* Cache data so the procs can be released without affecting the output */
    mca_common_monitoring_coll_cache(data);

    return data;
}

/**
 * Mark a record as released. It is kept until the next flush has printed it.
 */
void mca_common_monitoring_coll_release(mca_monitoring_coll_data_t*data)
{
#if OPAL_ENABLE_DEBUG
    if( NULL == data ) {
        OPAL_MONITORING_PRINT_ERR("coll: release: data structure empty or already deallocated");
        return;
    }
#endif /* OPAL_ENABLE_DEBUG */

    /* not flushed yet */
    data->is_released = 1;
    mca_common_monitoring_coll_cache(data);
}

/**
 * Remove a released record from comm_data and free it.
 *
 * @param key   key under which the record is stored. It is supplied by the
 *              caller because p_comm may already have been cleared.
 * @param data  record to free
 * @return OMPI_SUCCESS when the record was removed and freed. The record is
 *         left untouched if it is not released or cannot be removed, so the
 *         table never refers to freed memory.
 */
static int mca_common_monitoring_coll_cond_release(uint64_t key, mca_monitoring_coll_data_t*data)
{
    int ret;

    if( NULL == data ) {
#if OPAL_ENABLE_DEBUG
        OPAL_MONITORING_PRINT_ERR("coll: release: data structure empty or already deallocated");
#endif /* OPAL_ENABLE_DEBUG */
        return OMPI_ERROR;
    }
    if( !data->is_released ) {
        return OMPI_ERROR;  /* the communicator is still in use */
    }

    ret = opal_hash_table_remove_value_uint64(comm_data, key);
    if( OPAL_SUCCESS != ret ) {
        return ret;
    }

    data->p_comm = NULL;
    free(data->comm_name);
    data->comm_name = NULL;
    free(data->procs);
    data->procs = NULL;
    OBJ_RELEASE(data);
    return OMPI_SUCCESS;
}

/**
 * Empty and release the table. The records themselves are not released here.
 */
void mca_common_monitoring_coll_finalize( void )
{
    if( NULL != comm_data ) {
        opal_hash_table_remove_all( comm_data );
        OBJ_RELEASE(comm_data);
    }
}

/**
 * Print one record.
 *
 * The name printed is the cached one if any, else the name of the
 * communicator if it is still attached, else "(no-name)".
 *
 * @param pf    stream to print to
 * @param data  record to print
 */
void mca_common_monitoring_coll_flush(FILE *pf, mca_monitoring_coll_data_t*data)
{
    const char *name = "(no-name)";
    /* the list may never have been built: never hand NULL to %s */
    const char *procs = (NULL != data->procs) ? data->procs : "";

    if( NULL != data->comm_name ) {
        name = data->comm_name;
    } else if( NULL != data->p_comm ) {
        name = data->p_comm->c_name;
    }

    /* Flush data */
    fprintf(pf,
            "D\t%s\tprocs: %s\n"
            "O2A\t%" PRId32 "\t%zu bytes\t%zu msgs sent\n"
            "A2O\t%" PRId32 "\t%zu bytes\t%zu msgs sent\n"
            "A2A\t%" PRId32 "\t%zu bytes\t%zu msgs sent\n",
            name, procs,
            data->world_rank, (size_t)data->o2a_size, (size_t)data->o2a_count,
            data->world_rank, (size_t)data->a2o_size, (size_t)data->a2o_count,
            data->world_rank, (size_t)data->a2a_size, (size_t)data->a2a_count);
}

/**
 * Print every record, then free the ones that have been released.
 *
 * Records are freed in a second phase, one per scan of the table, so that
 * the table is never modified while it is being iterated.
 *
 * @param pf  stream to print to
 */
void mca_common_monitoring_coll_flush_all(FILE *pf)
{
    uint64_t key;
    mca_monitoring_coll_data_t*data;

    if( NULL == comm_data ) return; /* No hashtable */

    OPAL_HASH_TABLE_FOREACH(key, uint64, data, comm_data) {
        mca_common_monitoring_coll_flush(pf, data);
    }

    /* Phase flushed -> free already released coll_data_t */
    for( ;; ) {
        mca_monitoring_coll_data_t*victim = NULL;
        uint64_t victim_key = 0;

        OPAL_HASH_TABLE_FOREACH(key, uint64, data, comm_data) {
            if( data->is_released ) {
                victim = data;
                victim_key = key;
                break;
            }
        }
        if( NULL == victim ) {
            break;  /* nothing left to free */
        }
        if( OMPI_SUCCESS != mca_common_monitoring_coll_cond_release(victim_key, victim) ) {
            break;  /* could not be removed: stop rather than scan forever */
        }
    }
}

/**
 * Set the six counters of every record back to zero.
 */
void mca_common_monitoring_coll_reset(void)
{
    if( NULL == comm_data ) return; /* No hashtable */

    uint64_t key;
    mca_monitoring_coll_data_t*data;

    OPAL_HASH_TABLE_FOREACH(key, uint64, data, comm_data) {
        data->o2a_count = 0;
        data->o2a_size  = 0;
        data->a2o_count = 0;
        data->a2o_size  = 0;
        data->a2a_count = 0;
        data->a2a_size  = 0;
    }
}

/**
 * pvar notification callback.
 *
 * Binding reports one value through count; starting and stopping switch
 * mca_common_monitoring_current_state.
 *
 * @return OMPI_SUCCESS for the four events handled, OMPI_ERROR otherwise.
 */
int mca_common_monitoring_coll_messages_notify(mca_base_pvar_t *pvar,
                                               mca_base_pvar_event_t event,
                                               void *obj_handle,
                                               int *count)
{
    switch (event) {
    case MCA_BASE_PVAR_HANDLE_BIND:
        *count = 1;
        /* fallthrough */
    case MCA_BASE_PVAR_HANDLE_UNBIND:
        return OMPI_SUCCESS;
    case MCA_BASE_PVAR_HANDLE_START:
        mca_common_monitoring_current_state = mca_common_monitoring_enabled;
        return OMPI_SUCCESS;
    case MCA_BASE_PVAR_HANDLE_STOP:
        mca_common_monitoring_current_state = 0;
        return OMPI_SUCCESS;
    }

    return OMPI_ERROR;
}

/**
 * Account for one o2a operation of the given size. Ignored while the
 * monitoring is not started.
 */
void mca_common_monitoring_coll_o2a(size_t size, mca_monitoring_coll_data_t*data)
{
    if( 0 == mca_common_monitoring_current_state ) return; /* right now the monitoring is not started */
#if OPAL_ENABLE_DEBUG
    if( NULL == data ) {
        OPAL_MONITORING_PRINT_ERR("coll: o2a: data structure empty");
        return;
    }
#endif /* OPAL_ENABLE_DEBUG */
    opal_atomic_add_fetch_size_t(&data->o2a_size, size);
    opal_atomic_add_fetch_size_t(&data->o2a_count, 1);
}

/**
 * pvar getter: number of o2a operations recorded for a communicator.
 *
 * @param value       receives the value as a size_t
 * @param obj_handle  the communicator
 * @return status of the lookup; value is only written on success.
 */
int mca_common_monitoring_coll_get_o2a_count(const struct mca_base_pvar_t *pvar,
                                             void *value,
                                             void *obj_handle)
{
    mca_monitoring_coll_data_t*data;
    int ret = mca_common_monitoring_coll_lookup((ompi_communicator_t *) obj_handle, &data);
    if( OPAL_SUCCESS == ret ) {
        *(size_t*) value = data->o2a_count;
    }
    return ret;
}

/**
 * pvar getter: accumulated size of the o2a operations of a communicator.
 *
 * @param value       receives the value as a size_t
 * @param obj_handle  the communicator
 * @return status of the lookup; value is only written on success.
 */
int mca_common_monitoring_coll_get_o2a_size(const struct mca_base_pvar_t *pvar,
                                            void *value,
                                            void *obj_handle)
{
    mca_monitoring_coll_data_t*data;
    int ret = mca_common_monitoring_coll_lookup((ompi_communicator_t *) obj_handle, &data);
    if( OPAL_SUCCESS == ret ) {
        *(size_t*) value = data->o2a_size;
    }
    return ret;
}

/**
 * Account for one a2o operation of the given size. Ignored while the
 * monitoring is not started.
 */
void mca_common_monitoring_coll_a2o(size_t size, mca_monitoring_coll_data_t*data)
{
    if( 0 == mca_common_monitoring_current_state ) return; /* right now the monitoring is not started */
#if OPAL_ENABLE_DEBUG
    if( NULL == data ) {
        OPAL_MONITORING_PRINT_ERR("coll: a2o: data structure empty");
        return;
    }
#endif /* OPAL_ENABLE_DEBUG */
    opal_atomic_add_fetch_size_t(&data->a2o_size, size);
    opal_atomic_add_fetch_size_t(&data->a2o_count, 1);
}

/**
 * pvar getter: number of a2o operations recorded for a communicator.
 *
 * @param value       receives the value as a size_t
 * @param obj_handle  the communicator
 * @return status of the lookup; value is only written on success.
 */
int mca_common_monitoring_coll_get_a2o_count(const struct mca_base_pvar_t *pvar,
                                             void *value,
                                             void *obj_handle)
{
    mca_monitoring_coll_data_t*data;
    int ret = mca_common_monitoring_coll_lookup((ompi_communicator_t *) obj_handle, &data);
    if( OPAL_SUCCESS == ret ) {
        *(size_t*) value = data->a2o_count;
    }
    return ret;
}

/**
 * pvar getter: accumulated size of the a2o operations of a communicator.
 *
 * @param value       receives the value as a size_t
 * @param obj_handle  the communicator
 * @return status of the lookup; value is only written on success.
 */
int mca_common_monitoring_coll_get_a2o_size(const struct mca_base_pvar_t *pvar,
                                            void *value,
                                            void *obj_handle)
{
    mca_monitoring_coll_data_t*data;
    int ret = mca_common_monitoring_coll_lookup((ompi_communicator_t *) obj_handle, &data);
    if( OPAL_SUCCESS == ret ) {
        *(size_t*) value = data->a2o_size;
    }
    return ret;
}

/**
 * Account for one a2a operation of the given size. Ignored while the
 * monitoring is not started.
 */
void mca_common_monitoring_coll_a2a(size_t size, mca_monitoring_coll_data_t*data)
{
    if( 0 == mca_common_monitoring_current_state ) return; /* right now the monitoring is not started */
#if OPAL_ENABLE_DEBUG
    if( NULL == data ) {
        OPAL_MONITORING_PRINT_ERR("coll: a2a: data structure empty");
        return;
    }
#endif /* OPAL_ENABLE_DEBUG */
    opal_atomic_add_fetch_size_t(&data->a2a_size, size);
    opal_atomic_add_fetch_size_t(&data->a2a_count, 1);
}

/**
 * pvar getter: number of a2a operations recorded for a communicator.
 *
 * @param value       receives the value as a size_t
 * @param obj_handle  the communicator
 * @return status of the lookup; value is only written on success.
 */
int mca_common_monitoring_coll_get_a2a_count(const struct mca_base_pvar_t *pvar,
                                             void *value,
                                             void *obj_handle)
{
    mca_monitoring_coll_data_t*data;
    int ret = mca_common_monitoring_coll_lookup((ompi_communicator_t *) obj_handle, &data);
    if( OPAL_SUCCESS == ret ) {
        *(size_t*) value = data->a2a_count;
    }
    return ret;
}

/**
 * pvar getter: accumulated size of the a2a operations of a communicator.
 *
 * @param value       receives the value as a size_t
 * @param obj_handle  the communicator
 * @return status of the lookup; value is only written on success.
 */
int mca_common_monitoring_coll_get_a2a_size(const struct mca_base_pvar_t *pvar,
                                            void *value,
                                            void *obj_handle)
{
    mca_monitoring_coll_data_t*data;
    int ret = mca_common_monitoring_coll_lookup((ompi_communicator_t *) obj_handle, &data);
    if( OPAL_SUCCESS == ret ) {
        *(size_t*) value = data->a2a_size;
    }
    return ret;
}

/* Constructor: no cached strings, unknown world rank, counters at zero. */
static void mca_monitoring_coll_construct (mca_monitoring_coll_data_t*coll_data)
{
    coll_data->procs       = NULL;
    coll_data->comm_name   = NULL;
    coll_data->world_rank  = -1;
    coll_data->p_comm      = NULL;
    coll_data->is_released = 0;
    coll_data->o2a_count   = 0;
    coll_data->o2a_size    = 0;
    coll_data->a2o_count   = 0;
    coll_data->a2o_size    = 0;
    coll_data->a2a_count   = 0;
    coll_data->a2a_size    = 0;
}

/* Destructor: the cached strings are freed by the release path, not here. */
static void mca_monitoring_coll_destruct (mca_monitoring_coll_data_t*coll_data){}

OBJ_CLASS_INSTANCE(mca_monitoring_coll_data_t, opal_object_t, mca_monitoring_coll_construct, mca_monitoring_coll_destruct);