/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006 Voltaire. All rights reserved. * Copyright (c) 2007 Mellanox Technologies. All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow * * $HEADER$ */ #define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 #include "opal_config.h" #include "mpool_hugepage.h" #include "opal/align.h" #include #include #ifdef HAVE_MALLOC_H # include #endif #include "opal/include/opal_stdint.h" #include "opal/mca/allocator/base/base.h" #include "opal/mca/mpool/base/base.h" #include "opal/runtime/opal_params.h" #include "opal/util/printf.h" #include #include static void *mca_mpool_hugepage_alloc(mca_mpool_base_module_t *mpool, size_t size, size_t align, uint32_t flags); static void *mca_mpool_hugepage_realloc(mca_mpool_base_module_t *mpool, void *addr, size_t size); static void mca_mpool_hugepage_free(mca_mpool_base_module_t *mpool, void *addr); static void mca_mpool_hugepage_finalize(mca_mpool_base_module_t *mpool); static void mca_mpool_hugepage_hugepage_constructor(mca_mpool_hugepage_hugepage_t *huge_page) { memset((char *) huge_page + sizeof(huge_page->super), 0, sizeof(*huge_page) - sizeof(huge_page->super)); } static void mca_mpool_hugepage_hugepage_destructor(mca_mpool_hugepage_hugepage_t *huge_page) { free(huge_page->path); } OBJ_CLASS_INSTANCE(mca_mpool_hugepage_hugepage_t, opal_list_item_t, mca_mpool_hugepage_hugepage_constructor, mca_mpool_hugepage_hugepage_destructor); static int mca_mpool_rb_hugepage_compare(void *key1, void *key2) { if (key1 == key2) { return 0; } return (key1 < key2) ? -1 : 1; } /* * Initializes the mpool module. */ int mca_mpool_hugepage_module_init(mca_mpool_hugepage_module_t *mpool, mca_mpool_hugepage_hugepage_t *huge_page) { mca_allocator_base_component_t *allocator_component; int rc; mpool->super.mpool_component = &mca_mpool_hugepage_component.super; mpool->super.mpool_base = NULL; /* no base .. */ mpool->super.mpool_alloc = mca_mpool_hugepage_alloc; mpool->super.mpool_realloc = mca_mpool_hugepage_realloc; mpool->super.mpool_free = mca_mpool_hugepage_free; mpool->super.mpool_finalize = mca_mpool_hugepage_finalize; mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM; OBJ_CONSTRUCT(&mpool->lock, opal_mutex_t); mpool->huge_page = huge_page; /* use an allocator component to reduce waste when making small allocations */ allocator_component = mca_allocator_component_lookup("bucket"); if (NULL == allocator_component) { return OPAL_ERR_NOT_AVAILABLE; } mpool->allocator = allocator_component->allocator_init(true, mca_mpool_hugepage_seg_alloc, mca_mpool_hugepage_seg_free, mpool); OBJ_CONSTRUCT(&mpool->allocation_tree, opal_rb_tree_t); rc = opal_rb_tree_init(&mpool->allocation_tree, mca_mpool_rb_hugepage_compare); if (OPAL_SUCCESS != rc) { OBJ_DESTRUCT(&mpool->allocation_tree); return OPAL_ERR_NOT_AVAILABLE; } return OPAL_SUCCESS; } void *mca_mpool_hugepage_seg_alloc(void *ctx, size_t *sizep) { mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) ctx; mca_mpool_hugepage_hugepage_t *huge_page = hugepage_module->huge_page; size_t size = *sizep; void *base = NULL; char *path = NULL; int flags = MAP_PRIVATE; int fd = -1; int rc; size = OPAL_ALIGN(size, huge_page->page_size, size_t); if (huge_page->path) { int32_t count; count = opal_atomic_add_fetch_32(&huge_page->count, 1); rc = opal_asprintf(&path, "%s/hugepage.openmpi.%d.%d", huge_page->path, getpid(), count); if (0 > rc) { return NULL; } fd = open(path, O_RDWR | O_CREAT, 0600); if (-1 == fd) { free(path); return NULL; } if (0 != ftruncate(fd, size)) { close(fd); unlink(path); free(path); return NULL; } } else { #if defined(MAP_ANONYMOUS) flags |= MAP_ANONYMOUS; #elif defined(MAP_ANON) /* older versions of OS X do not define MAP_ANONYMOUS (10.9.x and older) */ flags |= MAP_ANON; #endif } base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags | huge_page->mmap_flags, fd, 0); if (path) { unlink(path); free(path); } if (fd >= 0) { close(fd); } if (MAP_FAILED == base) { opal_output_verbose(MCA_BASE_VERBOSE_WARN, opal_mpool_base_framework.framework_verbose, "could not allocate huge page(s). falling back on standard pages"); /* fall back on regular pages */ base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); } if (MAP_FAILED == base) { return NULL; } opal_mutex_lock(&hugepage_module->lock); opal_rb_tree_insert(&hugepage_module->allocation_tree, base, (void *) (intptr_t) size); (void) opal_atomic_fetch_add_size_t(&mca_mpool_hugepage_component.bytes_allocated, size); opal_mutex_unlock(&hugepage_module->lock); OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_mpool_base_framework.framework_verbose, "allocated segment %p of size %lu bytes", base, size)); *sizep = size; return base; } void mca_mpool_hugepage_seg_free(void *ctx, void *addr) { mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) ctx; size_t size; opal_mutex_lock(&hugepage_module->lock); size = (size_t)(intptr_t) opal_rb_tree_find(&hugepage_module->allocation_tree, addr); if (size > 0) { opal_rb_tree_delete(&hugepage_module->allocation_tree, addr); OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_mpool_base_framework.framework_verbose, "freeing segment %p of size %lu bytes", addr, size)); munmap(addr, size); (void) opal_atomic_fetch_add_size_t(&mca_mpool_hugepage_component.bytes_allocated, -size); } opal_mutex_unlock(&hugepage_module->lock); } /** * allocate function */ static void *mca_mpool_hugepage_alloc(mca_mpool_base_module_t *mpool, size_t size, size_t align, uint32_t flags) { mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; return hugepage_module->allocator->alc_alloc(hugepage_module->allocator, size, align); } /** * allocate function */ static void *mca_mpool_hugepage_realloc(mca_mpool_base_module_t *mpool, void *addr, size_t size) { mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; return hugepage_module->allocator->alc_realloc(hugepage_module->allocator, addr, size); } /** * free function */ static void mca_mpool_hugepage_free(mca_mpool_base_module_t *mpool, void *addr) { mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; hugepage_module->allocator->alc_free(hugepage_module->allocator, addr); } static void mca_mpool_hugepage_finalize(struct mca_mpool_base_module_t *mpool) { mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; if (hugepage_module->allocator) { (void) hugepage_module->allocator->alc_finalize(hugepage_module->allocator); hugepage_module->allocator = NULL; } OBJ_DESTRUCT(&hugepage_module->lock); OBJ_DESTRUCT(&hugepage_module->allocation_tree); }