/*
 * Copyright (c) 2019      The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2019      Arm Ltd.  All rights reserved.
 * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "ompi_config.h"

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

#include "opal/util/output.h"

#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
#include "ompi/mca/op/base/base.h"
#include "ompi/mca/op/aarch64/op_aarch64.h"

#if defined(GENERATE_SVE_CODE)
#    include <arm_sve.h>
#    define OMPI_OP_TYPE_PREPEND sv
#    define OMPI_OP_OP_PREPEND sv
#    define APPEND _sve
#elif defined(GENERATE_NEON_CODE)
#    include <arm_neon.h>
#    define OMPI_OP_TYPE_PREPEND
#    define OMPI_OP_OP_PREPEND v
#    define APPEND _neon
#else
#    error Exactly one of GENERATE_SVE_CODE or GENERATE_NEON_CODE must be defined
#endif /* defined(GENERATE_SVE_CODE) */

/*
 * Concatenate preprocessor tokens A and B without expanding macro definitions
 * (however, if invoked from a macro, macro arguments are expanded).
 */
#define OP_CONCAT_NX(A, B) A ## B

/*
 * Concatenate preprocessor tokens A and B after macro-expanding them.
 */
#define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)

#if defined(GENERATE_SVE_CODE)
/* Map an element type to the SVE "elements per vector" counting intrinsic. */
#define svcnt(X)                \
    _Generic((X),               \
             int8_t: svcntb,    \
             uint8_t: svcntb,   \
             int16_t: svcnth,   \
             uint16_t: svcnth,  \
             int32_t: svcntw,   \
             uint32_t: svcntw,  \
             int64_t: svcntd,   \
             uint64_t: svcntd,  \
             float32_t: svcntw, \
             float64_t: svcntd)()
#else

#define DUMP2(out, in1, in2)                                 \
    case 2: (out)[1] = current_func((in1)[1], (in2)[1]);     \
    case 1: (out)[0] = current_func((in1)[0], (in2)[0]);

#define DUMP4(out, in1, in2)                                 \
    case 4: (out)[3] = current_func((in1)[3], (in2)[3]);     \
    case 3: (out)[2] = current_func((in1)[2], (in2)[2]);     \
    DUMP2(out, in1, in2)

#define DUMP8(out, in1, in2)                                 \
    case 8: (out)[7] = current_func((in1)[7], (in2)[7]);     \
    case 7: (out)[6] = current_func((in1)[6], (in2)[6]);     \
    case 6: (out)[5] = current_func((in1)[5], (in2)[5]);     \
    case 5: (out)[4] = current_func((in1)[4], (in2)[4]);     \
    DUMP4(out, in1, in2)

#define DUMP16(out, in1, in2)                                \
    case 16: (out)[15] = current_func((in1)[15], (in2)[15]); \
    case 15: (out)[14] = current_func((in1)[14], (in2)[14]); \
    case 14: (out)[13] = current_func((in1)[13], (in2)[13]); \
    case 13: (out)[12] = current_func((in1)[12], (in2)[12]); \
    case 12: (out)[11] = current_func((in1)[11], (in2)[11]); \
    case 11: (out)[10] = current_func((in1)[10], (in2)[10]); \
    case 10: (out)[ 9] = current_func((in1)[ 9], (in2)[ 9]); \
    case  9: (out)[ 8] = current_func((in1)[ 8], (in2)[ 8]); \
    DUMP8(out, in1, in2)

#define neon_loop(how_much, out, in1, in2)                                               \
    _Generic((*(out)),                                                                   \
             int8_t: __extension__({ switch ((how_much)) { DUMP16(out, in1, in2) } }),   \
             uint8_t: __extension__({ switch ((how_much)) { DUMP16(out, in1, in2) } }),  \
             int16_t: __extension__({ switch ((how_much)) { DUMP8(out, in1, in2) } }),   \
             uint16_t: __extension__({ switch ((how_much)) { DUMP8(out, in1, in2) } }),  \
             int32_t: __extension__({ switch ((how_much)) { DUMP4(out, in1, in2) } }),   \
             uint32_t: __extension__({ switch ((how_much)) { DUMP4(out, in1, in2) } }),  \
             int64_t: __extension__({ switch ((how_much)) { DUMP2(out, in1, in2) } }),   \
             uint64_t: __extension__({ switch ((how_much)) { DUMP2(out, in1, in2) } }),  \
             float32_t: __extension__({ switch ((how_much)) { DUMP4(out, in1, in2) } }), \
             float64_t: __extension__({ switch ((how_much)) { DUMP2(out, in1, in2) } }))
#endif /* defined(GENERATE_SVE_CODE) */
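/*
 * Note on the DUMP ladders above: the "case" labels intentionally fall
 * through (Duff's-device style).  For example, with 32-bit elements and
 * left_over == 3, neon_loop() picks DUMP4 and enters the switch at
 * "case 3", so elements [2], [1], and [0] are reduced scalar-wise and
 * exactly the sub-vector-width remainder is handled.
 */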
/*
 * Since all the functions in this file are essentially identical, we use a
 * macro to substitute in names and types.  The core operation in all
 * functions that use this macro is the same.
 *
 * This macro is for (out op in).
 */
#if defined(GENERATE_NEON_CODE)
#define OP_AARCH64_FUNC(name, type_name, type_size, type_cnt, type, op)                       \
    static void OP_CONCAT(ompi_op_aarch64_2buff_##name##_##type##type_size##_t,               \
                          APPEND)(const void *_in, void *_out, int *count,                    \
                                  struct ompi_datatype_t **dtype,                             \
                                  struct ompi_op_base_module_1_0_0_t *module)                 \
    {                                                                                         \
        int left_over = *count;                                                               \
        type##type_size##_t *in = (type##type_size##_t *) _in,                                \
                            *out = (type##type_size##_t *) _out;                              \
        OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##x##type_cnt##_t) vsrc, vdst;         \
        for (; left_over >= type_cnt; left_over -= type_cnt) {                                \
            vsrc = vld1q##_##type_name##type_size(in);                                        \
            vdst = vld1q##_##type_name##type_size(out);                                       \
            in += type_cnt;                                                                   \
            vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##q##_##type_name##type_size)(vdst, vsrc); \
            vst1q##_##type_name##type_size(out, vdst);                                        \
            out += type_cnt;                                                                  \
        }                                                                                     \
                                                                                              \
        if (left_over > 0) {                                                                  \
            neon_loop(left_over, out, out, in);                                               \
        }                                                                                     \
    }
#elif defined(GENERATE_SVE_CODE)
#define OP_AARCH64_FUNC(name, type_name, type_size, type_cnt, type, op)                 \
    static void OP_CONCAT(ompi_op_aarch64_2buff_##name##_##type##type_size##_t, APPEND) \
        (const void *_in, void *_out, int *count,                                       \
         struct ompi_datatype_t **dtype,                                                \
         struct ompi_op_base_module_1_0_0_t *module)                                    \
    {                                                                                   \
        int types_per_step = svcnt(*((type##type_size##_t *) _in));                    \
        size_t idx = 0, left_over = *count;                                             \
        type##type_size##_t *in = (type##type_size##_t *) _in,                          \
                            *out = (type##type_size##_t *) _out;                        \
        OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst;                \
        svbool_t pred = svwhilelt_b##type_size(idx, left_over);                         \
        do {                                                                            \
            vsrc = svld1(pred, &in[idx]);                                               \
            vdst = svld1(pred, &out[idx]);                                              \
            vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc);             \
            OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst);                  \
            idx += types_per_step;                                                      \
            pred = svwhilelt_b##type_size(idx, left_over);                              \
        } while (svptest_any(svptrue_b##type_size(), pred));                            \
    }
#endif
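/*
 * Expansion sketch (hand-expanded for reference, not compiled): under
 * GENERATE_NEON_CODE, OP_AARCH64_FUNC(sum, s, 32, 4, int, add) generates
 * roughly
 *
 *   static void ompi_op_aarch64_2buff_sum_int32_t_neon(
 *       const void *_in, void *_out, int *count, ...)
 *   {
 *       ...
 *       vsrc = vld1q_s32(in);          // load 4 source lanes
 *       vdst = vld1q_s32(out);         // load 4 destination lanes
 *       vdst = vaddq_s32(vdst, vsrc);  // out = out + in, 4 lanes at a time
 *       vst1q_s32(out, vdst);
 *       ...
 *   }
 */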
/*************************************************************************
 * Max
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) > (b) ? (a) : (b))
OP_AARCH64_FUNC(max, s, 8, 16, int, max)
OP_AARCH64_FUNC(max, u, 8, 16, uint, max)
OP_AARCH64_FUNC(max, s, 16, 8, int, max)
OP_AARCH64_FUNC(max, u, 16, 8, uint, max)
OP_AARCH64_FUNC(max, s, 32, 4, int, max)
OP_AARCH64_FUNC(max, u, 32, 4, uint, max)
#if defined(GENERATE_SVE_CODE)
OP_AARCH64_FUNC(max, s, 64, 2, int, max)
OP_AARCH64_FUNC(max, u, 64, 2, uint, max)
#endif /* defined(GENERATE_SVE_CODE) */
OP_AARCH64_FUNC(max, f, 32, 4, float, max)
OP_AARCH64_FUNC(max, f, 64, 2, float, max)

/*************************************************************************
 * Min
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) < (b) ? (a) : (b))
OP_AARCH64_FUNC(min, s, 8, 16, int, min)
OP_AARCH64_FUNC(min, u, 8, 16, uint, min)
OP_AARCH64_FUNC(min, s, 16, 8, int, min)
OP_AARCH64_FUNC(min, u, 16, 8, uint, min)
OP_AARCH64_FUNC(min, s, 32, 4, int, min)
OP_AARCH64_FUNC(min, u, 32, 4, uint, min)
#if defined(GENERATE_SVE_CODE)
OP_AARCH64_FUNC(min, s, 64, 2, int, min)
OP_AARCH64_FUNC(min, u, 64, 2, uint, min)
#endif /* defined(GENERATE_SVE_CODE) */
OP_AARCH64_FUNC(min, f, 32, 4, float, min)
OP_AARCH64_FUNC(min, f, 64, 2, float, min)

/*************************************************************************
 * Sum
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) + (b))
OP_AARCH64_FUNC(sum, s, 8, 16, int, add)
OP_AARCH64_FUNC(sum, u, 8, 16, uint, add)
OP_AARCH64_FUNC(sum, s, 16, 8, int, add)
OP_AARCH64_FUNC(sum, u, 16, 8, uint, add)
OP_AARCH64_FUNC(sum, s, 32, 4, int, add)
OP_AARCH64_FUNC(sum, u, 32, 4, uint, add)
OP_AARCH64_FUNC(sum, s, 64, 2, int, add)
OP_AARCH64_FUNC(sum, u, 64, 2, uint, add)
OP_AARCH64_FUNC(sum, f, 32, 4, float, add)
OP_AARCH64_FUNC(sum, f, 64, 2, float, add)

/*************************************************************************
 * Product
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) * (b))
OP_AARCH64_FUNC(prod, s, 8, 16, int, mul)
OP_AARCH64_FUNC(prod, u, 8, 16, uint, mul)
OP_AARCH64_FUNC(prod, s, 16, 8, int, mul)
OP_AARCH64_FUNC(prod, u, 16, 8, uint, mul)
OP_AARCH64_FUNC(prod, s, 32, 4, int, mul)
OP_AARCH64_FUNC(prod, u, 32, 4, uint, mul)
#if defined(GENERATE_SVE_CODE)
OP_AARCH64_FUNC(prod, s, 64, 2, int, mul)
OP_AARCH64_FUNC(prod, u, 64, 2, uint, mul)
#endif /* defined(GENERATE_SVE_CODE) */
OP_AARCH64_FUNC(prod, f, 32, 4, float, mul)
OP_AARCH64_FUNC(prod, f, 64, 2, float, mul)

/*************************************************************************
 * Bitwise AND
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) & (b))
OP_AARCH64_FUNC(band, s, 8, 16, int, and)
OP_AARCH64_FUNC(band, u, 8, 16, uint, and)
OP_AARCH64_FUNC(band, s, 16, 8, int, and)
OP_AARCH64_FUNC(band, u, 16, 8, uint, and)
OP_AARCH64_FUNC(band, s, 32, 4, int, and)
OP_AARCH64_FUNC(band, u, 32, 4, uint, and)
OP_AARCH64_FUNC(band, s, 64, 2, int, and)
OP_AARCH64_FUNC(band, u, 64, 2, uint, and)

/*************************************************************************
 * Bitwise OR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) | (b))
OP_AARCH64_FUNC(bor, s, 8, 16, int, orr)
OP_AARCH64_FUNC(bor, u, 8, 16, uint, orr)
OP_AARCH64_FUNC(bor, s, 16, 8, int, orr)
OP_AARCH64_FUNC(bor, u, 16, 8, uint, orr)
OP_AARCH64_FUNC(bor, s, 32, 4, int, orr)
OP_AARCH64_FUNC(bor, u, 32, 4, uint, orr)
OP_AARCH64_FUNC(bor, s, 64, 2, int, orr)
OP_AARCH64_FUNC(bor, u, 64, 2, uint, orr)

/*************************************************************************
 * Bitwise XOR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) ^ (b))
OP_AARCH64_FUNC(bxor, s, 8, 16, int, eor)
OP_AARCH64_FUNC(bxor, u, 8, 16, uint, eor)
OP_AARCH64_FUNC(bxor, s, 16, 8, int, eor)
OP_AARCH64_FUNC(bxor, u, 16, 8, uint, eor)
OP_AARCH64_FUNC(bxor, s, 32, 4, int, eor)
OP_AARCH64_FUNC(bxor, u, 32, 4, uint, eor)
OP_AARCH64_FUNC(bxor, s, 64, 2, int, eor)
OP_AARCH64_FUNC(bxor, u, 64, 2, uint, eor)
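/*
 * The GENERATE_SVE_CODE guards around the 64-bit integer max/min/prod
 * variants above reflect an ISA limitation: AdvSIMD (NEON) provides no
 * 64-bit integer min, max, or multiply vector instructions.  In the NEON
 * build those table slots stay NULL and the op framework falls back to
 * its default implementation for those type/op combinations.
 */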
/*
 * This is a three-buffer (2 input and 1 output) version of the reduction
 * routines, needed for some optimizations.
 */
#if defined(GENERATE_NEON_CODE)
#define OP_AARCH64_FUNC_3BUFF(name, type_name, type_size, type_cnt, type, op)                 \
    static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPEND)       \
        (const void *_in1, const void *_in2, void *_out, int *count,                          \
         struct ompi_datatype_t **dtype,                                                      \
         struct ompi_op_base_module_1_0_0_t *module)                                          \
    {                                                                                         \
        int left_over = *count;                                                               \
        type##type_size##_t *in1 = (type##type_size##_t *) _in1,                              \
                            *in2 = (type##type_size##_t *) _in2,                              \
                            *out = (type##type_size##_t *) _out;                              \
        OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##x##type_cnt##_t) vsrc, vdst;         \
        for (; left_over >= type_cnt; left_over -= type_cnt) {                                \
            vsrc = vld1q##_##type_name##type_size(in1);                                       \
            vdst = vld1q##_##type_name##type_size(in2);                                       \
            in1 += type_cnt;                                                                  \
            in2 += type_cnt;                                                                  \
            vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##q##_##type_name##type_size)(vdst, vsrc); \
            vst1q##_##type_name##type_size(out, vdst);                                        \
            out += type_cnt;                                                                  \
        }                                                                                     \
        if (left_over > 0) {                                                                  \
            neon_loop(left_over, out, in1, in2);                                              \
        }                                                                                     \
    }
#elif defined(GENERATE_SVE_CODE)
#define OP_AARCH64_FUNC_3BUFF(name, type_name, type_size, type_cnt, type, op)           \
    static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPEND) \
        (const void *_in1, const void *_in2, void *_out, int *count,                    \
         struct ompi_datatype_t **dtype,                                                \
         struct ompi_op_base_module_1_0_0_t *module)                                    \
    {                                                                                   \
        int types_per_step = svcnt(*((type##type_size##_t *) _in1));                    \
        type##type_size##_t *in1 = (type##type_size##_t *) _in1,                        \
                            *in2 = (type##type_size##_t *) _in2,                        \
                            *out = (type##type_size##_t *) _out;                        \
        size_t idx = 0, left_over = *count;                                             \
        OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst;                \
        svbool_t pred = svwhilelt_b##type_size(idx, left_over);                         \
        do {                                                                            \
            vsrc = svld1(pred, &in1[idx]);                                              \
            vdst = svld1(pred, &in2[idx]);                                              \
            vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc);             \
            OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst);                  \
            idx += types_per_step;                                                      \
            pred = svwhilelt_b##type_size(idx, left_over);                              \
        } while (svptest_any(svptrue_b##type_size(), pred));                            \
    }
#endif /* defined(GENERATE_SVE_CODE) */
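/*
 * Control-flow sketch of the SVE variant (hand-expanded, not compiled):
 * for float64 data the generated 3buff loop is roughly
 *
 *   svbool_t pred = svwhilelt_b64(idx, left_over);  // lanes where idx+i < n
 *   do {
 *       svfloat64_t vsrc = svld1(pred, &in1[idx]);  // predicated loads
 *       svfloat64_t vdst = svld1(pred, &in2[idx]);
 *       vdst = svadd_x(pred, vdst, vsrc);
 *       svst1(pred, &out[idx], vdst);               // predicated store
 *       idx += svcntd();                            // vector-length-agnostic step
 *       pred = svwhilelt_b64(idx, left_over);
 *   } while (svptest_any(svptrue_b64(), pred));
 *
 * The predicate masks off the tail, so unlike the NEON path no scalar
 * remainder loop is needed.
 */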
/*************************************************************************
 * Max
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) > (b) ? (a) : (b))
OP_AARCH64_FUNC_3BUFF(max, s, 8, 16, int, max)
OP_AARCH64_FUNC_3BUFF(max, u, 8, 16, uint, max)
OP_AARCH64_FUNC_3BUFF(max, s, 16, 8, int, max)
OP_AARCH64_FUNC_3BUFF(max, u, 16, 8, uint, max)
OP_AARCH64_FUNC_3BUFF(max, s, 32, 4, int, max)
OP_AARCH64_FUNC_3BUFF(max, u, 32, 4, uint, max)
#if defined(GENERATE_SVE_CODE)
OP_AARCH64_FUNC_3BUFF(max, s, 64, 2, int, max)
OP_AARCH64_FUNC_3BUFF(max, u, 64, 2, uint, max)
#endif /* defined(GENERATE_SVE_CODE) */
OP_AARCH64_FUNC_3BUFF(max, f, 32, 4, float, max)
OP_AARCH64_FUNC_3BUFF(max, f, 64, 2, float, max)

/*************************************************************************
 * Min
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) < (b) ? (a) : (b))
OP_AARCH64_FUNC_3BUFF(min, s, 8, 16, int, min)
OP_AARCH64_FUNC_3BUFF(min, u, 8, 16, uint, min)
OP_AARCH64_FUNC_3BUFF(min, s, 16, 8, int, min)
OP_AARCH64_FUNC_3BUFF(min, u, 16, 8, uint, min)
OP_AARCH64_FUNC_3BUFF(min, s, 32, 4, int, min)
OP_AARCH64_FUNC_3BUFF(min, u, 32, 4, uint, min)
#if defined(GENERATE_SVE_CODE)
OP_AARCH64_FUNC_3BUFF(min, s, 64, 2, int, min)
OP_AARCH64_FUNC_3BUFF(min, u, 64, 2, uint, min)
#endif /* defined(GENERATE_SVE_CODE) */
OP_AARCH64_FUNC_3BUFF(min, f, 32, 4, float, min)
OP_AARCH64_FUNC_3BUFF(min, f, 64, 2, float, min)

/*************************************************************************
 * Sum
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) + (b))
OP_AARCH64_FUNC_3BUFF(sum, s, 8, 16, int, add)
OP_AARCH64_FUNC_3BUFF(sum, u, 8, 16, uint, add)
OP_AARCH64_FUNC_3BUFF(sum, s, 16, 8, int, add)
OP_AARCH64_FUNC_3BUFF(sum, u, 16, 8, uint, add)
OP_AARCH64_FUNC_3BUFF(sum, s, 32, 4, int, add)
OP_AARCH64_FUNC_3BUFF(sum, u, 32, 4, uint, add)
OP_AARCH64_FUNC_3BUFF(sum, s, 64, 2, int, add)
OP_AARCH64_FUNC_3BUFF(sum, u, 64, 2, uint, add)
OP_AARCH64_FUNC_3BUFF(sum, f, 32, 4, float, add)
OP_AARCH64_FUNC_3BUFF(sum, f, 64, 2, float, add)

/*************************************************************************
 * Product
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) * (b))
OP_AARCH64_FUNC_3BUFF(prod, s, 8, 16, int, mul)
OP_AARCH64_FUNC_3BUFF(prod, u, 8, 16, uint, mul)
OP_AARCH64_FUNC_3BUFF(prod, s, 16, 8, int, mul)
OP_AARCH64_FUNC_3BUFF(prod, u, 16, 8, uint, mul)
OP_AARCH64_FUNC_3BUFF(prod, s, 32, 4, int, mul)
OP_AARCH64_FUNC_3BUFF(prod, u, 32, 4, uint, mul)
#if defined(GENERATE_SVE_CODE)
OP_AARCH64_FUNC_3BUFF(prod, s, 64, 2, int, mul)
OP_AARCH64_FUNC_3BUFF(prod, u, 64, 2, uint, mul)
#endif /* defined(GENERATE_SVE_CODE) */
OP_AARCH64_FUNC_3BUFF(prod, f, 32, 4, float, mul)
OP_AARCH64_FUNC_3BUFF(prod, f, 64, 2, float, mul)

/*************************************************************************
 * Bitwise AND
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) & (b))
OP_AARCH64_FUNC_3BUFF(band, s, 8, 16, int, and)
OP_AARCH64_FUNC_3BUFF(band, u, 8, 16, uint, and)
OP_AARCH64_FUNC_3BUFF(band, s, 16, 8, int, and)
OP_AARCH64_FUNC_3BUFF(band, u, 16, 8, uint, and)
OP_AARCH64_FUNC_3BUFF(band, s, 32, 4, int, and)
OP_AARCH64_FUNC_3BUFF(band, u, 32, 4, uint, and)
OP_AARCH64_FUNC_3BUFF(band, s, 64, 2, int, and)
OP_AARCH64_FUNC_3BUFF(band, u, 64, 2, uint, and)

/*************************************************************************
 * Bitwise OR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) | (b))
OP_AARCH64_FUNC_3BUFF(bor, s, 8, 16, int, orr)
OP_AARCH64_FUNC_3BUFF(bor, u, 8, 16, uint, orr)
OP_AARCH64_FUNC_3BUFF(bor, s, 16, 8, int, orr)
OP_AARCH64_FUNC_3BUFF(bor, u, 16, 8, uint, orr)
OP_AARCH64_FUNC_3BUFF(bor, s, 32, 4, int, orr)
OP_AARCH64_FUNC_3BUFF(bor, u, 32, 4, uint, orr)
OP_AARCH64_FUNC_3BUFF(bor, s, 64, 2, int, orr)
OP_AARCH64_FUNC_3BUFF(bor, u, 64, 2, uint, orr)

/*************************************************************************
 * Bitwise XOR
 *************************************************************************/
#undef current_func
#define current_func(a, b) ((a) ^ (b))
OP_AARCH64_FUNC_3BUFF(bxor, s, 8, 16, int, eor)
OP_AARCH64_FUNC_3BUFF(bxor, u, 8, 16, uint, eor)
OP_AARCH64_FUNC_3BUFF(bxor, s, 16, 8, int, eor)
OP_AARCH64_FUNC_3BUFF(bxor, u, 16, 8, uint, eor)
OP_AARCH64_FUNC_3BUFF(bxor, s, 32, 4, int, eor)
OP_AARCH64_FUNC_3BUFF(bxor, u, 32, 4, uint, eor)
OP_AARCH64_FUNC_3BUFF(bxor, s, 64, 2, int, eor)
OP_AARCH64_FUNC_3BUFF(bxor, u, 64, 2, uint, eor)
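/*
 * The table-building macros below are split so that the 64-bit integer
 * entries (C_INTEGER_EX) can be included only when the corresponding
 * functions were actually generated: unconditionally for sum/band/bor/bxor,
 * but only under GENERATE_SVE_CODE for max/min/prod.
 */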
/** C integer ***********************************************************/
#define C_INTEGER_BASE(name, ftype)                                                                \
    [OMPI_OP_BASE_TYPE_INT8_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_int8_t, APPEND),     \
    [OMPI_OP_BASE_TYPE_UINT8_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_uint8_t, APPEND),   \
    [OMPI_OP_BASE_TYPE_INT16_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_int16_t, APPEND),   \
    [OMPI_OP_BASE_TYPE_UINT16_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_uint16_t, APPEND), \
    [OMPI_OP_BASE_TYPE_INT32_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_int32_t, APPEND),   \
    [OMPI_OP_BASE_TYPE_UINT32_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_uint32_t, APPEND)
#define C_INTEGER_EX(name, ftype)                                                                  \
    [OMPI_OP_BASE_TYPE_INT64_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_int64_t, APPEND),   \
    [OMPI_OP_BASE_TYPE_UINT64_T] = OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_uint64_t, APPEND)

/** Floating point, including all the Fortran reals *********************/
#define FLOAT(name, ftype) OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_float32_t, APPEND)
#define DOUBLE(name, ftype) OP_CONCAT(ompi_op_aarch64_##ftype##_##name##_float64_t, APPEND)

#define FLOATING_POINT(name, ftype)                 \
    [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \
    [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype)

/*
 * MPI_OP_NULL
 * All types
 */
#define FLAGS_NO_FLOAT \
    (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE)
#define FLAGS                                        \
    (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \
     OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE)

ompi_op_base_handler_fn_t OP_CONCAT(ompi_op_aarch64_functions, APPEND)
    [OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = {
    /* Corresponds to MPI_OP_NULL */
    [OMPI_OP_BASE_FORTRAN_NULL] = {
        /* Leaving this empty puts in NULL for all entries */
        NULL,
    },
    /* Corresponds to MPI_MAX */
    [OMPI_OP_BASE_FORTRAN_MAX] = {
        C_INTEGER_BASE(max, 2buff),
#if defined(GENERATE_SVE_CODE)
        C_INTEGER_EX(max, 2buff),
#endif /* defined(GENERATE_SVE_CODE) */
        FLOATING_POINT(max, 2buff),
    },
    /* Corresponds to MPI_MIN */
    [OMPI_OP_BASE_FORTRAN_MIN] = {
        C_INTEGER_BASE(min, 2buff),
#if defined(GENERATE_SVE_CODE)
        C_INTEGER_EX(min, 2buff),
#endif /* defined(GENERATE_SVE_CODE) */
        FLOATING_POINT(min, 2buff),
    },
    /* Corresponds to MPI_SUM */
    [OMPI_OP_BASE_FORTRAN_SUM] = {
        C_INTEGER_BASE(sum, 2buff),
        C_INTEGER_EX(sum, 2buff),
        FLOATING_POINT(sum, 2buff),
    },
    /* Corresponds to MPI_PROD */
    [OMPI_OP_BASE_FORTRAN_PROD] = {
        C_INTEGER_BASE(prod, 2buff),
#if defined(GENERATE_SVE_CODE)
        C_INTEGER_EX(prod, 2buff),
#endif /* defined(GENERATE_SVE_CODE) */
        FLOATING_POINT(prod, 2buff),
    },
    /* Corresponds to MPI_LAND */
    [OMPI_OP_BASE_FORTRAN_LAND] = {
        NULL,
    },
    /* Corresponds to MPI_BAND */
    [OMPI_OP_BASE_FORTRAN_BAND] = {
        C_INTEGER_BASE(band, 2buff),
        C_INTEGER_EX(band, 2buff),
    },
    /* Corresponds to MPI_LOR */
    [OMPI_OP_BASE_FORTRAN_LOR] = {
        NULL,
    },
    /* Corresponds to MPI_BOR */
    [OMPI_OP_BASE_FORTRAN_BOR] = {
        C_INTEGER_BASE(bor, 2buff),
        C_INTEGER_EX(bor, 2buff),
    },
    /* Corresponds to MPI_LXOR */
    [OMPI_OP_BASE_FORTRAN_LXOR] = {
        NULL,
    },
    /* Corresponds to MPI_BXOR */
    [OMPI_OP_BASE_FORTRAN_BXOR] = {
        C_INTEGER_BASE(bxor, 2buff),
        C_INTEGER_EX(bxor, 2buff),
    },
    /* Corresponds to MPI_REPLACE */
    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
        /* (MPI_ACCUMULATE is handled differently than the other
           reductions, so just zero out its function implementations
           here to ensure that users don't invoke MPI_REPLACE with any
           reduction operations other than ACCUMULATE) */
        NULL,
    },
};
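/*
 * Dispatch illustration (hypothetical caller, for reference only): the op
 * framework indexes these tables as [op][type] and calls the handler when
 * one is present, e.g. for the NEON build:
 *
 *   ompi_op_base_handler_fn_t fn =
 *       ompi_op_aarch64_functions_neon[OMPI_OP_BASE_FORTRAN_SUM]
 *                                     [OMPI_OP_BASE_TYPE_FLOAT];
 *   if (NULL != fn) {
 *       fn(inbuf, inoutbuf, &count, &dtype, module);
 *   }
 */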
ompi_op_base_3buff_handler_fn_t OP_CONCAT(ompi_op_aarch64_3buff_functions, APPEND)
    [OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = {
    /* Corresponds to MPI_OP_NULL */
    [OMPI_OP_BASE_FORTRAN_NULL] = {
        /* Leaving this empty puts in NULL for all entries */
        NULL,
    },
    /* Corresponds to MPI_MAX */
    [OMPI_OP_BASE_FORTRAN_MAX] = {
        C_INTEGER_BASE(max, 3buff),
#if defined(GENERATE_SVE_CODE)
        C_INTEGER_EX(max, 3buff),
#endif /* defined(GENERATE_SVE_CODE) */
        FLOATING_POINT(max, 3buff),
    },
    /* Corresponds to MPI_MIN */
    [OMPI_OP_BASE_FORTRAN_MIN] = {
        C_INTEGER_BASE(min, 3buff),
#if defined(GENERATE_SVE_CODE)
        C_INTEGER_EX(min, 3buff),
#endif /* defined(GENERATE_SVE_CODE) */
        FLOATING_POINT(min, 3buff),
    },
    /* Corresponds to MPI_SUM */
    [OMPI_OP_BASE_FORTRAN_SUM] = {
        C_INTEGER_BASE(sum, 3buff),
        C_INTEGER_EX(sum, 3buff),
        FLOATING_POINT(sum, 3buff),
    },
    /* Corresponds to MPI_PROD */
    [OMPI_OP_BASE_FORTRAN_PROD] = {
        C_INTEGER_BASE(prod, 3buff),
#if defined(GENERATE_SVE_CODE)
        C_INTEGER_EX(prod, 3buff),
#endif /* defined(GENERATE_SVE_CODE) */
        FLOATING_POINT(prod, 3buff),
    },
    /* Corresponds to MPI_LAND */
    [OMPI_OP_BASE_FORTRAN_LAND] = {
        NULL,
    },
    /* Corresponds to MPI_BAND */
    [OMPI_OP_BASE_FORTRAN_BAND] = {
        C_INTEGER_BASE(band, 3buff),
        C_INTEGER_EX(band, 3buff),
    },
    /* Corresponds to MPI_LOR */
    [OMPI_OP_BASE_FORTRAN_LOR] = {
        NULL,
    },
    /* Corresponds to MPI_BOR */
    [OMPI_OP_BASE_FORTRAN_BOR] = {
        C_INTEGER_BASE(bor, 3buff),
        C_INTEGER_EX(bor, 3buff),
    },
    /* Corresponds to MPI_LXOR */
    [OMPI_OP_BASE_FORTRAN_LXOR] = {
        NULL,
    },
    /* Corresponds to MPI_BXOR */
    [OMPI_OP_BASE_FORTRAN_BXOR] = {
        C_INTEGER_BASE(bxor, 3buff),
        C_INTEGER_EX(bxor, 3buff),
    },
    /* Corresponds to MPI_REPLACE */
    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
        /* (MPI_ACCUMULATE is handled differently than the other
           reductions, so just zero out its function implementations
           here to ensure that users don't invoke MPI_REPLACE with any
           reduction operations other than ACCUMULATE) */
        NULL,
    },
};