From 34c794f60e824215a11544e630aed2fea4519908 Mon Sep 17 00:00:00 2001 From: fwaters Date: Mon, 16 Sep 2019 14:28:23 +0000 Subject: [PATCH 18/21] Rewrite inlining --- numpy/core/src/mkl_defs/aligned_alloc.c | 142 ++------------------- numpy/core/src/mkl_defs/aligned_alloc.h | 94 +++++++++++++- numpy/core/src/multiarray/item_selection.c | 2 +- numpy/core/src/umath/loops.c.src | 17 ++- 4 files changed, 115 insertions(+), 140 deletions(-) diff --git a/numpy/core/src/mkl_defs/aligned_alloc.c b/numpy/core/src/mkl_defs/aligned_alloc.c index 81e9e3340..37eb17247 100644 --- a/numpy/core/src/mkl_defs/aligned_alloc.c +++ b/numpy/core/src/mkl_defs/aligned_alloc.c @@ -1,22 +1,6 @@ -#include "mkl.h" -#include -#include -#ifndef Py_PYTHON_H -# include "Python.h" -#endif -#include "numpy/npy_common.h" - -#define ALIGNMENT 64 -#define __THRESHOLD 524288 -#define __UNIT_STRIDE 1 -#define __NULL_STRIDE 0 -#define __8BYTES_ALIGNMENT_OFFSET(ptr) (((size_t) (ptr)) & 0x7) -#define MKL_INT_MAX ((size_t) (~((MKL_UINT) 0) >> 1)) - -#if defined(_MSC_VER) -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) -#endif -static int is_tbb_enabled(void) { +#include "aligned_alloc.h" +#if !defined(_MSC_VER) +int is_tbb_enabled(void) { static int TBB_ENABLED = -1; if (TBB_ENABLED == -1) { char* mkl_threading = getenv("MKL_THREADING_LAYER"); @@ -25,70 +9,9 @@ static int is_tbb_enabled(void) { return TBB_ENABLED; } -static NPY_INLINE void call_dcopy_chunked(size_t size, double* src, double* dest) { - while (size > MKL_INT_MAX) { - cblas_dcopy(MKL_INT_MAX, src , __NULL_STRIDE, dest, __UNIT_STRIDE); - size -= MKL_INT_MAX; - dest += MKL_INT_MAX; - } - if (size > 0) { - if (size >= __THRESHOLD) { - cblas_dcopy(size, src , __NULL_STRIDE, dest, __UNIT_STRIDE); - } else { - memset(dest, 0, size * sizeof(double)); - } - } -} - -void * _aligned_alloc(size_t size) { - /* Only available for Linux and OSX (has been explicitly disabled on Windows : see aligned_alloc.h) - * With Windows, we would run into composability issues with modules like h5py which allocate - * memory using libc functions in another library, like hdf5 for instance - */ - size = (size > 0) ? size : 1; - void* data = NULL; - int ret_code = posix_memalign(&data, ALIGNMENT, size); - if (ret_code == 0) { - return data; - } - return NULL; -} - - -#ifdef WITH_ALIGNED_CALLOC -void * _aligned_calloc(size_t nelem, size_t elsize) -{ - size_t size = nelem * elsize; - void *data = _aligned_alloc(size); - char *memory = NULL; - - if (data != NULL) { - memory = (char*) data; - if((size > __THRESHOLD) && !is_tbb_enabled()) { - size_t offset = __8BYTES_ALIGNMENT_OFFSET(8 - __8BYTES_ALIGNMENT_OFFSET(memory)); - size_t rem_size, ch_size, n_ch = (size - offset) / sizeof(double); - double init = 0; - if (offset) { - memset(memory, 0, offset); - } - - call_dcopy_chunked(n_ch, &init, (double*) (memory+offset)); - - ch_size = offset + n_ch * sizeof(double); - rem_size = size - ch_size; - if(rem_size > 0) { - memset(memory + ch_size, 0, rem_size); - } - } else { - memset(memory, 0, size); - } - } - return data; -} -#endif #if PY_VERSION_HEX >= 0x03040000 -static int is_tracemalloc_enabled(void) { +int is_tracemalloc_enabled(void) { static int TRACEMALLOC_PRESENT = -1; if (TRACEMALLOC_PRESENT == -1) { TRACEMALLOC_PRESENT = (getenv("PYTHONTRACEMALLOC")) ? 1 : 0; @@ -97,53 +20,12 @@ static int is_tracemalloc_enabled(void) { } #endif -void* call_aligned_malloc(size_t size) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - return PyMem_RawMalloc(size); - } else -#endif - { - return _aligned_alloc(size); - } -} +// C99 inlining model +extern void* call_aligned_malloc(size_t); +extern void* _aligned_alloc(size_t); +extern void* call_aligned_calloc(size_t, size_t); +extern void* call_aligned_realloc(void*, size_t); +extern void call_free(void*); +#endif // !defined(_MSC_VER) +static int _prevent_empty_translation_unit; -void* call_aligned_realloc(void* input, size_t size) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - return PyMem_RawRealloc(input, size); - } else -#endif - { - if (input) { - return realloc(input, size ? size : 1); - } - return _aligned_alloc(size); - } -} - -void* call_aligned_calloc(size_t num, size_t size) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - return PyMem_RawCalloc(num, size); - } else -#endif - { -#ifdef WITH_ALIGNED_CALLOC - return _aligned_calloc(num, size); -#else - return calloc(num, size); -#endif - } -} - -void call_free(void* ptr) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - PyMem_RawFree(ptr); - } else -#endif - { - free(ptr); - } -} diff --git a/numpy/core/src/mkl_defs/aligned_alloc.h b/numpy/core/src/mkl_defs/aligned_alloc.h index 931f45474..ba19bfe2f 100644 --- a/numpy/core/src/mkl_defs/aligned_alloc.h +++ b/numpy/core/src/mkl_defs/aligned_alloc.h @@ -1,10 +1,94 @@ +// Not on windows #if !defined(ALIGNED_ALLOC_H) && !defined(_MSC_VER) # define ALIGNED_ALLOC_H -# include - inline void* call_aligned_malloc(size_t); - inline void* call_aligned_realloc(void*, size_t); - inline void* call_aligned_calloc(size_t, size_t); - inline void call_free(void*); + + +#ifndef Py_PYTHON_H +# include "Python.h" +#endif + +#include +#include + +#include "numpy/npy_common.h" + +#define ALIGNMENT 64 +#define __THRESHOLD 524288 +#define __UNIT_STRIDE 1 +#define __NULL_STRIDE 0 +#define __8BYTES_ALIGNMENT_OFFSET(ptr) (((size_t) (ptr)) & 0x7) +#define MKL_INT_MAX ((size_t) (~((MKL_UINT) 0) >> 1)) + +// prototypes +int is_tbb_enabled(void); +#if PY_VERSION_HEX >= 0x03040000 +int is_tracemalloc_enabled(void); +#endif + + +// inlinables + +NPY_INLINE void* _aligned_alloc(size_t size) { + /* Only available for Linux and OSX (has been explicitly disabled on Windows : see aligned_alloc.h) + * With Windows, we would run into composability issues with modules like h5py which allocate + * memory using libc functions in another library, like hdf5 for instance + */ + size = (size > 0) ? size : 1; + void* data = NULL; + int ret_code = posix_memalign(&data, ALIGNMENT, size); + if (ret_code == 0) { + return data; + } + return NULL; +} + +NPY_INLINE void* call_aligned_malloc(size_t size) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + return PyMem_RawMalloc(size); + } else +#endif + { + return _aligned_alloc(size); + } +} + +NPY_INLINE void* call_aligned_realloc(void* input, size_t size) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + return PyMem_RawRealloc(input, size); + } else +#endif + { + if (input) { + return realloc(input, size ? size : 1); + } + return _aligned_alloc(size); + } +} + +NPY_INLINE void* call_aligned_calloc(size_t num, size_t size) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + return PyMem_RawCalloc(num, size); + } else +#endif + { + return calloc(num, size); + } +} + +NPY_INLINE void call_free(void* ptr) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + PyMem_RawFree(ptr); + } else +#endif + { + free(ptr); + } +} + # define PyArray_malloc call_aligned_malloc # define PyArray_free call_free # define PyArray_realloc call_aligned_realloc diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 0ea4380a3..80a841399 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -26,7 +26,7 @@ #include "npy_sort.h" #include "npy_partition.h" #include "npy_binsearch.h" -#include "mkl_cpy.c" +#include "mkl_cpy.h" #include "alloc.h" #define memmove(src, dst, size) call_mkl_mv(src, dst, size, __FILE__, __func__, __LINE__) diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 4fccd71ab..8747318bb 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -42,6 +42,15 @@ #define VML_ASM_THRESHOLD 100000 #define VML_D_THRESHOLD 8000 +// Fix the definition of __assume_aligned for GCC/clang toolchain +#ifndef _MSC_VER +#define __assume_aligned(var, align) (var=(__typeof__(var))\ + __builtin_assume_aligned((void *)var, align)); +#else +// and hide it on windows +#define __assume_aligned(var, align) +#endif + #define MKL_INT_MAX ((npy_intp) ((~((MKL_UINT) 0)) >> 1)) #define CHUNKED_VML_CALL2(vml_func, n, type, in1, op1) \ @@ -3016,7 +3025,7 @@ NPY_NO_EXPORT void const npy_intp blocked_end = npy_blocked_end(peel, sizeof(@type@), vsize, n); npy_intp i; - _Pragma("novector"); + for(i = 0; i < peel; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } @@ -3046,7 +3055,7 @@ NPY_NO_EXPORT void } } - _Pragma("novector"); + for(; i < n; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } @@ -3062,7 +3071,7 @@ NPY_NO_EXPORT void npy_intp i; const @type@ ip1c = ip1[0]; - _Pragma("novector"); + for(i = 0; i < peel; i++) { op1[i] = ip1c @OP@ ip2[i]; } @@ -3081,7 +3090,7 @@ NPY_NO_EXPORT void } } - _Pragma("novector"); + for(; i < n; i++) { op1[i] = ip1c @OP@ ip2[i]; } -- 2.20.1