diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml index eaa45fdbf..ef72c9ac6 100644 --- a/.github/workflows/conda-python-build.yaml +++ b/.github/workflows/conda-python-build.yaml @@ -104,7 +104,7 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: "/tmp/conda-bld-output" - name: Publish conda package if: inputs.upload_to_anaconda diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 4e30a4814..c438ebbb8 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -97,7 +97,9 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 7973ce568..83c78fe8d 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -49,7 +49,7 @@ jobs: env: RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts container: - image: rapidsai/ci-conda:latest + image: rapidsai/ci-conda:cuda13.0.0-ubuntu24.04-py3.13 env: RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} steps: @@ -59,7 +59,9 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + with: + name: conda-repo-py3.13-amd64 + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index da52b2d39..b2b5a2a6a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -22,6 +22,7 @@ jobs: - test-conda-ctypes-binding - test-simulator - build-wheels + - build-wheels-windows - test-wheels-windows - test-wheels - test-wheels-ctypes-binding @@ -108,10 +109,11 @@ jobs: build_type: pull-request script: "ci/build_wheel.sh" matrix: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + build-wheels-windows: + uses: ./.github/workflows/wheel-windows-build.yaml test-wheels-windows: needs: - - build-wheels - - compute-matrix + - build-wheels-windows uses: ./.github/workflows/wheel-windows-tests.yaml test-wheels: needs: @@ -169,4 +171,4 @@ jobs: build_type: pull-request script: "ci/coverage_report.sh" matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.11")) | .[0:1]' + matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.12")) | .[0:1]' diff --git a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index 3202b20e0..fe306bc26 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -86,7 +86,9 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml new file mode 100644 index 000000000..6a861db5c --- /dev/null +++ b/.github/workflows/wheel-windows-build.yaml @@ -0,0 +1,55 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +name: "CI: Build wheel on Windows" + +on: + workflow_call: + inputs: + sha: + type: string + repo: + type: string + script: + type: string + default: "./ci/build_wheel.ps1" + +jobs: + build: + name: ${{ matrix.PY_VER }}, windows + strategy: + fail-fast: false + matrix: + PY_VER: + - "3.10" + - "3.11" + - "3.12" + - "3.13" + runs-on: windows-2022 + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + + - name: Set up Python ${{ matrix.PY_VER }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.PY_VER }} + + - name: Set up MSVC + uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits + + - name: Python build + run: | + ${{ inputs.script }} + env: + CUDA_VER: "12.8.0" + + - name: Upload wheel + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: wheel-py${{ matrix.PY_VER }}-windows + path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 1847ccf7e..766a2f9a5 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -15,24 +15,16 @@ on: default: "./ci/test_wheel.ps1" jobs: - compute-matrix: - runs-on: ubuntu-latest - outputs: - MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} - steps: - - name: Compute Python Test Matrix - id: compute-matrix - run: | - set -eo pipefail - export TEST_MATRIX="{ ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0' }" - MATRIX=$(yq -n -o json 'env(TEST_MATRIX)' | jq -c) - echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" tests: - name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, windows - needs: compute-matrix + name: ${{ matrix.PY_VER }}, windows strategy: fail-fast: false - matrix: ${{ fromJSON('{"ARCH":["amd64"],"PY_VER":["3.12"],"CUDA_VER":["12.8.0"]}') }} + matrix: + PY_VER: + - "3.10" + - "3.11" + - "3.12" + - "3.13" runs-on: "cuda-python-windows-gpu-github" steps: - uses: actions/checkout@v4 @@ -49,7 +41,9 @@ jobs: run: nvidia-smi - uses: actions/download-artifact@v4 - name: wheel + with: + name: wheel-py${{ matrix.PY_VER }}-windows + path: wheel - name: Display structure of downloaded files run: Get-ChildItem -Recurse @@ -63,4 +57,4 @@ jobs: run: | ${{ inputs.script }} env: - CUDA_VER: ${{ matrix.CUDA_VER }} + CUDA_VER: "12.8.0" diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml index f184b9e08..bc3110b70 100644 --- a/.github/workflows/wheels-build.yaml +++ b/.github/workflows/wheels-build.yaml @@ -152,11 +152,11 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: sdist + name: sdist-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.sdist_path }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 415a54489..afa05076f 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -115,7 +115,9 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel + with: + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH 
}} + path: wheel - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/build_wheel.ps1 b/ci/build_wheel.ps1 new file mode 100644 index 000000000..77608ac54 --- /dev/null +++ b/ci/build_wheel.ps1 @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +$ErrorActionPreference = 'Stop' +Set-StrictMode -Version Latest + +function rapids-logger { + param ( + [Parameter(Mandatory=$true)] + [string]$Text + ) + + # Determine padding and box width + $padding = 2 + $boxWidth = $Text.Length + ($padding * 2) + $topBottom = '+' + ('-' * $boxWidth) + '+' + $middle = '|' + (' ' * $padding) + $Text + (' ' * $padding) + '|' + + # Print the box in green + Write-Host $topBottom -ForegroundColor Green + Write-Host $middle -ForegroundColor Green + Write-Host $topBottom -ForegroundColor Green +} + +rapids-logger "Install build package" +python -m pip install build + +rapids-logger "Build sdist and wheel" +python -m build . + +$wheel_path = Resolve-Path dist\numba_cuda*.whl | Select-Object -ExpandProperty Path +echo "Wheel path: $wheel_path" +echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV diff --git a/ci/matrix.yml b/ci/matrix.yml index 6cf2e4dac..f810738c1 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -5,23 +5,30 @@ # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES] # build-matrix: + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } # We test "oldest" dependencies with the oldest supported Python version and # the second-newest Python version. 
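# For example, the coverage job in pr.yaml narrows the test matrix below to
# a single row with the jq filter:
#   map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.12")) | .[0:1]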
test-matrix: - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 7e2b49541..b03309822 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -17,7 +17,6 @@ 
 source:
   path: ../../..

 build:
-  noarch: python
   script:
     - {{ PYTHON }} -m pip install . -vv
@@ -26,6 +25,7 @@ requirements:
     - python
     - pip
     - setuptools
+    - numpy >=2.1.0
   run:
     - python
     - numba >=0.59.1
diff --git a/numba_cuda/numba/cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py
new file mode 100644
index 000000000..79b94ffc4
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
diff --git a/numba_cuda/numba/cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp
new file mode 100644
index 000000000..216612cf8
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp
@@ -0,0 +1,161 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file contains the base class implementation for all device arrays. The
+ * base class is implemented in C so that computing typecodes for device arrays
+ * can be implemented efficiently. */
+
+#include "_pymodule.h"
+
+
+/* Include _devicearray.h, but make sure we don't get the definitions intended
+ * for consumers of the Device Array API.
+ */
+#define NUMBA_IN_DEVICEARRAY_CPP_
+#include "_devicearray.h"
+
+/* DeviceArray PyObject implementation. Note that adding more members here is
+ * presently prohibited because mapped and managed arrays derive from both
+ * DeviceArray and NumPy's ndarray, which is also a C extension class - the
+ * layout of the object cannot be resolved if this class also has members beyond
+ * PyObject_HEAD. */
+class DeviceArray {
+    PyObject_HEAD
+};
+
+/* Trivial traversal - DeviceArray instances own nothing. */
+static int
+DeviceArray_traverse(DeviceArray *self, visitproc visit, void *arg)
+{
+    return 0;
+}
+
+/* Trivial clear of all references - DeviceArray instances own nothing.
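+ * Both slots are still wired up in the type object below because it sets
+ * Py_TPFLAGS_HAVE_GC; the collector invokes them even though there is
+ * nothing to visit or clear.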
*/ +static int +DeviceArray_clear(DeviceArray *self) +{ + return 0; +} + +/* The _devicearray.DeviceArray type */ +PyTypeObject DeviceArrayType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_devicearray.DeviceArray", /* tp_name */ + sizeof(DeviceArray), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, + /* tp_flags*/ + "DeviceArray object", /* tp_doc */ + (traverseproc) DeviceArray_traverse, /* tp_traverse */ + (inquiry) DeviceArray_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + +/* CUDA device array C API */ +static void *_DeviceArray_API[1] = { + (void*)&DeviceArrayType +}; + +MOD_INIT(_devicearray) { + PyObject *m = nullptr; + PyObject *d = nullptr; + PyObject *c_api = nullptr; + int error = 0; + + + MOD_DEF(m, "_devicearray", "No docs", NULL) + if (m == NULL) + goto error_occurred; + + c_api = PyCapsule_New((void *)_DeviceArray_API, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API", NULL); + if (c_api == NULL) + goto error_occurred; + + DeviceArrayType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DeviceArrayType) < 0) + goto error_occurred; + + + Py_INCREF(&DeviceArrayType); + error = PyModule_AddObject(m, "DeviceArray", (PyObject*)(&DeviceArrayType)); + if (error) + goto error_occurred; + + d = PyModule_GetDict(m); + if (d == NULL) + goto error_occurred; + + error = PyDict_SetItemString(d, "_DEVICEARRAY_API", c_api); + /* Decref and set c_api to NULL, Py_XDECREF in error_occurred will have no + * effect. */ + Py_CLEAR(c_api); + + if (error) + goto error_occurred; + + return MOD_SUCCESS_VAL(m); + +error_occurred: + Py_XDECREF(m); + Py_XDECREF(c_api); + Py_XDECREF((PyObject*)&DeviceArrayType); + + return MOD_ERROR_VAL; +} diff --git a/numba_cuda/numba/cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h new file mode 100644 index 000000000..2e9df6cbb --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_devicearray.h @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause
+
+#ifndef NUMBA_DEVICEARRAY_H_
+#define NUMBA_DEVICEARRAY_H_
+
+#ifdef __cplusplus
+    extern "C" {
+#endif
+
+#define NUMBA_DEVICEARRAY_IMPORT_NAME "numba.cuda.cext._devicearray"
+/* These definitions should only be used by consumers of the Device Array API.
+ * Consumers access the API through the opaque pointer stored in
+ * _devicearray._DEVICEARRAY_API. We don't want these definitions in
+ * _devicearray.cpp itself because they would conflict with the actual
+ * implementations there.
+ */
+#ifndef NUMBA_IN_DEVICEARRAY_CPP_
+
+    extern void **DeviceArray_API;
+    #define DeviceArrayType (*(PyTypeObject*)DeviceArray_API[0])
+
+#endif /* ndef NUMBA_IN_DEVICEARRAY_CPP_ */
+
+#ifdef __cplusplus
+    }
+#endif
+
+#endif /* NUMBA_DEVICEARRAY_H_ */
diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp
new file mode 100644
index 000000000..653421f9e
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp
@@ -0,0 +1,1092 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "_pymodule.h"
+
+#include <cstring>
+#include <ctime>
+#include <cassert>
+#include <vector>
+
+#include "_typeof.h"
+#include "frameobject.h"
+#include "traceback.h"
+#include "typeconv.hpp"
+#include "_devicearray.h"
+
+/*
+ * Notes on the C_TRACE macro:
+ *
+ * The original C_TRACE macro (from ceval.c) would call
+ * PyTrace_C_CALL et al., for which the frame argument wouldn't
+ * be usable. Since we explicitly synthesize a frame using the
+ * original Python code object, we call PyTrace_CALL instead so
+ * the profiler can report the correct source location.
+ *
+ * Likewise, while ceval.c would call PyTrace_C_EXCEPTION in case
+ * of error, the profiler would simply expect a RETURN in case of
+ * a Python function, so we generate that here (making sure the
+ * exception state is preserved correctly).
+ *
+ */
+
+#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13))
+
+#ifndef Py_BUILD_CORE
+    #define Py_BUILD_CORE 1
+#endif
+#include "internal/pycore_frame.h"
+// This is a fix suggested in the comments in https://github.com/python/cpython/issues/108216
+// specifically https://github.com/python/cpython/issues/108216#issuecomment-1696565797
+#ifdef HAVE_STD_ATOMIC
+#  undef HAVE_STD_ATOMIC
+#endif
+#undef _PyGC_FINALIZED
+
+/* dynamic_annotations.h is needed for building Python with --with-valgrind
+ * support.
The following include is to workaround issues described in + * https://github.com/numba/numba/pull/10073 + */ +#include "dynamic_annotations.h" +#if (PY_MINOR_VERSION == 12) + #include "internal/pycore_atomic.h" +#endif +#include "internal/pycore_interp.h" +#include "internal/pycore_pyerrors.h" +#include "internal/pycore_instruments.h" +#include "internal/pycore_call.h" +#include "cpython/code.h" + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) +#ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 +#endif +#include "internal/pycore_frame.h" +#include "internal/pycore_pyerrors.h" + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L6804 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) { + return 0; + } + if (frame == NULL) { + return -1; + } + int old_what = tstate->tracing_what; + tstate->tracing_what = what; + PyThreadState_EnterTracing(tstate); + result = func(obj, frame, what, NULL); + PyThreadState_LeaveTracing(tstate); + tstate->tracing_what = old_what; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + _PyErr_Fetch(tstate, &type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, what, arg); + if (err == 0) + { + _PyErr_Restore(tstate, type, value, traceback); + return 0; + } + else { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L7245 + * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call, frame) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_CALL, cfunc)) { \ + x = NULL; \ +} \ +else { \ + x = call; \ + if (tstate->c_profilefunc != NULL) { \ + if (x == NULL) { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } else { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc)) { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} \ + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10 || PY_MINOR_VERSION == 11) + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L36-L40 + */ +typedef struct { + PyCodeObject *code; // The code object for the bounds. May be NULL. + PyCodeAddressRange bounds; // Only valid if code != NULL. + CFrame cframe; +} PyTraceInfo; + + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1257-L1266 + * NOTE: The function is renamed. 
+ */ +static void +_nb_PyLineTable_InitAddressRange(const char *linetable, Py_ssize_t length, int firstlineno, PyCodeAddressRange *range) +{ + range->opaque.lo_next = linetable; + range->opaque.limit = range->opaque.lo_next + length; + range->ar_start = -1; + range->ar_end = 0; + range->opaque.computed_line = firstlineno; + range->ar_line = -1; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1269-L1275 + * NOTE: The function is renamed. + */ +static int +_nb_PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds) +{ + const char *linetable = PyBytes_AS_STRING(co->co_linetable); + Py_ssize_t length = PyBytes_GET_SIZE(co->co_linetable); + _nb_PyLineTable_InitAddressRange(linetable, length, co->co_firstlineno, bounds); + return bounds->ar_line; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5468-L5475 + * NOTE: The call to _PyCode_InitAddressRange is renamed. + */ +static void +initialize_trace_info(PyTraceInfo *trace_info, PyFrameObject *frame) +{ + if (trace_info->code != frame->f_code) { + trace_info->code = frame->f_code; + _nb_PyCode_InitAddressRange(frame->f_code, &trace_info->bounds); + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5477-L5501 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->cframe->use_tracing = 0; + if (frame->f_lasti < 0) { + frame->f_lineno = frame->f_code->co_firstlineno; + } + else { + initialize_trace_info(trace_info, frame); + frame->f_lineno = _PyCode_CheckLineNumber(frame->f_lasti*sizeof(_Py_CODEUNIT), &trace_info->bounds); + } + result = func(obj, frame, what, arg); + frame->f_lineno = 0; + tstate->cframe->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5445-L5466 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, trace_info, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5810-L5839 + * NOTE: The state test https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5811 + * has been removed, it's dealt with in call_cfunc. 
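+ * The macro brackets the wrapped call with PyTrace_CALL and PyTrace_RETURN
+ * events for the active C profiler, mirroring what the interpreter does for
+ * Python-level frames (see the notes on C_TRACE at the top of this file).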
+ */
+#define C_TRACE(x, call) \
+if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \
+               tstate, tstate->frame, &trace_info, PyTrace_CALL, \
+               cfunc)) \
+    x = NULL; \
+else \
+{ \
+    x = call; \
+    if (tstate->c_profilefunc != NULL) \
+    { \
+        if (x == NULL) \
+        { \
+            call_trace_protected(tstate->c_profilefunc, \
+                                 tstate->c_profileobj, \
+                                 tstate, tstate->frame, \
+                                 &trace_info, \
+                                 PyTrace_RETURN, cfunc); \
+            /* XXX should pass (type, value, tb) */ \
+        } \
+        else \
+        { \
+            if (call_trace(tstate->c_profilefunc, \
+                           tstate->c_profileobj, \
+                           tstate, tstate->frame, \
+                           &trace_info, \
+                           PyTrace_RETURN, cfunc)) \
+            { \
+                Py_DECREF(x); \
+                x = NULL; \
+            } \
+        } \
+    } \
+}
+
+#else // Python <3.10
+
+/*
+ * Code originally from:
+ * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4242-L4257
+ */
+static int
+call_trace(Py_tracefunc func, PyObject *obj,
+           PyThreadState *tstate, PyFrameObject *frame,
+           int what, PyObject *arg)
+{
+    int result;
+    if (tstate->tracing)
+        return 0;
+    tstate->tracing++;
+    tstate->use_tracing = 0;
+    result = func(obj, frame, what, arg);
+    tstate->use_tracing = ((tstate->c_tracefunc != NULL)
+                           || (tstate->c_profilefunc != NULL));
+    tstate->tracing--;
+    return result;
+}
+
+/*
+ * Code originally from:
+ * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240
+ */
+static int
+call_trace_protected(Py_tracefunc func, PyObject *obj,
+                     PyThreadState *tstate, PyFrameObject *frame,
+                     int what, PyObject *arg)
+{
+    PyObject *type, *value, *traceback;
+    int err;
+    PyErr_Fetch(&type, &value, &traceback);
+    err = call_trace(func, obj, tstate, frame, what, arg);
+    if (err == 0)
+    {
+        PyErr_Restore(type, value, traceback);
+        return 0;
+    }
+    else
+    {
+        Py_XDECREF(type);
+        Py_XDECREF(value);
+        Py_XDECREF(traceback);
+        return -1;
+    }
+}
+
+/*
+ * Code originally from:
+ * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4520-L4549
+ * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521
+ * has been removed, it's dealt with in call_cfunc.
+ */
+#define C_TRACE(x, call) \
+if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \
+               tstate, tstate->frame, PyTrace_CALL, cfunc)) \
+    x = NULL; \
+else \
+{ \
+    x = call; \
+    if (tstate->c_profilefunc != NULL) \
+    { \
+        if (x == NULL) \
+        { \
+            call_trace_protected(tstate->c_profilefunc, \
+                                 tstate->c_profileobj, \
+                                 tstate, tstate->frame, \
+                                 PyTrace_RETURN, cfunc); \
+            /* XXX should pass (type, value, tb) */ \
+        } \
+        else \
+        { \
+            if (call_trace(tstate->c_profilefunc, \
+                           tstate->c_profileobj, \
+                           tstate, tstate->frame, \
+                           PyTrace_RETURN, cfunc)) \
+            { \
+                Py_DECREF(x); \
+                x = NULL; \
+            } \
+        } \
+    } \
+}
+
+
+#endif
+
+typedef std::vector<Type> TypeTable;
+typedef std::vector<PyObject*> Functions;
+
+/* The Dispatcher class is the base class of all dispatchers in the CUDA target.
+   Its main responsibilities are:
+
+   - Resolving the best overload to call for a given set of arguments, and
+   - Calling the resolved overload.
+
+   This logic is implemented within this class for efficiency (lookup of the
+   appropriate overload needs to be fast) and ease of implementation (calling
+   directly into a compiled function using a function pointer is easier within
+   the C++ code where the overload has been resolved).
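+
+   As a rough sketch, the Python side is assumed to drive this type roughly
+   as follows (the method names are those registered in Dispatcher_methods
+   below; the exact call sites live in the Python-side Dispatcher subclass,
+   so this is illustrative rather than definitive):
+
+       disp._insert(sig, kernel, cuda=True)   # register an overload
+       defn = disp._cuda_call(*args)          # resolve the overload for args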
*/
+class Dispatcher {
+public:
+    PyObject_HEAD
+    /* Whether compilation of new overloads is permitted */
+    char can_compile;
+    /* Whether fallback to object mode is permitted */
+    char can_fallback;
+    /* Whether types must match exactly when resolving overloads.
+       If not, conversions (e.g. float32 -> float64) are permitted when
+       searching for a match. */
+    char exact_match_required;
+    /* Borrowed reference */
+    PyObject *fallbackdef;
+    /* Whether to fold named arguments and default values
+       (false for lifted loops) */
+    int fold_args;
+    /* Whether the last positional argument is a stararg */
+    int has_stararg;
+    /* Tuple of argument names */
+    PyObject *argnames;
+    /* Tuple of default values */
+    PyObject *defargs;
+    /* Number of arguments to function */
+    int argct;
+    /* Used for selecting overloaded function implementations */
+    TypeManager *tm;
+    /* An array of overloads */
+    Functions functions;
+    /* A flattened array of argument types to all overloads
+     * (invariant: sizeof(overloads) == argct * sizeof(functions)) */
+    TypeTable overloads;
+
+    /* Add a new overload. Parameters:
+
+       - args: An array of Type objects, one for each parameter
+       - callable: The callable implementing this overload. */
+    void addDefinition(Type args[], PyObject *callable) {
+        overloads.reserve(argct + overloads.size());
+        for (int i=0; i<argct; ++i) {
+            overloads.push_back(args[i]);
+        }
+        functions.push_back(callable);
+    }
+
+    /* Find the overload that best matches the given signature. Parameters:
+
+       - sig: An array of integer typecodes, one per argument
+       - matches: Output parameter; set to the number of matching overloads
+       - allow_unsafe: Whether unsafe type conversions may be considered
+       - exact_match_required: Whether only exact type matches are accepted
+
+       Returns the matching overload if there is exactly one match, NULL
+       otherwise. */
+    PyObject* resolve(int sig[], int &matches, bool allow_unsafe,
+                      bool exact_match_required) const {
+        const int ovct = functions.size();
+        int selected;
+        matches = 0;
+        if (0 == ovct) {
+            /* No overloads registered */
+            return NULL;
+        }
+        if (argct == 0) {
+            /* Nullary function: trivial match on first overload */
+            matches = 1;
+            selected = 0;
+        }
+        else {
+            matches = tm->selectOverload(sig, &overloads[0], selected, argct,
+                                         ovct, allow_unsafe,
+                                         exact_match_required);
+        }
+        if (matches == 1) {
+            return functions[selected];
+        }
+        return NULL;
+    }
+
+    /* Remove all overloads */
+    void clear() {
+        functions.clear();
+        overloads.clear();
+    }
+
+};
+
+
+static int
+Dispatcher_traverse(Dispatcher *self, visitproc visit, void *arg)
+{
+    Py_VISIT(self->defargs);
+    return 0;
+}
+
+static void
+Dispatcher_dealloc(Dispatcher *self)
+{
+    Py_XDECREF(self->argnames);
+    Py_XDECREF(self->defargs);
+    self->clear();
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+
+static int
+Dispatcher_init(Dispatcher *self, PyObject *args, PyObject *kwds)
+{
+    PyObject *tmaddrobj;
+    void *tmaddr;
+    int argct;
+    int can_fallback;
+    int has_stararg = 0;
+    int exact_match_required = 0;
+
+    if (!PyArg_ParseTuple(args, "OiiO!O!i|ii", &tmaddrobj, &argct,
+                          &self->fold_args,
+                          &PyTuple_Type, &self->argnames,
+                          &PyTuple_Type, &self->defargs,
+                          &can_fallback,
+                          &has_stararg,
+                          &exact_match_required
+                          )) {
+        return -1;
+    }
+    Py_INCREF(self->argnames);
+    Py_INCREF(self->defargs);
+    tmaddr = PyLong_AsVoidPtr(tmaddrobj);
+    self->tm = static_cast<TypeManager*>(tmaddr);
+    self->argct = argct;
+    self->can_compile = 1;
+    self->can_fallback = can_fallback;
+    self->fallbackdef = NULL;
+    self->has_stararg = has_stararg;
+    self->exact_match_required = exact_match_required;
+    return 0;
+}
+
+static PyObject *
+Dispatcher_clear(Dispatcher *self, PyObject *args)
+{
+    self->clear();
+    Py_RETURN_NONE;
+}
+
+static
+PyObject*
+Dispatcher_Insert(Dispatcher *self, PyObject *args, PyObject *kwds)
+{
+    /* The cuda kwarg is a temporary addition until CUDA overloads are compiled
+     * functions. Once they are compiled functions, kwargs can be removed from
+     * this function.
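+     * The sig argument is a sequence of integer typecodes, one per
+     * parameter; each element is converted with PyLong_AsLong before being
+     * handed to addDefinition().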
*/ + static char *keywords[] = { + (char*)"sig", + (char*)"func", + (char*)"objectmode", + (char*)"cuda", + NULL + }; + + PyObject *sigtup, *cfunc; + int i, sigsz; + int *sig; + int objectmode = 0; + int cuda = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|ip", keywords, &sigtup, + &cfunc, &objectmode, &cuda)) { + return NULL; + } + + if (!cuda && !PyObject_TypeCheck(cfunc, &PyCFunction_Type) ) { + PyErr_SetString(PyExc_TypeError, "must be builtin_function_or_method"); + return NULL; + } + + sigsz = PySequence_Fast_GET_SIZE(sigtup); + sig = new int[sigsz]; + + for (i = 0; i < sigsz; ++i) { + sig[i] = PyLong_AsLong(PySequence_Fast_GET_ITEM(sigtup, i)); + } + + /* The reference to cfunc is borrowed; this only works because the + derived Python class also stores an (owned) reference to cfunc. */ + self->addDefinition(sig, cfunc); + + /* Add pure python fallback */ + if (!self->fallbackdef && objectmode){ + self->fallbackdef = cfunc; + } + + delete[] sig; + + Py_RETURN_NONE; +} + +static +void explain_issue(PyObject *dispatcher, PyObject *args, PyObject *kws, + const char *method_name, const char *default_msg) +{ + PyObject *callback, *result; + callback = PyObject_GetAttrString(dispatcher, method_name); + if (!callback) { + PyErr_SetString(PyExc_TypeError, default_msg); + return; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result != NULL) { + PyErr_Format(PyExc_RuntimeError, "%s must raise an exception", + method_name); + Py_DECREF(result); + } +} + +static +void explain_ambiguous(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_ambiguous", + "Ambiguous overloading"); +} + +static +void explain_matching_error(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_matching_error", + "No matching definition"); +} + +static +int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + PyObject *callback, *result; + int res; + + callback = PyObject_GetAttrString(dispatcher, + "_search_new_conversions"); + if (!callback) { + return -1; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result == NULL) { + return -1; + } + if (!PyBool_Check(result)) { + Py_DECREF(result); + PyErr_SetString(PyExc_TypeError, + "_search_new_conversions() should return a boolean"); + return -1; + } + res = (result == Py_True) ? 1 : 0; + Py_DECREF(result); + return res; +} + + +/* A custom, fast, inlinable version of PyCFunction_Call() */ +static PyObject * +call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) +{ + PyCFunctionWithKeywords fn = NULL; + PyObject * pyresult = NULL; + + assert(PyCFunction_Check(cfunc)); + assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); + fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); + + // make call + pyresult = fn(PyCFunction_GET_SELF(cfunc), args, kws); + + return pyresult; +} + + + + +/* A copy of compile_and_invoke, that only compiles. This is needed for CUDA + * kernels, because its overloads are Python instances of the _Kernel class, + * rather than compiled functions. Once CUDA overloads are compiled functions, + * cuda_compile_only can be removed. 
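+ * It simply defers to the _compile_for_args method defined on the
+ * Python-side Dispatcher subclass and returns whatever that call produces.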
*/ +static +PyObject* +cuda_compile_only(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + return cfunc; +} + +static int +find_named_args(Dispatcher *self, PyObject **pargs, PyObject **pkws) +{ + PyObject *oldargs = *pargs, *newargs; + PyObject *kws = *pkws; + Py_ssize_t pos_args = PyTuple_GET_SIZE(oldargs); + Py_ssize_t named_args, total_args, i; + Py_ssize_t func_args = PyTuple_GET_SIZE(self->argnames); + Py_ssize_t defaults = PyTuple_GET_SIZE(self->defargs); + /* Last parameter with a default value */ + Py_ssize_t last_def = (self->has_stararg) + ? func_args - 2 + : func_args - 1; + /* First parameter with a default value */ + Py_ssize_t first_def = last_def - defaults + 1; + /* Minimum number of required arguments */ + Py_ssize_t minargs = first_def; + + if (kws != NULL) + named_args = PyDict_Size(kws); + else + named_args = 0; + total_args = pos_args + named_args; + if (!self->has_stararg && total_args > func_args) { + PyErr_Format(PyExc_TypeError, + "too many arguments: expected %d, got %d", + (int) func_args, (int) total_args); + return -1; + } + else if (total_args < minargs) { + if (minargs == func_args) + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected %d, got %d", + (int) minargs, (int) total_args); + else + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected at least %d, got %d", + (int) minargs, (int) total_args); + return -1; + } + newargs = PyTuple_New(func_args); + if (!newargs) + return -1; + /* First pack the stararg */ + if (self->has_stararg) { + Py_ssize_t stararg_size = Py_MAX(0, pos_args - func_args + 1); + PyObject *stararg = PyTuple_New(stararg_size); + if (!stararg) { + Py_DECREF(newargs); + return -1; + } + for (i = 0; i < stararg_size; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, func_args - 1 + i); + Py_INCREF(value); + PyTuple_SET_ITEM(stararg, i, value); + } + /* Put it in last position */ + PyTuple_SET_ITEM(newargs, func_args - 1, stararg); + + } + for (i = 0; i < pos_args; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + } + + /* Iterate over missing positional arguments, try to find them in + named arguments or default values. */ + for (i = pos_args; i < func_args; i++) { + PyObject *name = PyTuple_GET_ITEM(self->argnames, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + if (kws != NULL) { + /* Named argument? */ + PyObject *value = PyDict_GetItem(kws, name); + if (value != NULL) { + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + named_args--; + continue; + } + } + if (i >= first_def && i <= last_def) { + /* Argument has a default value? 
*/ + PyObject *value = PyTuple_GET_ITEM(self->defargs, i - first_def); + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + continue; + } + else if (i < func_args - 1 || !self->has_stararg) { + PyErr_Format(PyExc_TypeError, + "missing argument '%s'", + PyString_AsString(name)); + Py_DECREF(newargs); + return -1; + } + } + if (named_args) { + PyErr_Format(PyExc_TypeError, + "some keyword arguments unexpected"); + Py_DECREF(newargs); + return -1; + } + *pargs = newargs; + *pkws = NULL; + return 0; +} + + + +/* Based on Dispatcher_call above, with the following differences: + 1. It does not invoke the definition of the function. + 2. It returns the definition, instead of a value returned by the function. + + This is because CUDA functions are, at present, _Kernel objects rather than + compiled functions. */ +static PyObject* +Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + goto CLEANUP; + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager. 
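+         * search_new_conversions() calls back into the Python-level
+         * _search_new_conversions() hook and reports whether any new
+         * conversions were registered; if so, overload resolution is
+         * retried below.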
+ */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + + if (matches == 1) { + /* Definition is found */ + retval = cfunc; + Py_INCREF(retval); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = cuda_compile_only(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = cuda_compile_only(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +static int +import_devicearray(void) +{ + PyObject *devicearray = PyImport_ImportModule(NUMBA_DEVICEARRAY_IMPORT_NAME); + if (devicearray == NULL) { + return -1; + } + + PyObject *d = PyModule_GetDict(devicearray); + if (d == NULL) { + Py_DECREF(devicearray); + return -1; + } + + PyObject *c_api = PyDict_GetItemString(d, "_DEVICEARRAY_API"); + if (PyCapsule_IsValid(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API")) { + DeviceArray_API = (void**)PyCapsule_GetPointer(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API"); + Py_DECREF(devicearray); + return 0; + } else { + Py_DECREF(devicearray); + return -1; + } +} + +static PyMethodDef Dispatcher_methods[] = { + { "_clear", (PyCFunction)Dispatcher_clear, METH_NOARGS, NULL }, + { "_insert", (PyCFunction)Dispatcher_Insert, METH_VARARGS | METH_KEYWORDS, + "insert new definition"}, + { "_cuda_call", (PyCFunction)Dispatcher_cuda_call, + METH_VARARGS | METH_KEYWORDS, "CUDA call resolution" }, + { NULL }, +}; + +static PyMemberDef Dispatcher_members[] = { + {(char*)"_can_compile", T_BOOL, offsetof(Dispatcher, can_compile), 0, NULL }, + {NULL} /* Sentinel */ +}; + + +static PyTypeObject DispatcherType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_dispatcher.Dispatcher", /* tp_name */ + sizeof(Dispatcher), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Dispatcher_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags*/ + "Dispatcher object", /* tp_doc */ + (traverseproc) Dispatcher_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Dispatcher_methods, /* tp_methods */ + Dispatcher_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Dispatcher_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* 
tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + + + + +static PyObject *compute_fingerprint(PyObject *self, PyObject *args) +{ + PyObject *val; + if (!PyArg_ParseTuple(args, "O:compute_fingerprint", &val)) + return NULL; + return typeof_compute_fingerprint(val); +} + +static PyMethodDef ext_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(typeof_init), + declmethod(compute_fingerprint), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(_dispatcher) { + if (import_devicearray() < 0) { + PyErr_Print(); + PyErr_SetString(PyExc_ImportError, NUMBA_DEVICEARRAY_IMPORT_NAME " failed to import"); + return MOD_ERROR_VAL; + } + + PyObject *m; + MOD_DEF(m, "_dispatcher", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + + DispatcherType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DispatcherType) < 0) { + return MOD_ERROR_VAL; + } + Py_INCREF(&DispatcherType); + PyModule_AddObject(m, "Dispatcher", (PyObject*)(&DispatcherType)); + + return MOD_SUCCESS_VAL(m); +} diff --git a/numba_cuda/numba/cuda/cext/_hashtable.cpp b/numba_cuda/numba/cuda/cext/_hashtable.cpp new file mode 100644 index 000000000..7d5dda962 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_hashtable.cpp @@ -0,0 +1,532 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +/* + * This file and _hashtable.h are from CPython 3.5. The symbols have been + * renamed from _Py_hashxxx to _Numba_hashxxx to avoid name clashes with + * the CPython definitions (including at runtime through dynamic linking). + * Those CPython APIs are private and can change in incompatible ways at + * any time. + * + * Command line used for renaming: + * $ sed -i -r 's/\b_Py_(has[h]table)/_Numba_\1/ig' numba/_hashtable.h numba/_hashtable.c + */ + +/* The implementation of the hash table (_Numba_hashtable_t) is based on the cfuhash + project: + http://sourceforge.net/projects/libcfu/ + + Copyright of cfuhash: + ---------------------------------- + Creation date: 2005-06-24 21:22:40 + Authors: Don + Change log: + + Copyright (c) 2005 Don Owens + All rights reserved. + + This code is released under the BSD license: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the author nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. + ---------------------------------- +*/ + +#include "_pymodule.h" +#include "_hashtable.h" + +#define HASHTABLE_MIN_SIZE 16 +#define HASHTABLE_HIGH 0.50 +#define HASHTABLE_LOW 0.10 +#define HASHTABLE_REHASH_FACTOR 2.0 / (HASHTABLE_LOW + HASHTABLE_HIGH) + +#define BUCKETS_HEAD(SLIST) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(SLIST))) +#define TABLE_HEAD(HT, BUCKET) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(HT)->buckets[BUCKET])) +#define ENTRY_NEXT(ENTRY) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_ITEM_NEXT(ENTRY)) +#define HASHTABLE_ITEM_SIZE(HT) \ + (sizeof(_Numba_hashtable_entry_t) + (HT)->data_size) + +/* Forward declaration */ +static void hashtable_rehash(_Numba_hashtable_t *ht); + +static void +_Py_slist_init(_Py_slist_t *list) +{ + list->head = NULL; +} + +static void +_Py_slist_prepend(_Py_slist_t *list, _Py_slist_item_t *item) +{ + item->next = list->head; + list->head = item; +} + +static void +_Py_slist_remove(_Py_slist_t *list, _Py_slist_item_t *previous, + _Py_slist_item_t *item) +{ + if (previous != NULL) + previous->next = item->next; + else + list->head = item->next; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_int(const void *key) +{ + return (Py_uhash_t)key; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_ptr(const void *key) +{ + return (Py_uhash_t)_Py_HashPointer((void *)key); +} + +extern "C" int +_Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry) +{ + return entry->key == key; +} + +/* makes sure the real size of the buckets array is a power of 2 */ +static size_t +round_size(size_t s) +{ + size_t i; + if (s < HASHTABLE_MIN_SIZE) + return HASHTABLE_MIN_SIZE; + i = 1; + while (i < s) + i <<= 1; + return i; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new_full(size_t data_size, size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator) +{ + _Numba_hashtable_t *ht; + size_t buckets_size; + _Numba_hashtable_allocator_t alloc; + + if (allocator == NULL) { + alloc.malloc = PyMem_RawMalloc; + alloc.free = PyMem_RawFree; + } + else + alloc = *allocator; + + ht = (_Numba_hashtable_t *)alloc.malloc(sizeof(_Numba_hashtable_t)); + if (ht == NULL) + return ht; + + ht->num_buckets = round_size(init_size); + ht->entries = 0; + ht->data_size = data_size; + + buckets_size = ht->num_buckets * sizeof(ht->buckets[0]); + ht->buckets = (_Py_slist_t *) alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + alloc.free(ht); + return NULL; + } + memset(ht->buckets, 0, buckets_size); + + ht->hash_func = 
hash_func; + ht->compare_func = compare_func; + ht->copy_data_func = copy_data_func; + ht->free_data_func = free_data_func; + ht->get_data_size_func = get_data_size_func; + ht->alloc = alloc; + return ht; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new(size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func) +{ + return _Numba_hashtable_new_full(data_size, HASHTABLE_MIN_SIZE, + hash_func, compare_func, + NULL, NULL, NULL, NULL); +} + +extern "C" size_t +_Numba_hashtable_size(_Numba_hashtable_t *ht) +{ + size_t size; + size_t hv; + + size = sizeof(_Numba_hashtable_t); + + /* buckets */ + size += ht->num_buckets * sizeof(_Numba_hashtable_entry_t *); + + /* entries */ + size += ht->entries * HASHTABLE_ITEM_SIZE(ht); + + /* data linked from entries */ + if (ht->get_data_size_func) { + for (hv = 0; hv < ht->num_buckets; hv++) { + _Numba_hashtable_entry_t *entry; + + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + void *data; + + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + size += ht->get_data_size_func(data); + } + } + } + return size; +} + +#ifdef Py_DEBUG +extern "C" void +_Numba_hashtable_print_stats(_Numba_hashtable_t *ht) +{ + size_t size; + size_t chain_len, max_chain_len, total_chain_len, nchains; + _Numba_hashtable_entry_t *entry; + size_t hv; + double load; + + size = _Numba_hashtable_size(ht); + + load = (double)ht->entries / ht->num_buckets; + + max_chain_len = 0; + total_chain_len = 0; + nchains = 0; + for (hv = 0; hv < ht->num_buckets; hv++) { + entry = TABLE_HEAD(ht, hv); + if (entry != NULL) { + chain_len = 0; + for (; entry; entry = ENTRY_NEXT(entry)) { + chain_len++; + } + if (chain_len > max_chain_len) + max_chain_len = chain_len; + total_chain_len += chain_len; + nchains++; + } + } + printf("hash table %p: entries=%" + PY_FORMAT_SIZE_T "u/%" PY_FORMAT_SIZE_T "u (%.0f%%), ", + ht, ht->entries, ht->num_buckets, load * 100.0); + if (nchains) + printf("avg_chain_len=%.1f, ", (double)total_chain_len / nchains); + printf("max_chain_len=%" PY_FORMAT_SIZE_T "u, %" PY_FORMAT_SIZE_T "u kB\n", + max_chain_len, size / 1024); +} +#endif + +/* Get an entry. Return NULL if the key does not exist. 
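+ * The bucket index is computed as key_hash & (num_buckets - 1), which is
+ * valid because round_size() keeps the bucket count a power of two.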
*/ +extern "C" _Numba_hashtable_entry_t * +_Numba_hashtable_get_entry(_Numba_hashtable_t *ht, const void *key) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + } + + return entry; +} + +static int +_hashtable_pop_entry(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry, *previous; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + previous = NULL; + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + previous = entry; + } + + if (entry == NULL) + return 0; + + _Py_slist_remove(&ht->buckets[index], (_Py_slist_item_t *)previous, + (_Py_slist_item_t *)entry); + ht->entries--; + + if (data != NULL) + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + ht->alloc.free(entry); + + if ((float)ht->entries / (float)ht->num_buckets < HASHTABLE_LOW) + hashtable_rehash(ht); + return 1; +} + +/* Add a new entry to the hash. The key must not be present in the hash table. + Return 0 on success, -1 on memory error. */ +extern "C" int +_Numba_hashtable_set(_Numba_hashtable_t *ht, const void *key, + void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + assert(data != NULL || data_size == 0); +#ifndef NDEBUG + /* Don't write the assertion on a single line because it is interesting + to know the duplicated entry if the assertion failed. The entry can + be read using a debugger. */ + entry = _Numba_hashtable_get_entry(ht, key); + assert(entry == NULL); +#endif + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + entry = (_Numba_hashtable_entry_t *) ht->alloc.malloc(HASHTABLE_ITEM_SIZE(ht)); + if (entry == NULL) { + /* memory allocation failed */ + return -1; + } + + entry->key = (void *)key; + entry->key_hash = key_hash; + + assert(data_size == ht->data_size); + memcpy(_Numba_HASHTABLE_ENTRY_DATA(entry), data, data_size); + + _Py_slist_prepend(&ht->buckets[index], (_Py_slist_item_t*)entry); + ht->entries++; + + if ((float)ht->entries / (float)ht->num_buckets > HASHTABLE_HIGH) + hashtable_rehash(ht); + return 0; +} + +/* Get data from an entry. Copy entry data into data and return 1 if the entry + exists, return 0 if the entry does not exist. */ +extern "C" int +_Numba_hashtable_get(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + _Numba_hashtable_entry_t *entry; + + assert(data != NULL); + + entry = _Numba_hashtable_get_entry(ht, key); + if (entry == NULL) + return 0; + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + return 1; +} + +extern "C" int +_Numba_hashtable_pop(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + assert(data != NULL); + assert(ht->free_data_func == NULL); + return _hashtable_pop_entry(ht, key, data, data_size); +} + +/* Delete an entry. The entry must exist. 
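+ * In debug builds the removal is asserted to have found the key; in release
+ * builds a missing key is silently ignored.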
*/ +extern "C" void +_Numba_hashtable_delete(_Numba_hashtable_t *ht, const void *key) +{ +#ifndef NDEBUG + int found = _hashtable_pop_entry(ht, key, NULL, 0); + assert(found); +#else + (void)_hashtable_pop_entry(ht, key, NULL, 0); +#endif +} + +/* Prototype for a pointer to a function to be called foreach + key/value pair in the hash by hashtable_foreach(). Iteration + stops if a non-zero value is returned. */ +extern "C" int +_Numba_hashtable_foreach(_Numba_hashtable_t *ht, + int (*func) (_Numba_hashtable_entry_t *entry, void *arg), + void *arg) +{ + _Numba_hashtable_entry_t *entry; + size_t hv; + + for (hv = 0; hv < ht->num_buckets; hv++) { + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + int res = func(entry, arg); + if (res) + return res; + } + } + return 0; +} + +static void +hashtable_rehash(_Numba_hashtable_t *ht) +{ + size_t buckets_size, new_size, bucket; + _Py_slist_t *old_buckets = NULL; + size_t old_num_buckets; + + new_size = round_size((size_t)(ht->entries * HASHTABLE_REHASH_FACTOR)); + if (new_size == ht->num_buckets) + return; + + old_num_buckets = ht->num_buckets; + + buckets_size = new_size * sizeof(ht->buckets[0]); + old_buckets = ht->buckets; + ht->buckets = (_Py_slist_t *) ht->alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + /* cancel rehash on memory allocation failure */ + ht->buckets = old_buckets ; + /* memory allocation failed */ + return; + } + memset(ht->buckets, 0, buckets_size); + + ht->num_buckets = new_size; + + for (bucket = 0; bucket < old_num_buckets; bucket++) { + _Numba_hashtable_entry_t *entry, *next; + for (entry = BUCKETS_HEAD(old_buckets[bucket]); entry != NULL; entry = next) { + size_t entry_index; + + assert(ht->hash_func(entry->key) == entry->key_hash); + next = ENTRY_NEXT(entry); + entry_index = entry->key_hash & (new_size - 1); + + _Py_slist_prepend(&ht->buckets[entry_index], (_Py_slist_item_t*)entry); + } + } + + ht->alloc.free(old_buckets); +} + +extern "C" void +_Numba_hashtable_clear(_Numba_hashtable_t *ht) +{ + _Numba_hashtable_entry_t *entry, *next; + size_t i; + + for (i=0; i < ht->num_buckets; i++) { + for (entry = TABLE_HEAD(ht, i); entry != NULL; entry = next) { + next = ENTRY_NEXT(entry); + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + } + _Py_slist_init(&ht->buckets[i]); + } + ht->entries = 0; + hashtable_rehash(ht); +} + +extern "C" void +_Numba_hashtable_destroy(_Numba_hashtable_t *ht) +{ + size_t i; + + for (i = 0; i < ht->num_buckets; i++) { + _Py_slist_item_t *entry = ht->buckets[i].head; + while (entry) { + _Py_slist_item_t *entry_next = entry->next; + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + entry = entry_next; + } + } + + ht->alloc.free(ht->buckets); + ht->alloc.free(ht); +} + +/* Return a copy of the hash table */ +extern "C" _Numba_hashtable_t * +_Numba_hashtable_copy(_Numba_hashtable_t *src) +{ + _Numba_hashtable_t *dst; + _Numba_hashtable_entry_t *entry; + size_t bucket; + int err; + void *data, *new_data; + + dst = _Numba_hashtable_new_full(src->data_size, src->num_buckets, + src->hash_func, src->compare_func, + src->copy_data_func, src->free_data_func, + src->get_data_size_func, &src->alloc); + if (dst == NULL) + return NULL; + + for (bucket=0; bucket < src->num_buckets; bucket++) { + entry = TABLE_HEAD(src, bucket); + for (; entry; entry = ENTRY_NEXT(entry)) { + if (src->copy_data_func) { + data = 
_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + new_data = src->copy_data_func(data); + if (new_data != NULL) + err = _Numba_hashtable_set(dst, entry->key, + &new_data, src->data_size); + else + err = 1; + } + else { + data = _Numba_HASHTABLE_ENTRY_DATA(entry); + err = _Numba_hashtable_set(dst, entry->key, data, src->data_size); + } + if (err) { + _Numba_hashtable_destroy(dst); + return NULL; + } + } + } + return dst; +} diff --git a/numba_cuda/numba/cuda/cext/_hashtable.h b/numba_cuda/numba/cuda/cext/_hashtable.h new file mode 100644 index 000000000..9fb719472 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_hashtable.h @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +/* + * See _hashtable.c for more information about this file. + */ + +#ifndef Py_HASHTABLE_H +#define Py_HASHTABLE_H + +/* The whole API is private */ +#ifndef Py_LIMITED_API + +typedef struct _Py_slist_item_s { + struct _Py_slist_item_s *next; +} _Py_slist_item_t; + +typedef struct { + _Py_slist_item_t *head; +} _Py_slist_t; + +#define _Py_SLIST_ITEM_NEXT(ITEM) (((_Py_slist_item_t *)ITEM)->next) + +#define _Py_SLIST_HEAD(SLIST) (((_Py_slist_t *)SLIST)->head) + +typedef struct { + /* used by _Numba_hashtable_t.buckets to link entries */ + _Py_slist_item_t _Py_slist_item; + + const void *key; + Py_uhash_t key_hash; + + /* data follows */ +} _Numba_hashtable_entry_t; + +#define _Numba_HASHTABLE_ENTRY_DATA(ENTRY) \ + ((char *)(ENTRY) + sizeof(_Numba_hashtable_entry_t)) + +#define _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(ENTRY) \ + (*(void **)_Numba_HASHTABLE_ENTRY_DATA(ENTRY)) + +#define _Numba_HASHTABLE_ENTRY_READ_DATA(TABLE, DATA, DATA_SIZE, ENTRY) \ + do { \ + assert((DATA_SIZE) == (TABLE)->data_size); \ + memcpy(DATA, _Numba_HASHTABLE_ENTRY_DATA(ENTRY), DATA_SIZE); \ + } while (0) + +typedef Py_uhash_t (*_Numba_hashtable_hash_func) (const void *key); +typedef int (*_Numba_hashtable_compare_func) (const void *key, const _Numba_hashtable_entry_t *he); +typedef void* (*_Numba_hashtable_copy_data_func)(void *data); +typedef void (*_Numba_hashtable_free_data_func)(void *data); +typedef size_t (*_Numba_hashtable_get_data_size_func)(void *data); + +typedef struct { + /* allocate a memory block */ + void* (*malloc) (size_t size); + + /* release a memory block */ + void (*free) (void *ptr); +} _Numba_hashtable_allocator_t; + +typedef struct { + size_t num_buckets; + size_t entries; /* Total number of entries in the table. 
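+                                  Kept in sync by set/pop/clear; the
+                                  rehash heuristics in _hashtable.c
+                                  compare it against num_buckets to
+                                  bound the load factor.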
*/ + _Py_slist_t *buckets; + size_t data_size; + + _Numba_hashtable_hash_func hash_func; + _Numba_hashtable_compare_func compare_func; + _Numba_hashtable_copy_data_func copy_data_func; + _Numba_hashtable_free_data_func free_data_func; + _Numba_hashtable_get_data_size_func get_data_size_func; + _Numba_hashtable_allocator_t alloc; +} _Numba_hashtable_t; + +/* hash and compare functions for integers and pointers */ +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_ptr(const void *key); +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_int(const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry); + +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new( + size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new_full( + size_t data_size, + size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_copy(_Numba_hashtable_t *src); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_clear(_Numba_hashtable_t *ht); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_destroy(_Numba_hashtable_t *ht); + +typedef int (*_Numba_hashtable_foreach_func) (_Numba_hashtable_entry_t *entry, void *arg); + +extern "C" PyAPI_FUNC(int) _Numba_hashtable_foreach( + _Numba_hashtable_t *ht, + _Numba_hashtable_foreach_func func, void *arg); +extern "C" PyAPI_FUNC(size_t) _Numba_hashtable_size(_Numba_hashtable_t *ht); + +extern "C" PyAPI_FUNC(_Numba_hashtable_entry_t*) _Numba_hashtable_get_entry( + _Numba_hashtable_t *ht, + const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_set( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_get( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_pop( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_delete( + _Numba_hashtable_t *ht, + const void *key); + +#define _Numba_HASHTABLE_SET(TABLE, KEY, DATA) \ + _Numba_hashtable_set(TABLE, KEY, &(DATA), sizeof(DATA)) + +#define _Numba_HASHTABLE_GET(TABLE, KEY, DATA) \ + _Numba_hashtable_get(TABLE, KEY, &(DATA), sizeof(DATA)) + +#endif /* Py_LIMITED_API */ + +#endif diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h new file mode 100644 index 000000000..4431dfff2 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause + +#ifndef NUMBA_PY_MODULE_H_ +#define NUMBA_PY_MODULE_H_ + +#define PY_SSIZE_T_CLEAN + +#include "Python.h" +#include "structmember.h" +#include "frameobject.h" + +#define MOD_ERROR_VAL NULL +#define MOD_SUCCESS_VAL(val) val +#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) +#define MOD_DEF(ob, name, doc, methods) { \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, NULL, NULL, NULL, NULL }; \ + ob = PyModule_Create(&moduledef); } +#define MOD_INIT_EXEC(name) PyInit_##name(); + +#define PyString_AsString PyUnicode_AsUTF8 +#define PyString_Check PyUnicode_Check +#define PyString_FromFormat PyUnicode_FromFormat +#define PyString_FromString PyUnicode_FromString +#define PyString_InternFromString PyUnicode_InternFromString +#define PyInt_Type PyLong_Type +#define PyInt_Check PyLong_Check +#define PyInt_CheckExact PyLong_CheckExact +#define SetAttrStringFromVoidPointer(m, name) do { \ + PyObject *tmp = PyLong_FromVoidPtr((void *) &name); \ + PyObject_SetAttrString(m, #name, tmp); \ + Py_DECREF(tmp); } while (0) + + +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +#endif /* NUMBA_PY_MODULE_H_ */ diff --git a/numba_cuda/numba/cuda/cext/_typeof.cpp b/numba_cuda/numba/cuda/cext/_typeof.cpp new file mode 100644 index 000000000..6730b6d6f --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_typeof.cpp @@ -0,0 +1,1159 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#include "_pymodule.h" + +#include +#include +#include + +#include "_typeof.h" +#include "_hashtable.h" +#include "_devicearray.h" +#include "pyerrors.h" + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include +#if NPY_ABI_VERSION >= 0x02000000 + #include +#endif + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 13) + #ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 + #endif + #include "internal/pycore_setobject.h" // _PySet_NextEntry() +#endif + + +/* Cached typecodes for basic scalar types */ +static int tc_int8; +static int tc_int16; +static int tc_int32; +static int tc_int64; +static int tc_uint8; +static int tc_uint16; +static int tc_uint32; +static int tc_uint64; +static int tc_float32; +static int tc_float64; +static int tc_complex64; +static int tc_complex128; +static int BASIC_TYPECODES[12]; + +static int tc_intp; + +/* The type object for the numba .dispatcher.OmittedArg class + * that wraps omitted arguments. + */ +static PyObject *omittedarg_type; + +static PyObject *typecache; +static PyObject *ndarray_typecache; +static PyObject *structured_dtypes; + +static PyObject *str_typeof_pyval = NULL; +static PyObject *str_value = NULL; +static PyObject *str_numba_type = NULL; + +/* CUDA device array API */ +void **DeviceArray_API; + +/* + * Type fingerprint computation. 
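+ *
+ * Each supported value is encoded into a compact byte string using the
+ * opcodes defined below; e.g. the tuple (1, 2.0) encodes as the bytes
+ * '(' 'i' 'f' ')'.  A hypothetical caller can obtain the fingerprint
+ * of a value as a Python bytes object:
+ *
+ *     PyObject *fp = typeof_compute_fingerprint(val);
+ *     if (fp == NULL)
+ *         return NULL;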
+ */ + +typedef struct { + /* A buffer the fingerprint will be written to */ + char *buf; + size_t n; + size_t allocated; + /* A preallocated buffer, sufficient to fit the fingerprint for most types */ + char static_buf[40]; +} string_writer_t; + +static void +string_writer_init(string_writer_t *w) +{ + w->buf = w->static_buf; + w->n = 0; + w->allocated = sizeof(w->static_buf) / sizeof(unsigned char); +} + +static void +string_writer_clear(string_writer_t *w) +{ + if (w->buf != w->static_buf) + free(w->buf); +} + +static void +string_writer_move(string_writer_t *dest, const string_writer_t *src) +{ + dest->n = src->n; + dest->allocated = src->allocated; + if (src->buf == src->static_buf) { + dest->buf = dest->static_buf; + memcpy(dest->buf, src->buf, src->n); + } + else { + dest->buf = src->buf; + } +} + +/* Ensure at least *bytes* can be appended to the string writer's buffer. */ +static int +string_writer_ensure(string_writer_t *w, size_t bytes) +{ + size_t newsize; + bytes += w->n; + if (bytes <= w->allocated) + return 0; + newsize = (w->allocated << 2) + 1; + if (newsize < bytes) + newsize = bytes; + if (w->buf == w->static_buf) { + w->buf = (char *) malloc(newsize); + memcpy(w->buf, w->static_buf, w->allocated); + } + else + w->buf = (char *) realloc(w->buf, newsize); + if (w->buf) { + w->allocated = newsize; + return 0; + } + else { + PyErr_NoMemory(); + return -1; + } +} + +static int +string_writer_put_char(string_writer_t *w, unsigned char c) +{ + if (string_writer_ensure(w, 1)) + return -1; + w->buf[w->n++] = c; + return 0; +} + +static int +string_writer_put_int32(string_writer_t *w, unsigned int v) +{ + if (string_writer_ensure(w, 4)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; + w->n += 4; + return 0; +} + +static int +string_writer_put_intp(string_writer_t *w, npy_intp v) +{ + if (string_writer_ensure(w, NPY_SIZEOF_PY_INTPTR_T)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; +#if NPY_SIZEOF_PY_INTPTR_T == 8 + w->buf[w->n + 4] = (v >> 32) & 0xff; + w->buf[w->n + 5] = (v >> 40) & 0xff; + w->buf[w->n + 6] = (v >> 48) & 0xff; + w->buf[w->n + 7] = (v >> 56) & 0xff; +#endif + w->n += NPY_SIZEOF_PY_INTPTR_T; + return 0; +} + +static int +string_writer_put_string(string_writer_t *w, const char *s) +{ + if (s == NULL) { + return string_writer_put_char(w, 0); + } + else { + size_t N = strlen(s) + 1; + if (string_writer_ensure(w, N)) + return -1; + memcpy(w->buf + w->n, s, N); + w->n += N; + return 0; + } +} + +enum opcode { + OP_START_TUPLE = '(', + OP_END_TUPLE = ')', + OP_INT = 'i', + OP_FLOAT = 'f', + OP_COMPLEX = 'c', + OP_BOOL = '?', + OP_OMITTED = '!', + + OP_BYTEARRAY = 'a', + OP_BYTES = 'b', + OP_NONE = 'n', + OP_LIST = '[', + OP_SET = '{', + + OP_BUFFER = 'B', + OP_NP_SCALAR = 'S', + OP_NP_ARRAY = 'A', + OP_NP_DTYPE = 'D' +}; + +#define TRY(func, w, arg) \ + do { \ + if (func(w, arg)) return -1; \ + } while (0) + + +static int +fingerprint_unrecognized(void) +{ + PyErr_SetString(PyExc_NotImplementedError, + "cannot compute type fingerprint for value"); + return -1; +} + +static int +compute_dtype_fingerprint(string_writer_t *w, PyArray_Descr *descr) +{ + int typenum = descr->type_num; + if (typenum < NPY_OBJECT) + return string_writer_put_char(w, (char) typenum); + if (typenum == NPY_VOID) { + /* Structured dtype: serialize the dtype pointer. 
Unfortunately, + * some structured dtypes can be ephemeral, so we have to + * intern them to avoid pointer reuse and fingerprint collisions. + * (e.g. np.recarray(dtype=some_dtype) creates a new dtype + * equal to some_dtype) + */ + PyObject *interned = PyDict_GetItem(structured_dtypes, + (PyObject *) descr); + if (interned == NULL) { + interned = (PyObject *) descr; + if (PyDict_SetItem(structured_dtypes, interned, interned)) + return -1; + } + TRY(string_writer_put_char, w, (char) typenum); + return string_writer_put_intp(w, (npy_intp) interned); + } +#if NPY_API_VERSION >= 0x00000007 + if (PyTypeNum_ISDATETIME(typenum)) { + PyArray_DatetimeMetaData *md; +#if NPY_ABI_VERSION >= 0x02000000 + md = &(((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(descr))->meta); +#else + md = &(((PyArray_DatetimeDTypeMetaData *)descr->c_metadata)->meta); +#endif + TRY(string_writer_put_char, w, (char) typenum); + TRY(string_writer_put_char, w, (char) md->base); + return string_writer_put_int32(w, (char) md->num); + } +#endif + + return fingerprint_unrecognized(); +} + +static int +compute_fingerprint(string_writer_t *w, PyObject *val) +{ + /* + * Implementation note: for performance, we start with common + * types that can be tested with fast checks. + */ + if (val == Py_None) + return string_writer_put_char(w, OP_NONE); + if (PyBool_Check(val)) + return string_writer_put_char(w, OP_BOOL); + /* Note we avoid matching int subclasses such as IntEnum */ + if (PyInt_CheckExact(val) || PyLong_CheckExact(val)) + return string_writer_put_char(w, OP_INT); + if (PyFloat_Check(val)) + return string_writer_put_char(w, OP_FLOAT); + if (PyComplex_CheckExact(val)) + return string_writer_put_char(w, OP_COMPLEX); + if (PyTuple_Check(val)) { + if(PyTuple_CheckExact(val)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(val); + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + TRY(string_writer_put_char, w, OP_END_TUPLE); + return 0; + } + /* as per typeof.py, check "_asdict" for namedtuple. */ + else if(PyObject_HasAttrString(val, "_asdict")) + { + /* + * This encodes the class name and field names of a namedtuple into + * the fingerprint on the condition that the number of fields is + * small (<10) and that the class name and field names are encodable + * as ASCII. 
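+ * For illustration (hypothetical): Point(x=1, y=2.0) built with
+ * collections.namedtuple('Point', 'x y') contributes "Point" followed
+ * by '(' "x" <int fingerprint> "y" <float fingerprint> ')'.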
+ */ + PyObject * clazz = NULL; + PyObject * name = NULL; + PyObject * _fields = PyObject_GetAttrString(val, "_fields"); + PyObject * field = NULL; + PyObject * ascii_str = NULL; + Py_ssize_t i, n, j, flen; + char * buf = NULL; + int ret; + + clazz = PyObject_GetAttrString(val, "__class__"); + if (clazz == NULL) + return -1; + + name = PyObject_GetAttrString(clazz, "__name__"); + Py_DECREF(clazz); + if (name == NULL) + return -1; + + ascii_str = PyUnicode_AsEncodedString(name, "ascii", "ignore"); + Py_DECREF(name); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + + if (_fields == NULL) + return -1; + + n = PyTuple_GET_SIZE(val); + + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) { + field = PyTuple_GET_ITEM(_fields, i); + if (field == NULL) + return -1; + ascii_str = PyUnicode_AsEncodedString(field, "ascii", "ignore"); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + } + TRY(string_writer_put_char, w, OP_END_TUPLE); + Py_DECREF(_fields); + return 0; + } + } + if (PyBytes_Check(val)) + return string_writer_put_char(w, OP_BYTES); + if (PyByteArray_Check(val)) + return string_writer_put_char(w, OP_BYTEARRAY); + if ((PyObject *) Py_TYPE(val) == omittedarg_type) { + PyObject *default_val = PyObject_GetAttr(val, str_value); + if (default_val == NULL) + return -1; + TRY(string_writer_put_char, w, OP_OMITTED); + TRY(compute_fingerprint, w, default_val); + Py_DECREF(default_val); + return 0; + } + if (PyArray_IsScalar(val, Generic)) { + /* Note: PyArray_DescrFromScalar() may be a bit slow on + non-trivial types. 
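+           E.g. a np.float32 scalar is encoded as OP_NP_SCALAR followed
+           by the dtype fingerprint from compute_dtype_fingerprint().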
*/ + PyArray_Descr *descr = PyArray_DescrFromScalar(val); + if (descr == NULL) + return -1; + TRY(string_writer_put_char, w, OP_NP_SCALAR); + TRY(compute_dtype_fingerprint, w, descr); + Py_DECREF(descr); + return 0; + } + if (PyArray_Check(val)) { + PyArrayObject *ary = (PyArrayObject *) val; + int ndim = PyArray_NDIM(ary); + + TRY(string_writer_put_char, w, OP_NP_ARRAY); + TRY(string_writer_put_int32, w, ndim); + if (PyArray_IS_C_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'C'); + else if (PyArray_IS_F_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'F'); + else + TRY(string_writer_put_char, w, 'A'); + if (PyArray_ISWRITEABLE(ary)) + TRY(string_writer_put_char, w, 'W'); + else + TRY(string_writer_put_char, w, 'R'); + return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); + } + if (PyList_Check(val)) { + Py_ssize_t n = PyList_GET_SIZE(val); + if (n == 0) { + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty list"); + return -1; + } + /* Only the first item is considered, as in typeof.py */ + TRY(string_writer_put_char, w, OP_LIST); + TRY(compute_fingerprint, w, PyList_GET_ITEM(val, 0)); + return 0; + } + /* Note we only accept sets, not frozensets */ + if (Py_TYPE(val) == &PySet_Type) { + Py_hash_t h; + PyObject *item; + Py_ssize_t pos = 0; + /* Only one item is considered, as in typeof.py */ + if (!_PySet_NextEntry(val, &pos, &item, &h)) { + /* Empty set */ + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty set"); + return -1; + } + TRY(string_writer_put_char, w, OP_SET); + TRY(compute_fingerprint, w, item); + return 0; + } + if (PyObject_CheckBuffer(val)) { + Py_buffer buf; + int flags = PyBUF_ND | PyBUF_STRIDES | PyBUF_FORMAT; + char contig; + int ndim; + char readonly; + + /* Attempt to get a writable buffer, then fallback on read-only */ + if (PyObject_GetBuffer(val, &buf, flags | PyBUF_WRITABLE)) { + PyErr_Clear(); + if (PyObject_GetBuffer(val, &buf, flags)) + goto _unrecognized; + } + if (PyBuffer_IsContiguous(&buf, 'C')) + contig = 'C'; + else if (PyBuffer_IsContiguous(&buf, 'F')) + contig = 'F'; + else + contig = 'A'; + ndim = buf.ndim; + readonly = buf.readonly ? 'R' : 'W'; + if (string_writer_put_char(w, OP_BUFFER) || + string_writer_put_int32(w, ndim) || + string_writer_put_char(w, contig) || + string_writer_put_char(w, readonly) || + string_writer_put_string(w, buf.format) || + /* We serialize the object's Python type as well, to + distinguish between types which have Numba specializations + (e.g. array.array() vs. memoryview) + */ + string_writer_put_intp(w, (npy_intp) Py_TYPE(val))) { + PyBuffer_Release(&buf); + return -1; + } + PyBuffer_Release(&buf); + return 0; + } + if (PyObject_TypeCheck(val, &PyArrayDescr_Type)) { + TRY(string_writer_put_char, w, OP_NP_DTYPE); + return compute_dtype_fingerprint(w, (PyArray_Descr *) val); + } + +_unrecognized: + /* Type not recognized */ + return fingerprint_unrecognized(); +} + +PyObject * +typeof_compute_fingerprint(PyObject *val) +{ + PyObject *res; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) + goto error; + res = PyBytes_FromStringAndSize(w.buf, w.n); + + string_writer_clear(&w); + return res; + +error: + string_writer_clear(&w); + return NULL; +} + +/* + * Getting the typecode from a Type object. 
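+ *
+ * Numba Type instances expose their integer typecode as a `_code`
+ * attribute; -1 is returned (with a Python exception set) on failure.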
+ */ +static int +_typecode_from_type_object(PyObject *tyobj) { + int typecode; + PyObject *tmpcode = PyObject_GetAttrString(tyobj, "_code"); + if (tmpcode == NULL) { + return -1; + } + typecode = PyLong_AsLong(tmpcode); + Py_DECREF(tmpcode); + return typecode; +} + +/* When we want to cache the type's typecode for later lookup, we need to + keep a reference to the returned type object so that it cannot be + deleted. This is because of the following events occurring when first + using a @jit function for a given set of types: + + 1. typecode_fallback requests a new typecode for an arbitrary Python value; + this implies creating a Numba type object (on the first dispatcher call); + the typecode cache is then populated. + 2. matching of the typecode list in _dispatcherimpl.cpp fails, since the + typecode is new. + 3. we have to compile: compile_and_invoke() is called, it will invoke + Dispatcher_Insert to register the new signature. + + The reference to the Numba type object returned in step 1 is deleted as + soon as we call Py_DECREF() on it, since we are holding the only + reference. If this happens and we use the typecode we got to populate the + cache, then the cache won't ever return the correct typecode, and the + dispatcher will never successfully match the typecodes with those of + some already-compiled instance. So we need to make sure that we don't + call Py_DECREF() on objects whose typecode will be used to populate the + cache. This is ensured by calling _typecode_fallback with + retain_reference == 0. + + Note that technically we are leaking the reference, since we do not continue + to hold a pointer to the type object that we get back from typeof_pyval. + However, we don't need to refer to it again, we just need to make sure that + it is never deleted. +*/ +static int +_typecode_fallback(PyObject *dispatcher, PyObject *val, + int retain_reference) { + PyObject *numba_type; + int typecode; + + /* + * For values that define "_numba_type_", which holds a numba Type + * instance that should be used as the type of the value. + * Note this is done here, not in typeof_typecode(), so that + * some values can still benefit from fingerprint caching. + */ + if (PyObject_HasAttr(val, str_numba_type)) { + numba_type = PyObject_GetAttrString(val, "_numba_type_"); + if (!numba_type) + return -1; + } + else { + // Go back to the interpreter + numba_type = PyObject_CallMethodObjArgs((PyObject *) dispatcher, + str_typeof_pyval, val, NULL); + } + if (!numba_type) + return -1; + typecode = _typecode_from_type_object(numba_type); + if (!retain_reference) + Py_DECREF(numba_type); + return typecode; +} + +/* Variations on _typecode_fallback for convenience */ + +static +int typecode_fallback(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 0); +} + +static +int typecode_fallback_keep_ref(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 1); +} + + +/* A cache mapping fingerprints (string_writer_t *) to typecodes (int). 
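+
+   Keys are heap-allocated string_writer_t values, hashed over their
+   byte contents by hash_writer() and compared by compare_writer()
+   below.  The probe pattern used in typecode_using_fingerprint() is:
+
+       int typecode;
+       if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0)
+           return typecode;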
*/ +static _Numba_hashtable_t *fingerprint_hashtable = NULL; + +static Py_uhash_t +hash_writer(const void *key) +{ + string_writer_t *writer = (string_writer_t *) key; + Py_uhash_t x = 0; + + /* The old FNV algorithm used by Python 2 */ + if (writer->n > 0) { + unsigned char *p = (unsigned char *) writer->buf; + Py_ssize_t len = writer->n; + x ^= *p << 7; + while (--len >= 0) + x = (1000003*x) ^ *p++; + x ^= writer->n; + if (x == (Py_uhash_t) -1) + x = -2; + } + return x; +} + +static int +compare_writer(const void *key, const _Numba_hashtable_entry_t *entry) +{ + string_writer_t *v = (string_writer_t *) key; + string_writer_t *w = (string_writer_t *) entry->key; + if (v->n != w->n) + return 0; + return memcmp(v->buf, w->buf, v->n) == 0; +} + +/* Try to compute *val*'s typecode using its fingerprint and the + * fingerprint->typecode cache. + */ +static int +typecode_using_fingerprint(PyObject *dispatcher, PyObject *val) +{ + int typecode; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) { + string_writer_clear(&w); + if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) { + /* Can't compute a type fingerprint for the given value, + fall back on typeof() without caching. */ + PyErr_Clear(); + return typecode_fallback(dispatcher, val); + } + return -1; + } + if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0) { + /* Cache hit */ + string_writer_clear(&w); + return typecode; + } + + /* Not found in cache: invoke pure Python typeof() and cache result. + * Note we have to keep the type alive forever as explained + * above in _typecode_fallback(). + */ + typecode = typecode_fallback_keep_ref(dispatcher, val); + if (typecode >= 0) { + string_writer_t *key = (string_writer_t *) malloc(sizeof(string_writer_t)); + if (key == NULL) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + /* Ownership of the string writer's buffer will be transferred + * to the hash table. + */ + string_writer_move(key, &w); + if (_Numba_HASHTABLE_SET(fingerprint_hashtable, key, typecode)) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + } + return typecode; +} + + +/* + * Direct lookup table for extra-fast typecode resolution of simple array types. + */ + +#define N_DTYPES 12 +#define N_NDIM 5 /* Fast path for up to 5D array */ +#define N_LAYOUT 3 +static int cached_arycode[N_NDIM][N_LAYOUT][N_DTYPES]; + +/* Convert a Numpy dtype number to an internal index into cached_arycode. + The returned value must also be a valid index into BASIC_TYPECODES. 
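+   E.g. NPY_FLOAT64 maps to index 9, so BASIC_TYPECODES[9] must hold
+   tc_float64; dtypes outside the table yield -1.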
*/ +static int dtype_num_to_typecode(int type_num) { + int dtype; + switch(type_num) { + case NPY_INT8: + dtype = 0; + break; + case NPY_INT16: + dtype = 1; + break; + case NPY_INT32: + dtype = 2; + break; + case NPY_INT64: + dtype = 3; + break; + case NPY_UINT8: + dtype = 4; + break; + case NPY_UINT16: + dtype = 5; + break; + case NPY_UINT32: + dtype = 6; + break; + case NPY_UINT64: + dtype = 7; + break; + case NPY_FLOAT32: + dtype = 8; + break; + case NPY_FLOAT64: + dtype = 9; + break; + case NPY_COMPLEX64: + dtype = 10; + break; + case NPY_COMPLEX128: + dtype = 11; + break; + default: + /* Type not included in the global lookup table */ + dtype = -1; + } + return dtype; +} + +static +int get_cached_typecode(PyArray_Descr* descr) { + PyObject* tmpobject = PyDict_GetItem(typecache, (PyObject*)descr); + if (tmpobject == NULL) + return -1; + + return PyLong_AsLong(tmpobject); +} + +static +void cache_typecode(PyArray_Descr* descr, int typecode) { + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(typecache, (PyObject*)descr, value); + Py_DECREF(value); +} + +static +PyObject* ndarray_key(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* tmpndim = PyLong_FromLong(ndim); + PyObject* tmplayout = PyLong_FromLong(layout); + PyObject* tmpreadonly = PyLong_FromLong(readonly); + PyObject* key = PyTuple_Pack(4, tmpndim, tmplayout, tmpreadonly, descr); + Py_DECREF(tmpndim); + Py_DECREF(tmplayout); + Py_DECREF(tmpreadonly); + return key; +} + +static +int get_cached_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject *tmpobject = PyDict_GetItem(ndarray_typecache, key); + if (tmpobject == NULL) + return -1; + + Py_DECREF(key); + return PyLong_AsLong(tmpobject); +} + +static +void cache_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr, + int typecode) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(ndarray_typecache, key, value); + Py_DECREF(key); + Py_DECREF(value); +} + +static +int typecode_ndarray(PyObject *dispatcher, PyArrayObject *ary) { + int typecode; + int dtype; + int ndim = PyArray_NDIM(ary); + int layout = 0; + int readonly = 0; + + /* The order in which we check for the right contiguous-ness is important. + The order must match the order by numba.numpy_support.map_layout. + Further, only *contiguous-ness* is checked, not alignment, byte order or + write permissions. 
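+      A 1-D contiguous array is both C- and F-contiguous; the C check
+      runs first, so such arrays get layout 1 (C).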
+ */ + if (PyArray_IS_C_CONTIGUOUS(ary)){ + layout = 1; + } else if (PyArray_IS_F_CONTIGUOUS(ary)) { + layout = 2; + } + + /* the typecode cache by convention is for "behaved" arrays (aligned and + * writeable), all others must be forced to the fall back */ + if (!PyArray_ISBEHAVED(ary)) goto FALLBACK; + + if (ndim <= 0 || ndim > N_NDIM) goto FALLBACK; + + dtype = dtype_num_to_typecode(PyArray_TYPE(ary)); + if (dtype == -1) goto FALLBACK; + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + + typecode = cached_arycode[ndim - 1][layout][dtype]; + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types */ + + /* If this isn't a structured array then we can't use the cache */ + if (PyArray_TYPE(ary) != NPY_VOID) + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); + + /* Check type cache */ + readonly = !PyArray_ISWRITEABLE(ary); + typecode = get_cached_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary)); + if (typecode == -1) { + /* First use of this type, use fallback and populate the cache */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cache_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary), typecode); + } + return typecode; +} + +static +int typecode_arrayscalar(PyObject *dispatcher, PyObject* aryscalar) { + int typecode; + PyArray_Descr *descr; + descr = PyArray_DescrFromScalar(aryscalar); + if (!descr) + return typecode_using_fingerprint(dispatcher, aryscalar); + + /* Is it a structured scalar? */ + if (descr->type_num == NPY_VOID) { + typecode = get_cached_typecode(descr); + if (typecode == -1) { + /* Resolve through fallback then populate cache */ + typecode = typecode_fallback_keep_ref(dispatcher, aryscalar); + cache_typecode(descr, typecode); + } + Py_DECREF(descr); + return typecode; + } + + /* Is it one of the well-known basic types? */ + typecode = dtype_num_to_typecode(descr->type_num); + Py_DECREF(descr); + if (typecode == -1) + return typecode_using_fingerprint(dispatcher, aryscalar); + return BASIC_TYPECODES[typecode]; +} + +static +int typecode_devicendarray(PyObject *dispatcher, PyObject *ary) +{ + int typecode; + int dtype; + int ndim; + int layout = 0; + PyObject *ndim_obj = nullptr; + PyObject *num_obj = nullptr; + PyObject *dtype_obj = nullptr; + int dtype_num = 0; + + PyObject* flags = PyObject_GetAttrString(ary, "flags"); + if (flags == NULL) + { + PyErr_Clear(); + goto FALLBACK; + } + + if (PyDict_GetItemString(flags, "C_CONTIGUOUS") == Py_True) { + layout = 1; + } else if (PyDict_GetItemString(flags, "F_CONTIGUOUS") == Py_True) { + layout = 2; + } + + Py_DECREF(flags); + + ndim_obj = PyObject_GetAttrString(ary, "ndim"); + if (ndim_obj == NULL) { + /* If there's no ndim, try to proceed by clearing the error and using the + * fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + ndim = PyLong_AsLong(ndim_obj); + Py_DECREF(ndim_obj); + + if (PyErr_Occurred()) { + /* ndim wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + if (ndim <= 0 || ndim > N_NDIM) + goto FALLBACK; + + dtype_obj = PyObject_GetAttrString(ary, "dtype"); + if (dtype_obj == NULL) { + /* No dtype: try the fallback. 
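+          Device arrays are duck-typed throughout this function: ndim,
+          dtype and flags are read as Python attributes rather than via
+          the NumPy C API.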
*/ + PyErr_Clear(); + goto FALLBACK; + } + + num_obj = PyObject_GetAttrString(dtype_obj, "num"); + Py_DECREF(dtype_obj); + + if (num_obj == NULL) { + /* This strange dtype has no num - try the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype_num = PyLong_AsLong(num_obj); + Py_DECREF(num_obj); + + if (PyErr_Occurred()) { + /* num wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype = dtype_num_to_typecode(dtype_num); + if (dtype == -1) { + /* Not a dtype we have in the global lookup table. */ + goto FALLBACK; + } + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + typecode = cached_arycode[ndim - 1][layout][dtype]; + + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types. At present this always uses + the fingerprinting to get the typecode. Future optimization might + implement a cache, but this would require some fast equivalent of + PyArray_DESCR for a device array. */ + + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); +} + +extern "C" int +typeof_typecode(PyObject *dispatcher, PyObject *val) +{ + PyTypeObject *tyobj = Py_TYPE(val); + int subtype_attr; + /* This needs to be kept in sync with Dispatcher.typeof_pyval(), + * otherwise funny things may happen. + */ + if (tyobj == &PyInt_Type || tyobj == &PyLong_Type) { +#if SIZEOF_VOID_P < 8 + /* On 32-bit platforms, choose between tc_intp (32-bit) and tc_int64 */ + PY_LONG_LONG ll = PyLong_AsLongLong(val); + if (ll == -1 && PyErr_Occurred()) { + /* The integer is too large, let us truncate it */ + PyErr_Clear(); + return tc_int64; + } + if ((ll & 0xffffffff) != ll) + return tc_int64; +#endif + return tc_intp; + } + else if (tyobj == &PyFloat_Type) + return tc_float64; + else if (tyobj == &PyComplex_Type) + return tc_complex128; + /* Array scalar handling */ + else if (PyArray_CheckScalar(val)) { + return typecode_arrayscalar(dispatcher, val); + } + /* Array handling */ + else if (tyobj == &PyArray_Type) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + /* Subtype of CUDA device array */ + else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { + return typecode_devicendarray(dispatcher, val); + } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* By default, Numba will treat all numpy.ndarray subtypes as if they + were the base numpy.ndarray type. In this way, ndarray subtypes + can easily use all of the support that Numba has for ndarray + methods. + EXPERIMENTAL: There may be cases where a programmer would NOT want + ndarray subtypes to be treated exactly like the base numpy.ndarray. + For this purpose, a currently experimental feature allows a + programmer to add an attribute named + __numba_array_subtype_dispatch__ to their ndarray subtype. This + attribute can have any value as Numba only checks for the presence + of the attribute and not its value. When present, a ndarray subtype + will NOT be typed by Numba as a regular ndarray but this code will + fallthrough to the typecode_using_fingerprint call, which will + create a new unique Numba typecode for this ndarray subtype. This + behavior has several significant effects. 
First, since this + ndarray subtype will be treated as a different type by Numba, + the Numba dispatcher would then specialize on this type. So, if + there was a function that had several parameters that were + expected to be either numpy.ndarray or a subtype of ndarray, then + Numba would compile a custom version of this function for each + combination of base and subtypes that were actually passed to the + function. Second, because this subtype would now be treated as + a totally separate type, it will cease to function in Numba unless + an implementation of that type is provided to Numba through the + Numba type extension mechanisms (e.g., overload). This would + typically start with defining a Numba type corresponding to the + ndarray subtype. This is the same concept as how Numba has a + corollary of numpy.ndarray in its type system as types.Array. + Next, one would typically defining boxing and unboxing routines + and the associated memory model. Then, overloads for NumPy + functions on that type would be created. However, + if the same default array memory model is used then there are tricks + one can do to look at Numba's internal types.Array registries and + to quickly apply those to the subtype as well. In this manner, + only those cases where the base ndarray and the ndarray subtype + behavior differ would new custom functions need to be written for + the subtype. Finally, + after adding support for the new type, you would have a separate + ndarray subtype that could operate with other objects of the same + subtype but would not support interoperation with regular NumPy + ndarrays. In standard Python, this interoperation is provided + through the __array_ufunc__ magic method in the ndarray subtype + class and in that case the function operates on ndarrays or their + subtypes. This idea is extended into Numba such that + __array_ufunc__ can be present in a Numba array type object. + In this case, this function is consulted during Numba typing and + so the arguments to __array_ufunc__ are Numba types instead of + ndarray subtypes. The array type __array_ufunc__ returns the + type of the output of the given ufunc. + */ + subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__"); + if (!subtype_attr) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + } + + return typecode_using_fingerprint(dispatcher, val); +} + + +static +void* wrap_import_array(void) { + import_array(); /* import array returns NULL on failure */ + return (void*)1; +} + + +static +int init_numpy(void) { + return wrap_import_array() != NULL; +} + + +/* + * typeof_init(omittedarg_type, typecode_dict) + * (called from dispatcher.py to fill in missing information) + */ +extern "C" PyObject * +typeof_init(PyObject *self, PyObject *args) +{ + PyObject *tmpobj; + PyObject *dict; + int index = 0; + + if (!PyArg_ParseTuple(args, "O!O!:typeof_init", + &PyType_Type, &omittedarg_type, + &PyDict_Type, &dict)) + return NULL; + + /* Initialize Numpy API */ + if ( ! 
init_numpy() ) { + return NULL; + } + + #define UNWRAP_TYPE(S) \ + if(!(tmpobj = PyDict_GetItemString(dict, #S))) return NULL; \ + else { tc_##S = PyLong_AsLong(tmpobj); \ + BASIC_TYPECODES[index++] = tc_##S; } + + UNWRAP_TYPE(int8) + UNWRAP_TYPE(int16) + UNWRAP_TYPE(int32) + UNWRAP_TYPE(int64) + + UNWRAP_TYPE(uint8) + UNWRAP_TYPE(uint16) + UNWRAP_TYPE(uint32) + UNWRAP_TYPE(uint64) + + UNWRAP_TYPE(float32) + UNWRAP_TYPE(float64) + + UNWRAP_TYPE(complex64) + UNWRAP_TYPE(complex128) + + switch(sizeof(void*)) { + case 4: + tc_intp = tc_int32; + break; + case 8: + tc_intp = tc_int64; + break; + default: + PyErr_SetString(PyExc_AssertionError, "sizeof(void*) != {4, 8}"); + return NULL; + } + + #undef UNWRAP_TYPE + + typecache = PyDict_New(); + ndarray_typecache = PyDict_New(); + structured_dtypes = PyDict_New(); + if (typecache == NULL || ndarray_typecache == NULL || + structured_dtypes == NULL) { + PyErr_SetString(PyExc_RuntimeError, "failed to create type cache"); + return NULL; + } + + fingerprint_hashtable = _Numba_hashtable_new(sizeof(int), + hash_writer, + compare_writer); + if (fingerprint_hashtable == NULL) { + PyErr_NoMemory(); + return NULL; + } + + /* initialize cached_arycode to all ones (in bits) */ + memset(cached_arycode, 0xFF, sizeof(cached_arycode)); + + str_typeof_pyval = PyString_InternFromString("typeof_pyval"); + str_value = PyString_InternFromString("value"); + str_numba_type = PyString_InternFromString("_numba_type_"); + if (!str_value || !str_typeof_pyval || !str_numba_type) + return NULL; + + Py_RETURN_NONE; +} diff --git a/numba_cuda/numba/cuda/cext/_typeof.h b/numba_cuda/numba/cuda/cext/_typeof.h new file mode 100644 index 000000000..54e39ff80 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_typeof.h @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef NUMBA_TYPEOF_H_ +#define NUMBA_TYPEOF_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +extern PyObject *typeof_init(PyObject *self, PyObject *args); +extern int typeof_typecode(PyObject *dispatcher, PyObject *val); +extern PyObject *typeof_compute_fingerprint(PyObject *val); + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_TYPEOF_H_ */ diff --git a/numba_cuda/numba/cuda/cext/mviewbuf.c b/numba_cuda/numba/cuda/cext/mviewbuf.c new file mode 100644 index 000000000..eb5075d0b --- /dev/null +++ b/numba_cuda/numba/cuda/cext/mviewbuf.c @@ -0,0 +1,385 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#include "_pymodule.h" + +static int get_writable_buffer(PyObject* obj, Py_buffer *buf, int force) +{ + Py_buffer read_buf; + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + int ret; + + /* Attempt to get a writable buffer */ + if (!PyObject_GetBuffer(obj, buf, flags|PyBUF_WRITABLE)) + return 0; + if (!force) + return -1; + + /* Make a writable buffer from a read-only buffer */ + PyErr_Clear(); + if(-1 == PyObject_GetBuffer(obj, &read_buf, flags)) + return -1; + ret = PyBuffer_FillInfo(buf, NULL, read_buf.buf, read_buf.len, 0, + flags|PyBUF_WRITABLE); + PyBuffer_Release(&read_buf); + return ret; +} + +static int get_readonly_buffer(PyObject* obj, Py_buffer *buf) +{ + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + + return PyObject_GetBuffer(obj, buf, flags); +} + + +static void free_buffer(Py_buffer * buf) +{ + PyBuffer_Release(buf); +} + +/** + * Return a pointer to the data of a writable buffer from obj. 
If only a + * read-only buffer is available and force is True, a read-write buffer based on + * the read-only buffer is obtained. Note that this may have some surprising + * effects on buffers which expect the data from their read-only buffer not to + * be modified. + */ +static PyObject* +memoryview_get_buffer(PyObject *self, PyObject *args){ + PyObject *obj = NULL; + int force = 0; + int readonly = 0; + PyObject *ret = NULL; + Py_buffer buf; + + if (!PyArg_ParseTuple(args, "O|ii", &obj, &force, &readonly)) + return NULL; + + if (readonly) { + if (get_readonly_buffer(obj, &buf)) + return NULL; + } else { + if (get_writable_buffer(obj, &buf, force)) + return NULL; + } + + ret = PyLong_FromVoidPtr(buf.buf); + free_buffer(&buf); + return ret; +} + +/** + * Gets a half-open range [start, end) which contains the array data + * Modified from numpy/core/src/multiarray/array_assign.c + */ +static PyObject* +get_extents(Py_ssize_t *shape, Py_ssize_t *strides, int ndim, + Py_ssize_t itemsize, Py_ssize_t ptr) +{ + Py_ssize_t start, end; + int idim; + Py_ssize_t *dimensions = shape; + PyObject *ret = NULL; + + if (ndim < 0 ){ + PyErr_SetString(PyExc_ValueError, "buffer ndim < 0"); + return NULL; + } + + if (!dimensions) { + if (ndim == 0) { + start = end = ptr; + end += itemsize; + return Py_BuildValue("nn", start, end); + } + PyErr_SetString(PyExc_ValueError, "buffer shape is not defined"); + return NULL; + } + + if (!strides) { + PyErr_SetString(PyExc_ValueError, "buffer strides is not defined"); + return NULL; + } + + /* Calculate with a closed range [start, end] */ + start = end = ptr; + for (idim = 0; idim < ndim; ++idim) { + Py_ssize_t stride = strides[idim], dim = dimensions[idim]; + /* If the array size is zero, return an empty range */ + if (dim == 0) { + start = end = ptr; + ret = Py_BuildValue("nn", start, end); + break; + } + /* Expand either upwards or downwards depending on stride */ + else { + if (stride > 0) { + end += stride * (dim - 1); + } + else if (stride < 0) { + start += stride * (dim - 1); + } + } + } + + if (!ret) { + /* Return a half-open range */ + Py_ssize_t out_start = start; + Py_ssize_t out_end = end + itemsize; + + ret = Py_BuildValue("nn", out_start, out_end); + } + + return ret; +} + +static PyObject* +memoryview_get_extents(PyObject *self, PyObject *args) +{ + PyObject *obj = NULL; + PyObject *ret = NULL; + Py_buffer b; + if (!PyArg_ParseTuple(args, "O", &obj)) + return NULL; + + if (get_readonly_buffer(obj, &b)) + return NULL; + + ret = get_extents(b.shape, b.strides, b.ndim, b.itemsize, + (Py_ssize_t)b.buf); + free_buffer(&b); + return ret; +} + +static PyObject* +memoryview_get_extents_info(PyObject *self, PyObject *args) +{ + int i; + Py_ssize_t *shape_ary = NULL; + Py_ssize_t *strides_ary = NULL; + PyObject *shape_tuple = NULL; + PyObject *strides_tuple = NULL; + PyObject *shape = NULL, *strides = NULL; + Py_ssize_t itemsize = 0; + int ndim = 0; + PyObject* res = NULL; + + if (!PyArg_ParseTuple(args, "OOin", &shape, &strides, &ndim, &itemsize)) + goto cleanup; + + if (ndim < 0) { + PyErr_SetString(PyExc_ValueError, "ndim is negative"); + goto cleanup; + } + + if (itemsize <= 0) { + PyErr_SetString(PyExc_ValueError, "ndim <= 0"); + goto cleanup; + } + + shape_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + strides_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + + shape_tuple = PySequence_Fast(shape, "shape is not a sequence"); + if (!shape_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + shape_ary[i] = PyNumber_AsSsize_t( + 
PySequence_Fast_GET_ITEM(shape_tuple, i), + PyExc_OverflowError); + } + + strides_tuple = PySequence_Fast(strides, "strides is not a sequence"); + if (!strides_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + strides_ary[i] = PyNumber_AsSsize_t( + PySequence_Fast_GET_ITEM(strides_tuple, i), + PyExc_OverflowError); + } + + res = get_extents(shape_ary, strides_ary, ndim, itemsize, 0); +cleanup: + free(shape_ary); + free(strides_ary); + Py_XDECREF(shape_tuple); + Py_XDECREF(strides_tuple); + return res; +} + + +/* new type to expose buffer interface */ +typedef struct { + PyObject_HEAD + /* Type-specific fields go here. */ +} MemAllocObject; + + +static int +get_bufinfo(PyObject *self, Py_ssize_t *psize, void **pptr) +{ + PyObject *buflen = NULL; + PyObject *bufptr = NULL; + Py_ssize_t size = 0; + void* ptr = NULL; + int ret = -1; + + buflen = PyObject_GetAttrString(self, "_buflen_"); + if (!buflen) goto cleanup; + + bufptr = PyObject_GetAttrString(self, "_bufptr_"); + if (!bufptr) goto cleanup; + + size = PyNumber_AsSsize_t(buflen, PyExc_OverflowError); + if (size == -1 && PyErr_Occurred()) goto cleanup; + else if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative buffer size"); + goto cleanup; + } + + ptr = PyLong_AsVoidPtr(PyNumber_Long(bufptr)); + if (PyErr_Occurred()) + goto cleanup; + else if (!ptr) { + PyErr_SetString(PyExc_ValueError, "null buffer pointer"); + goto cleanup; + } + + *psize = size; + *pptr = ptr; + ret = 0; +cleanup: + Py_XDECREF(buflen); + Py_XDECREF(bufptr); + return ret; +} + + +static int +MemAllocObject_getbuffer(PyObject *self, Py_buffer *view, int flags) +{ + Py_ssize_t size = 0; + void *ptr = 0; + int readonly; + + if(-1 == get_bufinfo(self, &size, &ptr)) + return -1; + + readonly = (PyBUF_WRITABLE & flags) != PyBUF_WRITABLE; + + /* fill buffer */ + if (-1 == PyBuffer_FillInfo(view, self, (void*)ptr, size, readonly, flags)) + return -1; + + return 0; +} + +static void +MemAllocObject_releasebuffer(PyObject *self, Py_buffer *view) +{ + /* Do nothing */ +} + +static PyBufferProcs MemAlloc_as_buffer = { + MemAllocObject_getbuffer, + MemAllocObject_releasebuffer, +}; + + +static PyTypeObject MemAllocType = { + PyVarObject_HEAD_INIT(NULL, 0) + "mviewbuf.MemAlloc", /* tp_name */ + sizeof(MemAllocObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + &MemAlloc_as_buffer, /* tp_as_buffer */ + (Py_TPFLAGS_DEFAULT| Py_TPFLAGS_BASETYPE), /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + 
+/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + + +static PyMethodDef core_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(memoryview_get_buffer), + declmethod(memoryview_get_extents), + declmethod(memoryview_get_extents_info), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(mviewbuf) { + PyObject *module; + MOD_DEF(module, "mviewbuf", "No docs", core_methods) + if (module == NULL) + return MOD_ERROR_VAL; + + MemAllocType.tp_new = PyType_GenericNew; + if (PyType_Ready(&MemAllocType) < 0){ + return MOD_ERROR_VAL; + } + + Py_INCREF(&MemAllocType); + PyModule_AddObject(module, "MemAlloc", (PyObject*)&MemAllocType); + + return MOD_SUCCESS_VAL(module); +} diff --git a/numba_cuda/numba/cuda/cext/typeconv.cpp b/numba_cuda/numba/cuda/cext/typeconv.cpp new file mode 100644 index 000000000..5af7d16b3 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/typeconv.cpp @@ -0,0 +1,212 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include + +#include "typeconv.hpp" + + +// ------ TypeManager ------ + +TCCMap::TCCMap() + : nb_records(0) +{ +} + +size_t TCCMap::hash(const TypePair &key) const { + return std::hash()(std::hash()(key.first)) ^ + std::hash()(key.second); +} + +void TCCMap::insert(const TypePair &key, TypeCompatibleCode val) { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + bin[j].val = val; + return; + } + } + bin.push_back({key, val}); + nb_records++; +} + +TypeCompatibleCode TCCMap::find(const TypePair &key) const { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + const TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + return bin[j].val; + } + } + return TCC_FALSE; +} + +// ----- Ratings ----- +Rating::Rating() : promote(0), safe_convert(0), unsafe_convert(0) { } + +inline bool Rating::operator < (const Rating &other) const { + if (unsafe_convert < other.unsafe_convert) + return true; + else if (unsafe_convert > other.unsafe_convert) + return false; + if (safe_convert < other.safe_convert) + return true; + else if (safe_convert > other.safe_convert) + return false; + return (promote < other.promote); +} + +inline bool Rating::operator == (const Rating &other) const { + return promote == other.promote && safe_convert == other.safe_convert && + unsafe_convert == other.unsafe_convert; +} + +// ------ TypeManager ------ + +bool TypeManager::canPromote(Type from, Type to) const { + return isCompatible(from, to) == TCC_PROMOTE; +} + +bool TypeManager::canSafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_SAFE; +} + +bool TypeManager::canUnsafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_UNSAFE; +} + +void TypeManager::addPromotion(Type from, Type to) { + return addCompatibility(from, to, TCC_PROMOTE); +} + +void TypeManager::addUnsafeConversion(Type from, Type to) { + return addCompatibility(from, to, TCC_CONVERT_UNSAFE); +} + +void TypeManager::addSafeConversion(Type from, Type 
to) { + return addCompatibility(from, to, TCC_CONVERT_SAFE); +} + +void TypeManager::addCompatibility(Type from, Type to, TypeCompatibleCode tcc) { + TypePair pair(from, to); + tccmap.insert(pair, tcc); +} + +TypeCompatibleCode TypeManager::isCompatible(Type from, Type to) const { + if (from == to) + return TCC_EXACT; + TypePair pair(from, to); + return tccmap.find(pair); +} + + +int TypeManager::selectOverload(const Type sig[], const Type ovsigs[], + int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required + ) const { + int count; + if (ovct <= 16) { + Rating ratings[16]; + int candidates[16]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + } + else { + Rating *ratings = new Rating[ovct]; + int *candidates = new int[ovct]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + delete [] ratings; + delete [] candidates; + } + return count; +} + +int TypeManager::_selectOverload(const Type sig[], const Type ovsigs[], + int &selected, int sigsz, int ovct, + bool allow_unsafe, bool exact_match_required, + Rating ratings[], int candidates[]) const { + // Generate rating table + // Use a penalize scheme. + int nb_candidates = 0; + + for (int i = 0; i < ovct; ++i) { + const Type *entry = &ovsigs[i * sigsz]; + Rating rate; + + for (int j = 0; j < sigsz; ++j) { + TypeCompatibleCode tcc = isCompatible(sig[j], entry[j]); + if (tcc == TCC_FALSE || + (tcc == TCC_CONVERT_UNSAFE && !allow_unsafe) || + (tcc != TCC_EXACT && exact_match_required)) { + // stop the loop early + goto _incompatible; + } + switch(tcc) { + case TCC_PROMOTE: + rate.promote += 1; + break; + case TCC_CONVERT_SAFE: + rate.safe_convert += 1; + break; + case TCC_CONVERT_UNSAFE: + rate.unsafe_convert += 1; + break; + default: + break; + } + } + ratings[nb_candidates] = rate; + candidates[nb_candidates] = i; + nb_candidates++; + _incompatible: + ; + } + + // Bail if no match + if (nb_candidates == 0) + return 0; + + // Find lowest rating + Rating best = ratings[0]; + selected = candidates[0]; + + int matchcount = 1; + for (int i = 1; i < nb_candidates; ++i) { + if (ratings[i] < best) { + best = ratings[i]; + selected = candidates[i]; + matchcount = 1; + } + else if (ratings[i] == best) { + matchcount += 1; + } + } + return matchcount; +} + +// ----- utils ----- + +const char* TCCString(TypeCompatibleCode tcc) { + switch(tcc) { + case TCC_EXACT: + return "exact"; + case TCC_SUBTYPE: + return "subtype"; + case TCC_PROMOTE: + return "promote"; + case TCC_CONVERT_SAFE: + return "safe_convert"; + case TCC_CONVERT_UNSAFE: + return "unsafe_convert"; + default: + return "false"; + } +} diff --git a/numba_cuda/numba/cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp new file mode 100644 index 000000000..da5d87a77 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/typeconv.hpp @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef NUMBA_TYPECONV_HPP_ +#define NUMBA_TYPECONV_HPP_ +#include +#include + + +typedef int Type; + +enum TypeCompatibleCode{ + // No match + TCC_FALSE = 0, + // Exact match + TCC_EXACT, + // Subtype is UNUSED + TCC_SUBTYPE, + // Promotion with no precision loss + TCC_PROMOTE, + // Conversion with no precision loss + // e.g. int32 to double + TCC_CONVERT_SAFE, + // Conversion with precision loss + // e.g. 
diff --git a/numba_cuda/numba/cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp
new file mode 100644
index 000000000..da5d87a77
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/typeconv.hpp
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause
+
+#ifndef NUMBA_TYPECONV_HPP_
+#define NUMBA_TYPECONV_HPP_
+#include <utility>
+#include <vector>
+
+
+typedef int Type;
+
+enum TypeCompatibleCode{
+    // No match
+    TCC_FALSE = 0,
+    // Exact match
+    TCC_EXACT,
+    // Subtype is UNUSED
+    TCC_SUBTYPE,
+    // Promotion with no precision loss
+    TCC_PROMOTE,
+    // Conversion with no precision loss
+    // e.g. int32 to double
+    TCC_CONVERT_SAFE,
+    // Conversion with precision loss
+    // e.g. int64 to double (53 bits precision)
+    TCC_CONVERT_UNSAFE,
+};
+
+typedef std::pair<Type, Type> TypePair;
+
+struct TCCRecord {
+    TypePair key;
+    TypeCompatibleCode val;
+};
+
+typedef std::vector<TCCRecord> TCCMapBin;
+
+class TCCMap {
+public:
+    TCCMap();
+
+    void insert(const TypePair &key, TypeCompatibleCode val);
+    TypeCompatibleCode find(const TypePair &key) const;
+private:
+    size_t hash(const TypePair &key) const;
+
+    /* Must be a power of two */
+    static const size_t TCCMAP_SIZE = 512;
+    TCCMapBin records[TCCMAP_SIZE];
+    int nb_records;
+};
+
+struct Rating {
+    unsigned int promote;
+    unsigned int safe_convert;
+    unsigned int unsafe_convert;
+
+    Rating();
+
+    bool operator < (const Rating &other) const;
+    bool operator == (const Rating &other) const;
+};
+
+
+class TypeManager {
+public:
+    bool canPromote(Type from, Type to) const;
+    bool canUnsafeConvert(Type from, Type to) const;
+    bool canSafeConvert(Type from, Type to) const;
+
+    void addPromotion(Type from, Type to);
+    void addUnsafeConversion(Type from, Type to);
+    void addSafeConversion(Type from, Type to);
+    void addCompatibility(Type from, Type to, TypeCompatibleCode by);
+
+    TypeCompatibleCode isCompatible(Type from, Type to) const;
+
+    /**
+       The index of the selected overload is stored in `selected`.
+       Returns the number of matches.
+    */
+    int selectOverload(const Type sig[], const Type ovsigs[], int &selected,
+                       int sigsz, int ovct, bool allow_unsafe,
+                       bool exact_match_required
+                       ) const;
+
+private:
+    int _selectOverload(const Type sig[], const Type ovsigs[], int &selected,
+                        int sigsz, int ovct, bool allow_unsafe,
+                        bool exact_match_required,
+                        Rating ratings[], int candidates[]) const;
+
+    TCCMap tccmap;
+};
+
+
+const char* TCCString(TypeCompatibleCode tcc);
+
+
+#endif // NUMBA_TYPECONV_HPP_
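Note (illustration, not part of the patch): TCCMap is a fixed-capacity hash map keyed on (from, to) pairs, bucketed into a power-of-two number of bins so the hash can be masked rather than divided; absent pairs read as TCC_FALSE. A toy Python equivalent for orientation:

    TCC_FALSE = 0
    TCCMAP_SIZE = 512  # must stay a power of two so `& (TCCMAP_SIZE - 1)` masks the hash

    class TCCMap:
        def __init__(self):
            self.bins = [[] for _ in range(TCCMAP_SIZE)]

        def _bucket(self, key):
            return self.bins[hash(key) & (TCCMAP_SIZE - 1)]

        def insert(self, key, val):
            bucket = self._bucket(key)
            for i, (k, _) in enumerate(bucket):
                if k == key:
                    bucket[i] = (key, val)  # overwrite the existing record
                    return
            bucket.append((key, val))

        def find(self, key):
            for k, v in self._bucket(key):
                if k == key:
                    return v
            return TCC_FALSE  # unknown pairs are incompatible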
diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py
index e48e66ac7..5597308d7 100644
--- a/numba_cuda/numba/cuda/cudadrv/devicearray.py
+++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -16,7 +16,7 @@
 import numpy as np
 
 import numba
-from numba import _devicearray
+from numba.cuda.cext import _devicearray
 from numba.cuda.cudadrv import devices, dummyarray
 from numba.cuda.cudadrv import driver as _driver
 from numba.core import types, config
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 7668f8578..9a6988b57 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -46,7 +46,7 @@
 
 from collections import namedtuple, deque
 
-from numba import mviewbuf
+from numba.cuda.cext import mviewbuf
 from numba.core import config
 from numba.cuda import utils, serialize
 from .error import CudaSupportError, CudaDriverError
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index aeac72e5e..ad3b57a85 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -13,18 +13,20 @@
 import re
 from warnings import warn
 
-from numba import cuda, _dispatcher
+from numba.core import types, config, errors, entrypoints
+from numba.cuda import serialize, utils
+from numba import cuda
 
-from numba.core import types, config
 from numba.core.compiler_lock import global_compiler_lock
-from numba.core.dispatcher import _DispatcherBase
-from numba.core.errors import NumbaPerformanceWarning, TypingError
+from numba.core.typeconv.rules import default_type_manager
+from numba.cuda.typing.templates import fold_arguments
 from numba.core.typing.typeof import Purpose, typeof
-from numba.cuda import serialize, utils, typing
+from numba.cuda import typing
 from numba.cuda import types as cuda_types
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
+from numba.core.bytecode import get_code_object
 from numba.cuda.compiler import (
     compile_cuda,
     CUDACompiler,
@@ -41,11 +43,12 @@
     missing_launch_config_msg,
     normalize_kernel_dimensions,
 )
-from numba.cuda.typing.templates import fold_arguments
 from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY
 
+from numba.cuda.cext import _dispatcher
+
 
 cuda_fp16_math_funcs = [
     "hsin",
@@ -694,7 +697,7 @@ def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
             f"Grid size {grid_size} will likely result in GPU "
             "under-utilization due to low occupancy."
         )
-        warn(NumbaPerformanceWarning(msg))
+        warn(errors.NumbaPerformanceWarning(msg))
 
     def __call__(self, *args):
         return self.dispatcher.call(
@@ -737,6 +740,601 @@ def load_overload(self, sig, target_context):
         return super().load_overload(sig, target_context)
 
 
+class OmittedArg(object):
+    """
+    A placeholder for omitted arguments with a default value.
+    """
+
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return "omitted arg(%r)" % (self.value,)
+
+    @property
+    def _numba_type_(self):
+        return types.Omitted(self.value)
+
+
+class CompilingCounter(object):
+    """
+    A simple counter that increments in __enter__ and decrements in __exit__.
+    """
+
+    def __init__(self):
+        self.counter = 0
+
+    def __enter__(self):
+        assert self.counter >= 0
+        self.counter += 1
+
+    def __exit__(self, *args, **kwargs):
+        self.counter -= 1
+        assert self.counter >= 0
+
+    def __bool__(self):
+        return self.counter > 0
+
+    __nonzero__ = __bool__
+
+
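+# Illustration (editor's sketch, hypothetical session): OmittedArg defers
+# typing of defaulted arguments so the dispatcher can specialize on the
+# default value:
+#
+#     >>> arg = OmittedArg(42)
+#     >>> arg
+#     omitted arg(42)
+#     >>> arg._numba_type_   # roughly types.Omitted(42)
+
+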
+class _DispatcherBase(_dispatcher.Dispatcher):
+    """
+    Common base class for dispatcher implementations.
+    """
+
+    __numba__ = "py_func"
+
+    def __init__(
+        self, arg_count, py_func, pysig, can_fallback, exact_match_required
+    ):
+        self._tm = default_type_manager
+
+        # A mapping of signatures to compile results
+        self.overloads = collections.OrderedDict()
+
+        self.py_func = py_func
+        # other parts of Numba assume the old Python 2 name for the code object
+        self.func_code = get_code_object(py_func)
+        # but newer Python uses a different name
+        self.__code__ = self.func_code
+        # a place to keep an active reference to the types of the active call
+        self._types_active_call = set()
+        # Default argument values match the py_func
+        self.__defaults__ = py_func.__defaults__
+
+        argnames = tuple(pysig.parameters)
+        default_values = self.py_func.__defaults__ or ()
+        defargs = tuple(OmittedArg(val) for val in default_values)
+        try:
+            lastarg = list(pysig.parameters.values())[-1]
+        except IndexError:
+            has_stararg = False
+        else:
+            has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL
+        _dispatcher.Dispatcher.__init__(
+            self,
+            self._tm.get_pointer(),
+            arg_count,
+            self._fold_args,
+            argnames,
+            defargs,
+            can_fallback,
+            has_stararg,
+            exact_match_required,
+        )
+
+        self.doc = py_func.__doc__
+        self._compiling_counter = CompilingCounter()
+        weakref.finalize(self, self._make_finalizer())
+
+    def _compilation_chain_init_hook(self):
+        """
+        This will be called ahead of any part of compilation taking place (this
+        even includes being ahead of working out the types of the arguments).
+        This permits activities such as initialising extension entry points so
+        that the compiler knows about additional externally defined types etc.
+        before it does anything.
+        """
+        entrypoints.init_all()
+
+    def _reset_overloads(self):
+        self._clear()
+        self.overloads.clear()
+
+    def _make_finalizer(self):
+        """
+        Return a finalizer function that will release references to
+        related compiled functions.
+        """
+        overloads = self.overloads
+        targetctx = self.targetctx
+
+        # Early-bind utils.shutting_down() into the function's local namespace
+        # (see issue #689)
+        def finalizer(shutting_down=utils.shutting_down):
+            # The finalizer may crash at shutdown, skip it (resources
+            # will be cleared by the process exiting, anyway).
+            if shutting_down():
+                return
+            # This function must *not* hold any reference to self:
+            # we take care to bind the necessary objects in the closure.
+            for cres in overloads.values():
+                try:
+                    targetctx.remove_user_function(cres.entry_point)
+                except KeyError:
+                    pass
+
+        return finalizer
+
+    @property
+    def signatures(self):
+        """
+        Returns a list of compiled function signatures.
+        """
+        return list(self.overloads)
+
+    @property
+    def nopython_signatures(self):
+        return [
+            cres.signature
+            for cres in self.overloads.values()
+            if not cres.objectmode
+        ]
+
+    def disable_compile(self, val=True):
+        """Disable the compilation of new signatures at call time."""
+        # If disabling compilation then there must be at least one signature
+        assert (not val) or len(self.signatures) > 0
+        self._can_compile = not val
+
+    def add_overload(self, cres):
+        args = tuple(cres.signature.args)
+        sig = [a._code for a in args]
+        self._insert(sig, cres.entry_point, cres.objectmode)
+        self.overloads[args] = cres
+
+    def fold_argument_types(self, args, kws):
+        return self._compiler.fold_argument_types(args, kws)
+
+    def get_call_template(self, args, kws):
+        """
+        Get a typing.ConcreteTemplate for this dispatcher and the given
+        *args* and *kws* types. This allows the return type to be resolved.
+
+        A (template, pysig, args, kws) tuple is returned.
+        """
+        # XXX how about a dispatcher template class automating the
+        # following?
+
+        # Fold keyword arguments and resolve default values
+        pysig, args = self._compiler.fold_argument_types(args, kws)
+        kws = {}
+        # Ensure an overload is available
+        if self._can_compile:
+            self.compile(tuple(args))
+
+        # Create function type for typing
+        func_name = self.py_func.__name__
+        name = "CallTemplate({0})".format(func_name)
+        # The `key` isn't really used except for diagnosis here,
+        # so avoid keeping a reference to `cfunc`.
+        call_template = typing.make_concrete_template(
+            name, key=func_name, signatures=self.nopython_signatures
+        )
+        return call_template, pysig, args, kws
+
+    def get_overload(self, sig):
+        """
+        Return the compiled function for the given signature.
+        """
+        args, return_type = sigutils.normalize_signature(sig)
+        return self.overloads[tuple(args)].entry_point
+
+    @property
+    def is_compiling(self):
+        """
+        Whether a specialization is currently being compiled.
+        """
+        return self._compiling_counter
+
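+    # Illustration (editor's sketch): these pieces surface through the public
+    # dispatcher API; with an eagerly compiled kernel (signature illustrative):
+    #
+    #     @cuda.jit("void(float32[:])")
+    #     def scale(a):
+    #         a[cuda.grid(1)] *= 2.0
+    #
+    #     scale.signatures        # -> [(array(float32, 1d, A),)]
+    #     scale.disable_compile() # freeze the compiled overloads
+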
+    def _compile_for_args(self, *args, **kws):
+        """
+        For internal use. Compile a specialized version of the function
+        for the given *args* and *kws*, and return the resulting callable.
+        """
+        assert not kws
+        # call any initialisation required for the compilation chain (e.g.
+        # extension point registration).
+        self._compilation_chain_init_hook()
+
+        def error_rewrite(e, issue_type):
+            """
+            Rewrite and raise Exception `e` with help supplied based on the
+            specified issue_type.
+            """
+            if config.SHOW_HELP:
+                help_msg = errors.error_extras[issue_type]
+                e.patch_message("\n".join((str(e).rstrip(), help_msg)))
+            if config.FULL_TRACEBACKS:
+                raise e
+            else:
+                raise e.with_traceback(None)
+
+        argtypes = []
+        for a in args:
+            if isinstance(a, OmittedArg):
+                argtypes.append(types.Omitted(a.value))
+            else:
+                argtypes.append(self.typeof_pyval(a))
+
+        return_val = None
+        try:
+            return_val = self.compile(tuple(argtypes))
+        except errors.ForceLiteralArg as e:
+            # Received request for compiler re-entry with the list of arguments
+            # indicated by e.requested_args.
+            # First, check if any of these args are already Literal-ized
+            already_lit_pos = [
+                i
+                for i in e.requested_args
+                if isinstance(args[i], types.Literal)
+            ]
+            if already_lit_pos:
+                # Abort compilation if any argument is already a Literal.
+                # Letting this continue will cause an infinite compilation
+                # loop.
+                m = (
+                    "Repeated literal typing request.\n"
+                    "{}.\n"
+                    "This is likely caused by an error in typing. "
+                    "Please see nested and suppressed exceptions."
+                )
+                info = ", ".join(
+                    "Arg #{} is {}".format(i, args[i])
+                    for i in sorted(already_lit_pos)
+                )
+                raise errors.CompilerError(m.format(info))
+            # Convert requested arguments into a Literal.
+            args = [
+                (types.literal if i in e.requested_args else lambda x: x)(
+                    args[i]
+                )
+                for i, v in enumerate(args)
+            ]
+            # Re-enter compilation with the Literal-ized arguments
+            return_val = self._compile_for_args(*args)
+
+        except errors.TypingError as e:
+            # Intercept typing error that may be due to an argument
+            # that failed to be inferred as a Numba type
+            failed_args = []
+            for i, arg in enumerate(args):
+                val = arg.value if isinstance(arg, OmittedArg) else arg
+                try:
+                    tp = typeof(val, Purpose.argument)
+                except (errors.NumbaValueError, ValueError) as typeof_exc:
+                    failed_args.append((i, str(typeof_exc)))
+                else:
+                    if tp is None:
+                        failed_args.append(
+                            (i, f"cannot determine Numba type of value {val}")
+                        )
+            if failed_args:
+                # Patch error message to ease debugging
+                args_str = "\n".join(
+                    f"- argument {i}: {err}" for i, err in failed_args
+                )
+                msg = (
+                    f"{str(e).rstrip()} \n\nThis error may have been caused "
+                    f"by the following argument(s):\n{args_str}\n"
+                )
+                e.patch_message(msg)
+
+            error_rewrite(e, "typing")
+        except errors.UnsupportedError as e:
+            # Something unsupported is present in the user code, add help info
+            error_rewrite(e, "unsupported_error")
+        except (
+            errors.NotDefinedError,
+            errors.RedefinedError,
+            errors.VerificationError,
+        ) as e:
+            # These errors are probably from an issue with either the code
+            # supplied being syntactically or otherwise invalid
+            error_rewrite(e, "interpreter")
+        except errors.ConstantInferenceError as e:
+            # this is from trying to infer something as constant when it isn't
+            # or isn't supported as a constant
+            error_rewrite(e, "constant_inference")
+        except Exception as e:
+            if config.SHOW_HELP:
+                if hasattr(e, "patch_message"):
+                    help_msg = errors.error_extras["reportable"]
+                    e.patch_message("\n".join((str(e).rstrip(), help_msg)))
+            # ignore the FULL_TRACEBACKS config, this needs reporting!
+            raise e
+        finally:
+            self._types_active_call.clear()
+        return return_val
+
+    def inspect_llvm(self, signature=None):
+        """Get the LLVM intermediate representation generated by compilation.
+
+        Parameters
+        ----------
+        signature : tuple of numba types, optional
+            Specify a signature for which to obtain the LLVM IR. If None, the
+            IR is returned for all available signatures.
+
+        Returns
+        -------
+        llvm : dict[signature, str] or str
+            Either the LLVM IR string for the specified signature, or, if no
+            signature was given, a dictionary mapping signatures to LLVM IR
+            strings.
+        """
+        if signature is not None:
+            lib = self.overloads[signature].library
+            return lib.get_llvm_str()
+
+        return dict((sig, self.inspect_llvm(sig)) for sig in self.signatures)
+
+    def inspect_asm(self, signature=None):
+        """Get the generated assembly code.
+
+        Parameters
+        ----------
+        signature : tuple of numba types, optional
+            Specify a signature for which to obtain the assembly code. If
+            None, the assembly code is returned for all available signatures.
+
+        Returns
+        -------
+        asm : dict[signature, str] or str
+            Either the assembly code for the specified signature, or, if no
+            signature was given, a dictionary mapping signatures to assembly
+            code.
+        """
+        if signature is not None:
+            lib = self.overloads[signature].library
+            return lib.get_asm_str()
+
+        return dict((sig, self.inspect_asm(sig)) for sig in self.signatures)
+
+    def inspect_types(
+        self, file=None, signature=None, pretty=False, style="default", **kwargs
+    ):
+        """Print/return Numba intermediate representation (IR)-annotated code.
+
+        Parameters
+        ----------
+        file : file-like object, optional
+            File to which to print. Defaults to sys.stdout if None. Must be
+            None if ``pretty=True``.
+        signature : tuple of numba types, optional
+            Print/return the intermediate representation for only the given
+            signature. If None, the IR is printed for all available signatures.
+        pretty : bool, optional
+            If True, an Annotate object will be returned that can render the
+            IR with color highlighting in Jupyter and IPython. ``file`` must
+            be None if ``pretty`` is True. Additionally, the ``pygments``
+            library must be installed for ``pretty=True``.
+        style : str, optional
+            Choose a style for rendering. Ignored if ``pretty`` is ``False``.
+            This is directly consumed by ``pygments`` formatters. To see a
+            list of available styles, import ``pygments`` and run
+            ``list(pygments.styles.get_all_styles())``.
+
+        Returns
+        -------
+        annotated : Annotate object, optional
+            Only returned if ``pretty=True``, otherwise this function is only
+            used for its printing side effect. If ``pretty=True``, an Annotate
+            object is returned that can render itself in Jupyter and IPython.
+        """
+        overloads = self.overloads
+        if signature is not None:
+            overloads = {signature: self.overloads[signature]}
+
+        if not pretty:
+            if file is None:
+                file = sys.stdout
+
+            for ver, res in overloads.items():
+                print("%s %s" % (self.py_func.__name__, ver), file=file)
+                print("-" * 80, file=file)
+                print(res.type_annotation, file=file)
+                print("=" * 80, file=file)
+        else:
+            if file is not None:
+                raise ValueError("`file` must be None if `pretty=True`")
+            from numba.core.annotations.pretty_annotate import Annotate
+
+            return Annotate(self, signature=signature, style=style)
+
+    def inspect_cfg(self, signature=None, show_wrapper=None, **kwargs):
+        """
+        For inspecting the CFG of the function.
+
+        By default the CFG of the user function is shown. The *show_wrapper*
+        option can be set to "python" or "cfunc" to show the python wrapper
+        function or the *cfunc* wrapper function, respectively.
+
+        Parameters accepted in kwargs
+        -----------------------------
+        filename : string, optional
+            the name of the output file; if given, the output is written to
+            this file
+        view : bool, optional
+            whether to immediately view the optional output file
+        highlight : bool, set, dict, optional
+            what, if anything, to highlight, options are:
+            { incref : bool, # highlight NRT_incref calls
+              decref : bool, # highlight NRT_decref calls
+              returns : bool, # highlight exits which are normal returns
+              raises : bool, # highlight exits which are from raise
+              meminfo : bool, # highlight calls to NRT*meminfo
+              branches : bool, # highlight true/false branches
+            }
+            Default is True, which sets all of the above to True. Supplying a
+            set of strings is also accepted; these are interpreted as key:True
+            with respect to the above dictionary. e.g. {'incref', 'decref'}
+            would switch on highlighting of increfs and decrefs.
+        interleave: bool, set, dict, optional
+            what, if anything, to interleave in the LLVM IR, options are:
+            { python: bool # interleave python source code with the LLVM IR
+              lineinfo: bool # interleave line information markers with the
+                             # LLVM IR
+            }
+            Default is True, which sets all of the above to True. Supplying a
+            set of strings is also accepted; these are interpreted as key:True
+            with respect to the above dictionary. e.g. {'python',} would
+            switch on interleaving of python source code in the LLVM IR.
+        strip_ir : bool, optional
+            Default is False. If set to True all LLVM IR that is superfluous to
+            that requested in kwarg `highlight` will be removed.
+        show_key : bool, optional
+            Default is True. Create a "key" for the highlighting in the
+            rendered CFG.
+        fontsize : int, optional
+            Default is 8. Set the fontsize in the output to this value.
+        """
+        if signature is not None:
+            cres = self.overloads[signature]
+            lib = cres.library
+            if show_wrapper == "python":
+                fname = cres.fndesc.llvm_cpython_wrapper_name
+            elif show_wrapper == "cfunc":
+                fname = cres.fndesc.llvm_cfunc_wrapper_name
+            else:
+                fname = cres.fndesc.mangled_name
+            return lib.get_function_cfg(fname, py_func=self.py_func, **kwargs)
+
+        return dict(
+            (sig, self.inspect_cfg(sig, show_wrapper=show_wrapper))
+            for sig in self.signatures
+        )
+
+    def inspect_disasm_cfg(self, signature=None):
+        """
+        For inspecting the CFG of the disassembly of the function.
+
+        Requires python package: r2pipe
+        Requires radare2 binary on $PATH.
+        Notebook rendering requires python package: graphviz
+
+        signature : tuple of Numba types, optional
+            Print/return the disassembly CFG for only the given signatures.
+            If None, the CFG is printed for all available signatures.
+        """
+        if signature is not None:
+            cres = self.overloads[signature]
+            lib = cres.library
+            return lib.get_disasm_cfg(cres.fndesc.mangled_name)
+
+        return dict(
+            (sig, self.inspect_disasm_cfg(sig)) for sig in self.signatures
+        )
+
+    def get_annotation_info(self, signature=None):
+        """
+        Gets the annotation information for the function specified by
+        signature. If no signature is supplied a dictionary of signature to
+        annotation information is returned.
+        """
+        signatures = self.signatures if signature is None else [signature]
+        out = collections.OrderedDict()
+        for sig in signatures:
+            cres = self.overloads[sig]
+            ta = cres.type_annotation
+            key = (
+                ta.func_id.filename + ":" + str(ta.func_id.firstlineno + 1),
+                ta.signature,
+            )
+            out[key] = ta.annotate_raw()[key]
+        return out
+
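+    # Illustration (editor's sketch): typical use of the inspection helpers
+    # above, assuming `disp` is a dispatcher with compiled overloads:
+    #
+    #     for sig, ir in disp.inspect_llvm().items():  # dict when signature=None
+    #         print(sig, ir.splitlines()[0])
+    #     disp.inspect_types()                         # annotated IR to stdout
+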
+ """ + assert not kws, "kwargs not handled" + args = tuple([self.typeof_pyval(a) for a in args]) + # The order here must be deterministic for testing purposes, which + # is ensured by the OrderedDict. + sigs = self.nopython_signatures + # This will raise + self.typingctx.resolve_overload( + self.py_func, sigs, args, kws, allow_ambiguous=False + ) + + def _explain_matching_error(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + msg = "No matching definition for argument type(s) %s" % ", ".join( + map(str, args) + ) + raise TypeError(msg) + + def _search_new_conversions(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + Search for approximately matching signatures for the given arguments, + and ensure the corresponding conversions are registered in the C++ + type manager. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + found = False + for sig in self.nopython_signatures: + conv = self.typingctx.install_possible_conversions(args, sig.args) + if conv: + found = True + return found + + def __repr__(self): + return "%s(%s)" % (type(self).__name__, self.py_func) + + def typeof_pyval(self, val): + """ + Resolve the Numba type of Python value *val*. + This is called from numba._dispatcher as a fallback if the native code + cannot decide the type. + """ + try: + tp = typeof(val, Purpose.argument) + except (errors.NumbaValueError, ValueError): + tp = types.pyobject + else: + if tp is None: + tp = types.pyobject + self._types_active_call.add(tp) + return tp + + def _callback_add_timer(self, duration, cres, lock_name): + md = cres.metadata + # md can be None when code is loaded from cache + if md is not None: + timers = md.setdefault("timers", {}) + if lock_name not in timers: + # Only write if the metadata does not exist + timers[lock_name] = duration + else: + msg = f"'{lock_name} metadata is already defined." + raise AssertionError(msg) + + def _callback_add_compiler_timer(self, duration, cres): + return self._callback_add_timer( + duration, cres, lock_name="compiler_lock" + ) + + def _callback_add_llvm_timer(self, duration, cres): + return self._callback_add_timer(duration, cres, lock_name="llvm_lock") + + class _MemoMixin: __uuid = None # A {uuid -> instance} mapping, for deserialization @@ -828,7 +1426,7 @@ def _compile_cached(self, args, return_type): try: retval = self._compile_core(args, return_type) - except TypingError as e: + except errors.TypingError as e: self._failed_cache[key] = e return False, e else: @@ -1316,7 +1914,7 @@ def get_compile_result(self, sig): self.compile(atypes) else: msg = f"{sig} not available and compilation disabled" - raise TypingError(msg) + raise errors.TypingError(msg) return self.overloads[atypes] def recompile(self): @@ -1523,3 +2121,9 @@ def _reduce_states(self): Compiled definitions are discarded. 
""" return dict(py_func=self.py_func, targetoptions=self.targetoptions) + + +# Initialize typeof machinery +_dispatcher.typeof_init( + OmittedArg, dict((str(t), t._code) for t in types.number_domain) +) diff --git a/pyproject.toml b/pyproject.toml index 550a30057..e81e53950 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", + "numpy", ] [project] diff --git a/setup.py b/setup.py index 607822221..cb5d4b4db 100644 --- a/setup.py +++ b/setup.py @@ -2,16 +2,127 @@ # SPDX-License-Identifier: BSD-2-Clause import pathlib +import sys -from setuptools import setup +from setuptools import setup, Extension from setuptools.command.build_py import build_py from setuptools.command.editable_wheel import editable_wheel, _TopLevelFinder +from setuptools.command.build_ext import build_ext REDIRECTOR_PTH = "_numba_cuda_redirector.pth" REDIRECTOR_PY = "_numba_cuda_redirector.py" SITE_PACKAGES = pathlib.Path("site-packages") +def get_version(): + """Read version from VERSION file.""" + version_file = pathlib.Path(__file__).parent / "numba_cuda" / "VERSION" + return version_file.read_text().strip() + + +def get_ext_modules(): + """ + Return a list of Extension instances for the setup() call. + """ + # Note we don't import NumPy at the toplevel, since setup.py + # should be able to run without NumPy for pip to discover the + # build dependencies. Need NumPy headers and libm linkage. + import numpy as np + + np_compile_args = { + "include_dirs": [ + np.get_include(), + ], + } + if sys.platform != "win32": + np_compile_args["libraries"] = [ + "m", + ] + + ext_devicearray = Extension( + name="numba_cuda.numba.cuda.cext._devicearray", + sources=["numba_cuda/numba/cuda/cext/_devicearray.cpp"], + depends=[ + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_devicearray.h", + ], + include_dirs=["numba_cuda/numba/cuda/cext"], + extra_compile_args=["-std=c++11"], + ) + + install_name_tool_fixer = [] + if sys.platform == "darwin": + install_name_tool_fixer = ["-headerpad_max_install_names"] + + ext_mviewbuf = Extension( + name="numba_cuda.numba.cuda.cext.mviewbuf", + extra_link_args=install_name_tool_fixer, + sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], + ) + + dispatcher_sources = [ + "numba_cuda/numba/cuda/cext/_dispatcher.cpp", + "numba_cuda/numba/cuda/cext/_typeof.cpp", + "numba_cuda/numba/cuda/cext/_hashtable.cpp", + "numba_cuda/numba/cuda/cext/typeconv.cpp", + ] + ext_dispatcher = Extension( + name="numba_cuda.numba.cuda.cext._dispatcher", + sources=dispatcher_sources, + depends=[ + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_typeof.h", + "numba_cuda/numba/cuda/cext/_hashtable.h", + ], + extra_compile_args=["-std=c++11"], + **np_compile_args, + ) + + # Append our cext dir to include_dirs + ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/cext") + + return [ext_dispatcher, ext_mviewbuf, ext_devicearray] + + +def is_building(): + """ + Parse the setup.py command and return whether a build is requested. + If False is returned, only an informational command is run. + If True is returned, information about C extensions will have to + be passed to the setup() function. + """ + if len(sys.argv) < 2: + # User forgot to give an argument probably, let setuptools handle that. 
+        return True
+
+    build_commands = [
+        "build",
+        "build_py",
+        "build_ext",
+        "build_clib",
+        "build_scripts",
+        "install",
+        "install_lib",
+        "install_headers",
+        "install_scripts",
+        "install_data",
+        "sdist",
+        "bdist",
+        "bdist_dumb",
+        "bdist_rpm",
+        "bdist_wininst",
+        "check",
+        "build_docs",
+        "bdist_wheel",
+        "bdist_egg",
+        "develop",
+        "easy_install",
+        "test",
+        "editable_wheel",
+    ]
+    return any(bc in sys.argv[1:] for bc in build_commands)
+
+
 # Adapted from https://stackoverflow.com/a/71137790
 class build_py_with_redirector(build_py):  # noqa: N801
     """Include the redirector files in the generated wheel."""
@@ -72,9 +183,52 @@ def _select_strategy(self, name, tag, build_lib):
         return TopLevelFinderWithRedirector(self.distribution, name)
 
 
+cmdclass = {}
+
+numba_be_user_options = [
+    ("werror", None, "Build extensions with -Werror"),
+    ("wall", None, "Build extensions with -Wall"),
+    ("noopt", None, "Build extensions without optimization"),
+]
+
+
+class NumbaBuildExt(build_ext):
+    user_options = build_ext.user_options + numba_be_user_options
+    boolean_options = build_ext.boolean_options + ["werror", "wall", "noopt"]
+
+    def initialize_options(self):
+        super().initialize_options()
+        self.werror = 0
+        self.wall = 0
+        self.noopt = 0
+
+    def run(self):
+        extra_compile_args = []
+        if self.noopt:
+            if sys.platform == "win32":
+                extra_compile_args.append("/Od")
+            else:
+                extra_compile_args.append("-O0")
+        if self.werror:
+            extra_compile_args.append("-Werror")
+        if self.wall:
+            extra_compile_args.append("-Wall")
+        for ext in self.extensions:
+            ext.extra_compile_args.extend(extra_compile_args)
+
+        super().run()
+
+
+cmdclass["build_ext"] = NumbaBuildExt
+cmdclass["build_py"] = build_py_with_redirector
+cmdclass["editable_wheel"] = editable_wheel_with_redirector
+
+if is_building():
+    ext_modules = get_ext_modules()
+else:
+    ext_modules = []
+
 setup(
-    cmdclass={
-        "build_py": build_py_with_redirector,
-        "editable_wheel": editable_wheel_with_redirector,
-    }
+    cmdclass=cmdclass,
+    ext_modules=ext_modules,
 )
diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py
index 3c1301ba3..a35894baf 100644
--- a/site-packages/_numba_cuda_redirector.py
+++ b/site-packages/_numba_cuda_redirector.py
@@ -34,7 +34,7 @@ def ensure_initialized(self):
 
         numba_cuda_spec = importlib.util.find_spec("numba_cuda")
 
-    if numba_spec is None:
+    if numba_cuda_spec is None:
         warnings.warn(no_spec_msg.format("numba_cuda"))
         self.initialized = False
         return False
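Note (illustration, not part of the patch): after building with these changes, a quick smoke test is to import the relocated C extensions from their new package (a minimal check, assuming an installed or in-place build):

    from numba.cuda.cext import _devicearray, _dispatcher, mviewbuf

    for mod in (_devicearray, _dispatcher, mviewbuf):
        print(mod.__name__)  # the extensions should resolve under numba.cuda.cext

The custom flags defined in numba_be_user_options above follow standard setuptools option handling, e.g. `python setup.py build_ext --inplace --wall --werror`.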