diff --git a/.github/workflows/conda-python-build.yaml b/.github/workflows/conda-python-build.yaml index eaa45fdbf..ef72c9ac6 100644 --- a/.github/workflows/conda-python-build.yaml +++ b/.github/workflows/conda-python-build.yaml @@ -104,7 +104,7 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: conda-repo + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: "/tmp/conda-bld-output" - name: Publish conda package if: inputs.upload_to_anaconda diff --git a/.github/workflows/conda-python-tests.yaml b/.github/workflows/conda-python-tests.yaml index 4e30a4814..c438ebbb8 100644 --- a/.github/workflows/conda-python-tests.yaml +++ b/.github/workflows/conda-python-tests.yaml @@ -97,7 +97,9 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/docs-build.yaml b/.github/workflows/docs-build.yaml index 7973ce568..83c78fe8d 100644 --- a/.github/workflows/docs-build.yaml +++ b/.github/workflows/docs-build.yaml @@ -49,7 +49,7 @@ jobs: env: RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts container: - image: rapidsai/ci-conda:latest + image: rapidsai/ci-conda:cuda13.0.0-ubuntu24.04-py3.13 env: RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} steps: @@ -59,7 +59,9 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + with: + name: conda-repo-py3.13-amd64 + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index da52b2d39..b2b5a2a6a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -22,6 +22,7 @@ jobs: - test-conda-ctypes-binding - test-simulator - build-wheels + - build-wheels-windows - test-wheels-windows - test-wheels - test-wheels-ctypes-binding @@ -108,10 +109,11 @@ jobs: build_type: pull-request script: "ci/build_wheel.sh" matrix: ${{ needs.compute-matrix.outputs.BUILD_MATRIX }} + build-wheels-windows: + uses: ./.github/workflows/wheel-windows-build.yaml test-wheels-windows: needs: - - build-wheels - - compute-matrix + - build-wheels-windows uses: ./.github/workflows/wheel-windows-tests.yaml test-wheels: needs: @@ -169,4 +171,4 @@ jobs: build_type: pull-request script: "ci/coverage_report.sh" matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.11")) | .[0:1]' + matrix_filter: 'map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.12")) | .[0:1]' diff --git a/.github/workflows/simulator-test.yaml b/.github/workflows/simulator-test.yaml index 3202b20e0..fe306bc26 100644 --- a/.github/workflows/simulator-test.yaml +++ b/.github/workflows/simulator-test.yaml @@ -86,7 +86,9 @@ jobs: ref: ${{ inputs.sha }} fetch-depth: 0 - uses: actions/download-artifact@v4 - name: conda-repo + with: + name: conda-repo-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} + path: conda-repo - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/.github/workflows/wheel-windows-build.yaml b/.github/workflows/wheel-windows-build.yaml new file mode 100644 index 000000000..6a861db5c --- /dev/null +++ b/.github/workflows/wheel-windows-build.yaml @@ -0,0 +1,55 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +name: "CI: Build wheel on Windows" + +on: + workflow_call: + inputs: + sha: + type: string + repo: + type: string + script: + type: string + default: "./ci/build_wheel.ps1" + +jobs: + build: + name: ${{ matrix.PY_VER }}, windows + strategy: + fail-fast: false + matrix: + PY_VER: + - "3.10" + - "3.11" + - "3.12" + - "3.13" + runs-on: windows-2022 + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ inputs.repo }} + ref: ${{ inputs.sha }} + fetch-depth: 0 + + - name: Set up Python ${{ matrix.PY_VER }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.PY_VER }} + + - name: Set up MSVC + uses: ilammy/msvc-dev-cmd@v1 # TODO: ask admin to allow pinning commits + + - name: Python build + run: | + ${{ inputs.script }} + env: + CUDA_VER: "12.8.0" + + - name: Upload wheel + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: wheel-py${{ matrix.PY_VER }}-windows + path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheel-windows-tests.yaml b/.github/workflows/wheel-windows-tests.yaml index 1847ccf7e..766a2f9a5 100644 --- a/.github/workflows/wheel-windows-tests.yaml +++ b/.github/workflows/wheel-windows-tests.yaml @@ -15,24 +15,16 @@ on: default: "./ci/test_wheel.ps1" jobs: - compute-matrix: - runs-on: ubuntu-latest - outputs: - MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} - steps: - - name: Compute Python Test Matrix - id: compute-matrix - run: | - set -eo pipefail - export TEST_MATRIX="{ ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0' }" - MATRIX=$(yq -n -o json 'env(TEST_MATRIX)' | jq -c) - echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" tests: - name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, windows - needs: compute-matrix + name: ${{ matrix.PY_VER }}, windows strategy: fail-fast: false - matrix: ${{ fromJSON('{"ARCH":["amd64"],"PY_VER":["3.12"],"CUDA_VER":["12.8.0"]}') }} + matrix: + PY_VER: + - "3.10" + - "3.11" + - "3.12" + - "3.13" runs-on: "cuda-python-windows-gpu-github" steps: - uses: actions/checkout@v4 @@ -49,7 +41,9 @@ jobs: run: nvidia-smi - uses: actions/download-artifact@v4 - name: wheel + with: + name: wheel-py${{ matrix.PY_VER }}-windows + path: wheel - name: Display structure of downloaded files run: Get-ChildItem -Recurse @@ -63,4 +57,4 @@ jobs: run: | ${{ inputs.script }} env: - CUDA_VER: ${{ matrix.CUDA_VER }} + CUDA_VER: "12.8.0" diff --git a/.github/workflows/wheels-build.yaml b/.github/workflows/wheels-build.yaml index f184b9e08..bc3110b70 100644 --- a/.github/workflows/wheels-build.yaml +++ b/.github/workflows/wheels-build.yaml @@ -152,11 +152,11 @@ jobs: if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: sdist + name: sdist-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.sdist_path }} - name: Upload wheel if: "!cancelled()" uses: actions/upload-artifact@v4 with: - name: wheel + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH }} path: ${{ env.wheel_path }} diff --git a/.github/workflows/wheels-test.yaml b/.github/workflows/wheels-test.yaml index 415a54489..afa05076f 100644 --- a/.github/workflows/wheels-test.yaml +++ b/.github/workflows/wheels-test.yaml @@ -115,7 +115,9 @@ jobs: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false - uses: actions/download-artifact@v4 - name: wheel + with: + name: wheel-py${{ matrix.PY_VER }}-${{ matrix.ARCH 
}} + path: wheel - name: Display structure of downloaded files run: ls -R - name: Standardize repository information diff --git a/ci/build_wheel.ps1 b/ci/build_wheel.ps1 new file mode 100644 index 000000000..77608ac54 --- /dev/null +++ b/ci/build_wheel.ps1 @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-2-Clause + +$ErrorActionPreference = 'Stop' +Set-StrictMode -Version Latest + +function rapids-logger { + param ( + [Parameter(Mandatory=$true)] + [string]$Text + ) + + # Determine padding and box width + $padding = 2 + $boxWidth = $Text.Length + ($padding * 2) + $topBottom = '+' + ('-' * $boxWidth) + '+' + $middle = '|' + (' ' * $padding) + $Text + (' ' * $padding) + '|' + + # Print the box in green + Write-Host $topBottom -ForegroundColor Green + Write-Host $middle -ForegroundColor Green + Write-Host $topBottom -ForegroundColor Green +} + +rapids-logger "Install build package" +python -m pip install build + +rapids-logger "Build sdist and wheel" +python -m build . + +$wheel_path = Resolve-Path dist\numba_cuda*.whl | Select-Object -ExpandProperty Path +echo "Wheel path: $wheel_path" +echo "wheel_path=$wheel_path" >> $env:GITHUB_ENV diff --git a/ci/matrix.yml b/ci/matrix.yml index 6cf2e4dac..f810738c1 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -5,23 +5,30 @@ # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES] # build-matrix: + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } simulator-matrix: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.8.0', LINUX_VER: 'rockylinux8' } # We test "oldest" dependencies with the oldest supported Python version and # the second-newest Python version. 
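# For example, the coverage job in pr.yaml narrows the test matrix below to
# a single row with the jq filter:
#   map(select(.ARCH == "amd64" and .CUDA_VER == "12.9.1" and .PY_VER == "3.12")) | .[0:1]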
test-matrix: - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'amd64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'amd64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.9', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.10', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'oldest' } + - { CUDA_VER: '12.0.1', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'earliest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.2.2', ARCH: 'arm64', PY_VER: '3.11', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '12.9.1', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'oldest' } - - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.12', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } + - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } - { CUDA_VER: '13.0.0', ARCH: 'arm64', PY_VER: '3.13', LINUX_VER: 'ubuntu24.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' } diff --git a/conda/recipes/numba-cuda/meta.yaml b/conda/recipes/numba-cuda/meta.yaml index 7e2b49541..b03309822 100644 --- a/conda/recipes/numba-cuda/meta.yaml +++ b/conda/recipes/numba-cuda/meta.yaml @@ -17,7 +17,6 @@ 
 source:
   path: ../../..

 build:
-  noarch: python
   script:
     - {{ PYTHON }} -m pip install . -vv
@@ -26,6 +25,7 @@ requirements:
     - python
     - pip
     - setuptools
+    - numpy >=2.1.0
   run:
     - python
     - numba >=0.59.1
diff --git a/numba_cuda/numba/cuda/cext/__init__.py b/numba_cuda/numba/cuda/cext/__init__.py
new file mode 100644
index 000000000..79b94ffc4
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
diff --git a/numba_cuda/numba/cuda/cext/_devicearray.cpp b/numba_cuda/numba/cuda/cext/_devicearray.cpp
new file mode 100644
index 000000000..216612cf8
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/_devicearray.cpp
@@ -0,0 +1,161 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause
+
+/* This file contains the base class implementation for all device arrays. The
+ * base class is implemented in C so that computing typecodes for device arrays
+ * can be implemented efficiently. */
+
+#include "_pymodule.h"
+
+
+/* Include _devicearray.h, but make sure we don't get the definitions intended
+ * for consumers of the Device Array API.
+ */
+#define NUMBA_IN_DEVICEARRAY_CPP_
+#include "_devicearray.h"
+
+/* DeviceArray PyObject implementation. Note that adding more members here is
+ * presently prohibited because mapped and managed arrays derive from both
+ * DeviceArray and NumPy's ndarray, which is also a C extension class - the
+ * layout of the object cannot be resolved if this class also has members beyond
+ * PyObject_HEAD. */
+class DeviceArray {
+    PyObject_HEAD
+};
+
+/* Trivial traversal - DeviceArray instances own nothing. */
+static int
+DeviceArray_traverse(DeviceArray *self, visitproc visit, void *arg)
+{
+    return 0;
+}
+
+/* Trivial clear of all references - DeviceArray instances own nothing.
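+ * Both slots are still wired up in the type object below because it sets
+ * Py_TPFLAGS_HAVE_GC; the collector invokes them even though there is
+ * nothing to visit or clear.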
*/ +static int +DeviceArray_clear(DeviceArray *self) +{ + return 0; +} + +/* The _devicearray.DeviceArray type */ +PyTypeObject DeviceArrayType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_devicearray.DeviceArray", /* tp_name */ + sizeof(DeviceArray), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, + /* tp_flags*/ + "DeviceArray object", /* tp_doc */ + (traverseproc) DeviceArray_traverse, /* tp_traverse */ + (inquiry) DeviceArray_clear, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + +/* CUDA device array C API */ +static void *_DeviceArray_API[1] = { + (void*)&DeviceArrayType +}; + +MOD_INIT(_devicearray) { + PyObject *m = nullptr; + PyObject *d = nullptr; + PyObject *c_api = nullptr; + int error = 0; + + + MOD_DEF(m, "_devicearray", "No docs", NULL) + if (m == NULL) + goto error_occurred; + + c_api = PyCapsule_New((void *)_DeviceArray_API, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API", NULL); + if (c_api == NULL) + goto error_occurred; + + DeviceArrayType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DeviceArrayType) < 0) + goto error_occurred; + + + Py_INCREF(&DeviceArrayType); + error = PyModule_AddObject(m, "DeviceArray", (PyObject*)(&DeviceArrayType)); + if (error) + goto error_occurred; + + d = PyModule_GetDict(m); + if (d == NULL) + goto error_occurred; + + error = PyDict_SetItemString(d, "_DEVICEARRAY_API", c_api); + /* Decref and set c_api to NULL, Py_XDECREF in error_occurred will have no + * effect. */ + Py_CLEAR(c_api); + + if (error) + goto error_occurred; + + return MOD_SUCCESS_VAL(m); + +error_occurred: + Py_XDECREF(m); + Py_XDECREF(c_api); + Py_XDECREF((PyObject*)&DeviceArrayType); + + return MOD_ERROR_VAL; +} diff --git a/numba_cuda/numba/cuda/cext/_devicearray.h b/numba_cuda/numba/cuda/cext/_devicearray.h new file mode 100644 index 000000000..2e9df6cbb --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_devicearray.h @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause
+
+#ifndef NUMBA_DEVICEARRAY_H_
+#define NUMBA_DEVICEARRAY_H_
+
+#ifdef __cplusplus
+    extern "C" {
+#endif
+
+#define NUMBA_DEVICEARRAY_IMPORT_NAME "numba.cuda.cext._devicearray"
+/* These definitions should only be used by consumers of the Device Array API.
+ * Consumers access the API through the opaque pointer stored in
+ * _devicearray._DEVICEARRAY_API. We don't want these definitions in
+ * _devicearray.cpp itself because they would conflict with the actual
+ * implementations there.
+ */
+#ifndef NUMBA_IN_DEVICEARRAY_CPP_
+
+    extern void **DeviceArray_API;
+    #define DeviceArrayType (*(PyTypeObject*)DeviceArray_API[0])
+
+#endif /* ndef NUMBA_IN_DEVICEARRAY_CPP_ */
+
+#ifdef __cplusplus
+    }
+#endif
+
+#endif /* NUMBA_DEVICEARRAY_H_ */
diff --git a/numba_cuda/numba/cuda/cext/_dispatcher.cpp b/numba_cuda/numba/cuda/cext/_dispatcher.cpp
new file mode 100644
index 000000000..653421f9e
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/_dispatcher.cpp
@@ -0,0 +1,1092 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "_pymodule.h"
+
+#include <cstring>
+#include <ctime>
+#include <cassert>
+#include <vector>
+
+#include "_typeof.h"
+#include "frameobject.h"
+#include "traceback.h"
+#include "typeconv.hpp"
+#include "_devicearray.h"
+
+/*
+ * Notes on the C_TRACE macro:
+ *
+ * The original C_TRACE macro (from ceval.c) would call
+ * PyTrace_C_CALL et al., for which the frame argument wouldn't
+ * be usable. Since we explicitly synthesize a frame using the
+ * original Python code object, we call PyTrace_CALL instead so
+ * the profiler can report the correct source location.
+ *
+ * Likewise, while ceval.c would call PyTrace_C_EXCEPTION in case
+ * of error, the profiler would simply expect a RETURN in case of
+ * a Python function, so we generate that here (making sure the
+ * exception state is preserved correctly).
+ *
+ */
+
+#if (PY_MAJOR_VERSION >= 3) && ((PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13))
+
+#ifndef Py_BUILD_CORE
+    #define Py_BUILD_CORE 1
+#endif
+#include "internal/pycore_frame.h"
+// This is a fix suggested in the comments in https://github.com/python/cpython/issues/108216
+// specifically https://github.com/python/cpython/issues/108216#issuecomment-1696565797
+#ifdef HAVE_STD_ATOMIC
+#  undef HAVE_STD_ATOMIC
+#endif
+#undef _PyGC_FINALIZED
+
+/* dynamic_annotations.h is needed for building Python with --with-valgrind
+ * support.
The following include is to workaround issues described in + * https://github.com/numba/numba/pull/10073 + */ +#include "dynamic_annotations.h" +#if (PY_MINOR_VERSION == 12) + #include "internal/pycore_atomic.h" +#endif +#include "internal/pycore_interp.h" +#include "internal/pycore_pyerrors.h" +#include "internal/pycore_instruments.h" +#include "internal/pycore_call.h" +#include "cpython/code.h" + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 11) +#ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 +#endif +#include "internal/pycore_frame.h" +#include "internal/pycore_pyerrors.h" + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L6804 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) { + return 0; + } + if (frame == NULL) { + return -1; + } + int old_what = tstate->tracing_what; + tstate->tracing_what = what; + PyThreadState_EnterTracing(tstate); + result = func(obj, frame, what, NULL); + PyThreadState_LeaveTracing(tstate); + tstate->tracing_what = old_what; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + _PyErr_Fetch(tstate, &type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, what, arg); + if (err == 0) + { + _PyErr_Restore(tstate, type, value, traceback); + return 0; + } + else { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/deaf509e8fc6e0363bd6f26d52ad42f976ec42f2/Python/ceval.c#L7245 + * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521 + * has been removed, it's dealt with in call_cfunc. + */ +#define C_TRACE(x, call, frame) \ +if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_CALL, cfunc)) { \ + x = NULL; \ +} \ +else { \ + x = call; \ + if (tstate->c_profilefunc != NULL) { \ + if (x == NULL) { \ + call_trace_protected(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc); \ + /* XXX should pass (type, value, tb) */ \ + } else { \ + if (call_trace(tstate->c_profilefunc, \ + tstate->c_profileobj, \ + tstate, frame, \ + PyTrace_RETURN, cfunc)) { \ + Py_DECREF(x); \ + x = NULL; \ + } \ + } \ + } \ +} \ + +#elif (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 10 || PY_MINOR_VERSION == 11) + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L36-L40 + */ +typedef struct { + PyCodeObject *code; // The code object for the bounds. May be NULL. + PyCodeAddressRange bounds; // Only valid if code != NULL. + CFrame cframe; +} PyTraceInfo; + + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1257-L1266 + * NOTE: The function is renamed. 
+ */ +static void +_nb_PyLineTable_InitAddressRange(const char *linetable, Py_ssize_t length, int firstlineno, PyCodeAddressRange *range) +{ + range->opaque.lo_next = linetable; + range->opaque.limit = range->opaque.lo_next + length; + range->ar_start = -1; + range->ar_end = 0; + range->opaque.computed_line = firstlineno; + range->ar_line = -1; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Objects/codeobject.c#L1269-L1275 + * NOTE: The function is renamed. + */ +static int +_nb_PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds) +{ + const char *linetable = PyBytes_AS_STRING(co->co_linetable); + Py_ssize_t length = PyBytes_GET_SIZE(co->co_linetable); + _nb_PyLineTable_InitAddressRange(linetable, length, co->co_firstlineno, bounds); + return bounds->ar_line; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5468-L5475 + * NOTE: The call to _PyCode_InitAddressRange is renamed. + */ +static void +initialize_trace_info(PyTraceInfo *trace_info, PyFrameObject *frame) +{ + if (trace_info->code != frame->f_code) { + trace_info->code = frame->f_code; + _nb_PyCode_InitAddressRange(frame->f_code, &trace_info->bounds); + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5477-L5501 + */ +static int +call_trace(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + int result; + if (tstate->tracing) + return 0; + tstate->tracing++; + tstate->cframe->use_tracing = 0; + if (frame->f_lasti < 0) { + frame->f_lineno = frame->f_code->co_firstlineno; + } + else { + initialize_trace_info(trace_info, frame); + frame->f_lineno = _PyCode_CheckLineNumber(frame->f_lasti*sizeof(_Py_CODEUNIT), &trace_info->bounds); + } + result = func(obj, frame, what, arg); + frame->f_lineno = 0; + tstate->cframe->use_tracing = ((tstate->c_tracefunc != NULL) + || (tstate->c_profilefunc != NULL)); + tstate->tracing--; + return result; +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5445-L5466 + */ +static int +call_trace_protected(Py_tracefunc func, PyObject *obj, + PyThreadState *tstate, PyFrameObject *frame, + PyTraceInfo *trace_info, + int what, PyObject *arg) +{ + PyObject *type, *value, *traceback; + int err; + PyErr_Fetch(&type, &value, &traceback); + err = call_trace(func, obj, tstate, frame, trace_info, what, arg); + if (err == 0) + { + PyErr_Restore(type, value, traceback); + return 0; + } + else + { + Py_XDECREF(type); + Py_XDECREF(value); + Py_XDECREF(traceback); + return -1; + } +} + +/* + * Code originally from: + * https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5810-L5839 + * NOTE: The state test https://github.com/python/cpython/blob/c5bfb88eb6f82111bb1603ae9d78d0476b552d66/Python/ceval.c#L5811 + * has been removed, it's dealt with in call_cfunc. 
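+ * The macro brackets the wrapped call with PyTrace_CALL and PyTrace_RETURN
+ * events for the active C profiler, mirroring what the interpreter does for
+ * Python-level frames (see the notes on C_TRACE at the top of this file).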
+ */
+#define C_TRACE(x, call) \
+if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \
+               tstate, tstate->frame, &trace_info, PyTrace_CALL, \
+               cfunc)) \
+    x = NULL; \
+else \
+{ \
+    x = call; \
+    if (tstate->c_profilefunc != NULL) \
+    { \
+        if (x == NULL) \
+        { \
+            call_trace_protected(tstate->c_profilefunc, \
+                                 tstate->c_profileobj, \
+                                 tstate, tstate->frame, \
+                                 &trace_info, \
+                                 PyTrace_RETURN, cfunc); \
+            /* XXX should pass (type, value, tb) */ \
+        } \
+        else \
+        { \
+            if (call_trace(tstate->c_profilefunc, \
+                           tstate->c_profileobj, \
+                           tstate, tstate->frame, \
+                           &trace_info, \
+                           PyTrace_RETURN, cfunc)) \
+            { \
+                Py_DECREF(x); \
+                x = NULL; \
+            } \
+        } \
+    } \
+}
+
+#else // Python <3.10
+
+/*
+ * Code originally from:
+ * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4242-L4257
+ */
+static int
+call_trace(Py_tracefunc func, PyObject *obj,
+           PyThreadState *tstate, PyFrameObject *frame,
+           int what, PyObject *arg)
+{
+    int result;
+    if (tstate->tracing)
+        return 0;
+    tstate->tracing++;
+    tstate->use_tracing = 0;
+    result = func(obj, frame, what, arg);
+    tstate->use_tracing = ((tstate->c_tracefunc != NULL)
+                           || (tstate->c_profilefunc != NULL));
+    tstate->tracing--;
+    return result;
+}
+
+/*
+ * Code originally from:
+ * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4220-L4240
+ */
+static int
+call_trace_protected(Py_tracefunc func, PyObject *obj,
+                     PyThreadState *tstate, PyFrameObject *frame,
+                     int what, PyObject *arg)
+{
+    PyObject *type, *value, *traceback;
+    int err;
+    PyErr_Fetch(&type, &value, &traceback);
+    err = call_trace(func, obj, tstate, frame, what, arg);
+    if (err == 0)
+    {
+        PyErr_Restore(type, value, traceback);
+        return 0;
+    }
+    else
+    {
+        Py_XDECREF(type);
+        Py_XDECREF(value);
+        Py_XDECREF(traceback);
+        return -1;
+    }
+}
+
+/*
+ * Code originally from:
+ * https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4520-L4549
+ * NOTE: The state test https://github.com/python/cpython/blob/d5650a1738fe34f6e1db4af5f4c4edb7cae90a36/Python/ceval.c#L4521
+ * has been removed, it's dealt with in call_cfunc.
+ */
+#define C_TRACE(x, call) \
+if (call_trace(tstate->c_profilefunc, tstate->c_profileobj, \
+               tstate, tstate->frame, PyTrace_CALL, cfunc)) \
+    x = NULL; \
+else \
+{ \
+    x = call; \
+    if (tstate->c_profilefunc != NULL) \
+    { \
+        if (x == NULL) \
+        { \
+            call_trace_protected(tstate->c_profilefunc, \
+                                 tstate->c_profileobj, \
+                                 tstate, tstate->frame, \
+                                 PyTrace_RETURN, cfunc); \
+            /* XXX should pass (type, value, tb) */ \
+        } \
+        else \
+        { \
+            if (call_trace(tstate->c_profilefunc, \
+                           tstate->c_profileobj, \
+                           tstate, tstate->frame, \
+                           PyTrace_RETURN, cfunc)) \
+            { \
+                Py_DECREF(x); \
+                x = NULL; \
+            } \
+        } \
+    } \
+}
+
+
+#endif
+
+typedef std::vector<Type> TypeTable;
+typedef std::vector<PyObject*> Functions;
+
+/* The Dispatcher class is the base class of all dispatchers in the CUDA target.
+   Its main responsibilities are:
+
+   - Resolving the best overload to call for a given set of arguments, and
+   - Calling the resolved overload.
+
+   This logic is implemented within this class for efficiency (lookup of the
+   appropriate overload needs to be fast) and ease of implementation (calling
+   directly into a compiled function using a function pointer is easier within
+   the C++ code where the overload has been resolved).
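+
+   As a rough sketch, the Python side is assumed to drive this type roughly
+   as follows (the method names are those registered in Dispatcher_methods
+   below; the exact call sites live in the Python-side Dispatcher subclass,
+   so this is illustrative rather than definitive):
+
+       disp._insert(sig, kernel, cuda=True)   # register an overload
+       defn = disp._cuda_call(*args)          # resolve the overload for args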
*/
+class Dispatcher {
+public:
+    PyObject_HEAD
+    /* Whether compilation of new overloads is permitted */
+    char can_compile;
+    /* Whether fallback to object mode is permitted */
+    char can_fallback;
+    /* Whether types must match exactly when resolving overloads.
+       If not, conversions (e.g. float32 -> float64) are permitted when
+       searching for a match. */
+    char exact_match_required;
+    /* Borrowed reference */
+    PyObject *fallbackdef;
+    /* Whether to fold named arguments and default values
+       (false for lifted loops) */
+    int fold_args;
+    /* Whether the last positional argument is a stararg */
+    int has_stararg;
+    /* Tuple of argument names */
+    PyObject *argnames;
+    /* Tuple of default values */
+    PyObject *defargs;
+    /* Number of arguments to function */
+    int argct;
+    /* Used for selecting overloaded function implementations */
+    TypeManager *tm;
+    /* An array of overloads */
+    Functions functions;
+    /* A flattened array of argument types to all overloads
+     * (invariant: sizeof(overloads) == argct * sizeof(functions)) */
+    TypeTable overloads;
+
+    /* Add a new overload. Parameters:
+
+       - args: An array of Type objects, one for each parameter
+       - callable: The callable implementing this overload. */
+    void addDefinition(Type args[], PyObject *callable) {
+        overloads.reserve(argct + overloads.size());
+        for (int i=0; i<argct; ++i) {
+            overloads.push_back(args[i]);
+        }
+        functions.push_back(callable);
+    }
+
+    /* Find the overload that best matches the given signature. Parameters:
+
+       - sig: An array of integer typecodes, one per argument
+       - matches: Output parameter; set to the number of matching overloads
+       - allow_unsafe: Whether unsafe type conversions may be considered
+       - exact_match_required: Whether only exact type matches are accepted
+
+       Returns the matching overload if there is exactly one match, NULL
+       otherwise. */
+    PyObject* resolve(int sig[], int &matches, bool allow_unsafe,
+                      bool exact_match_required) const {
+        const int ovct = functions.size();
+        int selected;
+        matches = 0;
+        if (0 == ovct) {
+            /* No overloads registered */
+            return NULL;
+        }
+        if (argct == 0) {
+            /* Nullary function: trivial match on first overload */
+            matches = 1;
+            selected = 0;
+        }
+        else {
+            matches = tm->selectOverload(sig, &overloads[0], selected, argct,
+                                         ovct, allow_unsafe,
+                                         exact_match_required);
+        }
+        if (matches == 1) {
+            return functions[selected];
+        }
+        return NULL;
+    }
+
+    /* Remove all overloads */
+    void clear() {
+        functions.clear();
+        overloads.clear();
+    }
+
+};
+
+
+static int
+Dispatcher_traverse(Dispatcher *self, visitproc visit, void *arg)
+{
+    Py_VISIT(self->defargs);
+    return 0;
+}
+
+static void
+Dispatcher_dealloc(Dispatcher *self)
+{
+    Py_XDECREF(self->argnames);
+    Py_XDECREF(self->defargs);
+    self->clear();
+    Py_TYPE(self)->tp_free((PyObject*)self);
+}
+
+
+static int
+Dispatcher_init(Dispatcher *self, PyObject *args, PyObject *kwds)
+{
+    PyObject *tmaddrobj;
+    void *tmaddr;
+    int argct;
+    int can_fallback;
+    int has_stararg = 0;
+    int exact_match_required = 0;
+
+    if (!PyArg_ParseTuple(args, "OiiO!O!i|ii", &tmaddrobj, &argct,
+                          &self->fold_args,
+                          &PyTuple_Type, &self->argnames,
+                          &PyTuple_Type, &self->defargs,
+                          &can_fallback,
+                          &has_stararg,
+                          &exact_match_required
+                          )) {
+        return -1;
+    }
+    Py_INCREF(self->argnames);
+    Py_INCREF(self->defargs);
+    tmaddr = PyLong_AsVoidPtr(tmaddrobj);
+    self->tm = static_cast<TypeManager*>(tmaddr);
+    self->argct = argct;
+    self->can_compile = 1;
+    self->can_fallback = can_fallback;
+    self->fallbackdef = NULL;
+    self->has_stararg = has_stararg;
+    self->exact_match_required = exact_match_required;
+    return 0;
+}
+
+static PyObject *
+Dispatcher_clear(Dispatcher *self, PyObject *args)
+{
+    self->clear();
+    Py_RETURN_NONE;
+}
+
+static
+PyObject*
+Dispatcher_Insert(Dispatcher *self, PyObject *args, PyObject *kwds)
+{
+    /* The cuda kwarg is a temporary addition until CUDA overloads are compiled
+     * functions. Once they are compiled functions, kwargs can be removed from
+     * this function.
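+     * The sig argument is a sequence of integer typecodes, one per
+     * parameter; each element is converted with PyLong_AsLong before being
+     * handed to addDefinition().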
*/ + static char *keywords[] = { + (char*)"sig", + (char*)"func", + (char*)"objectmode", + (char*)"cuda", + NULL + }; + + PyObject *sigtup, *cfunc; + int i, sigsz; + int *sig; + int objectmode = 0; + int cuda = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|ip", keywords, &sigtup, + &cfunc, &objectmode, &cuda)) { + return NULL; + } + + if (!cuda && !PyObject_TypeCheck(cfunc, &PyCFunction_Type) ) { + PyErr_SetString(PyExc_TypeError, "must be builtin_function_or_method"); + return NULL; + } + + sigsz = PySequence_Fast_GET_SIZE(sigtup); + sig = new int[sigsz]; + + for (i = 0; i < sigsz; ++i) { + sig[i] = PyLong_AsLong(PySequence_Fast_GET_ITEM(sigtup, i)); + } + + /* The reference to cfunc is borrowed; this only works because the + derived Python class also stores an (owned) reference to cfunc. */ + self->addDefinition(sig, cfunc); + + /* Add pure python fallback */ + if (!self->fallbackdef && objectmode){ + self->fallbackdef = cfunc; + } + + delete[] sig; + + Py_RETURN_NONE; +} + +static +void explain_issue(PyObject *dispatcher, PyObject *args, PyObject *kws, + const char *method_name, const char *default_msg) +{ + PyObject *callback, *result; + callback = PyObject_GetAttrString(dispatcher, method_name); + if (!callback) { + PyErr_SetString(PyExc_TypeError, default_msg); + return; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result != NULL) { + PyErr_Format(PyExc_RuntimeError, "%s must raise an exception", + method_name); + Py_DECREF(result); + } +} + +static +void explain_ambiguous(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_ambiguous", + "Ambiguous overloading"); +} + +static +void explain_matching_error(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + explain_issue(dispatcher, args, kws, "_explain_matching_error", + "No matching definition"); +} + +static +int search_new_conversions(PyObject *dispatcher, PyObject *args, PyObject *kws) +{ + PyObject *callback, *result; + int res; + + callback = PyObject_GetAttrString(dispatcher, + "_search_new_conversions"); + if (!callback) { + return -1; + } + result = PyObject_Call(callback, args, kws); + Py_DECREF(callback); + if (result == NULL) { + return -1; + } + if (!PyBool_Check(result)) { + Py_DECREF(result); + PyErr_SetString(PyExc_TypeError, + "_search_new_conversions() should return a boolean"); + return -1; + } + res = (result == Py_True) ? 1 : 0; + Py_DECREF(result); + return res; +} + + +/* A custom, fast, inlinable version of PyCFunction_Call() */ +static PyObject * +call_cfunc(Dispatcher *self, PyObject *cfunc, PyObject *args, PyObject *kws, PyObject *locals) +{ + PyCFunctionWithKeywords fn = NULL; + PyObject * pyresult = NULL; + + assert(PyCFunction_Check(cfunc)); + assert(PyCFunction_GET_FLAGS(cfunc) == (METH_VARARGS | METH_KEYWORDS)); + fn = (PyCFunctionWithKeywords) PyCFunction_GET_FUNCTION(cfunc); + + // make call + pyresult = fn(PyCFunction_GET_SELF(cfunc), args, kws); + + return pyresult; +} + + + + +/* A copy of compile_and_invoke, that only compiles. This is needed for CUDA + * kernels, because its overloads are Python instances of the _Kernel class, + * rather than compiled functions. Once CUDA overloads are compiled functions, + * cuda_compile_only can be removed. 
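+ * It simply defers to the _compile_for_args method defined on the
+ * Python-side Dispatcher subclass and returns whatever that call produces.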
*/ +static +PyObject* +cuda_compile_only(Dispatcher *self, PyObject *args, PyObject *kws, PyObject *locals) +{ + /* Compile a new one */ + PyObject *cfa, *cfunc; + cfa = PyObject_GetAttrString((PyObject*)self, "_compile_for_args"); + if (cfa == NULL) + return NULL; + + cfunc = PyObject_Call(cfa, args, kws); + Py_DECREF(cfa); + + return cfunc; +} + +static int +find_named_args(Dispatcher *self, PyObject **pargs, PyObject **pkws) +{ + PyObject *oldargs = *pargs, *newargs; + PyObject *kws = *pkws; + Py_ssize_t pos_args = PyTuple_GET_SIZE(oldargs); + Py_ssize_t named_args, total_args, i; + Py_ssize_t func_args = PyTuple_GET_SIZE(self->argnames); + Py_ssize_t defaults = PyTuple_GET_SIZE(self->defargs); + /* Last parameter with a default value */ + Py_ssize_t last_def = (self->has_stararg) + ? func_args - 2 + : func_args - 1; + /* First parameter with a default value */ + Py_ssize_t first_def = last_def - defaults + 1; + /* Minimum number of required arguments */ + Py_ssize_t minargs = first_def; + + if (kws != NULL) + named_args = PyDict_Size(kws); + else + named_args = 0; + total_args = pos_args + named_args; + if (!self->has_stararg && total_args > func_args) { + PyErr_Format(PyExc_TypeError, + "too many arguments: expected %d, got %d", + (int) func_args, (int) total_args); + return -1; + } + else if (total_args < minargs) { + if (minargs == func_args) + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected %d, got %d", + (int) minargs, (int) total_args); + else + PyErr_Format(PyExc_TypeError, + "not enough arguments: expected at least %d, got %d", + (int) minargs, (int) total_args); + return -1; + } + newargs = PyTuple_New(func_args); + if (!newargs) + return -1; + /* First pack the stararg */ + if (self->has_stararg) { + Py_ssize_t stararg_size = Py_MAX(0, pos_args - func_args + 1); + PyObject *stararg = PyTuple_New(stararg_size); + if (!stararg) { + Py_DECREF(newargs); + return -1; + } + for (i = 0; i < stararg_size; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, func_args - 1 + i); + Py_INCREF(value); + PyTuple_SET_ITEM(stararg, i, value); + } + /* Put it in last position */ + PyTuple_SET_ITEM(newargs, func_args - 1, stararg); + + } + for (i = 0; i < pos_args; i++) { + PyObject *value = PyTuple_GET_ITEM(oldargs, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + } + + /* Iterate over missing positional arguments, try to find them in + named arguments or default values. */ + for (i = pos_args; i < func_args; i++) { + PyObject *name = PyTuple_GET_ITEM(self->argnames, i); + if (self->has_stararg && i >= func_args - 1) { + /* Skip stararg */ + break; + } + if (kws != NULL) { + /* Named argument? */ + PyObject *value = PyDict_GetItem(kws, name); + if (value != NULL) { + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + named_args--; + continue; + } + } + if (i >= first_def && i <= last_def) { + /* Argument has a default value? 
*/ + PyObject *value = PyTuple_GET_ITEM(self->defargs, i - first_def); + Py_INCREF(value); + PyTuple_SET_ITEM(newargs, i, value); + continue; + } + else if (i < func_args - 1 || !self->has_stararg) { + PyErr_Format(PyExc_TypeError, + "missing argument '%s'", + PyString_AsString(name)); + Py_DECREF(newargs); + return -1; + } + } + if (named_args) { + PyErr_Format(PyExc_TypeError, + "some keyword arguments unexpected"); + Py_DECREF(newargs); + return -1; + } + *pargs = newargs; + *pkws = NULL; + return 0; +} + + + +/* Based on Dispatcher_call above, with the following differences: + 1. It does not invoke the definition of the function. + 2. It returns the definition, instead of a value returned by the function. + + This is because CUDA functions are, at present, _Kernel objects rather than + compiled functions. */ +static PyObject* +Dispatcher_cuda_call(Dispatcher *self, PyObject *args, PyObject *kws) +{ + PyObject *tmptype, *retval = NULL; + int *tys = NULL; + int argct; + int i; + int prealloc[24]; + int matches; + PyObject *cfunc; + PyThreadState *ts = PyThreadState_Get(); + PyObject *locals = NULL; + + /* If compilation is enabled, ensure that an exact match is found and if + * not compile one */ + int exact_match_required = self->can_compile ? 1 : self->exact_match_required; + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 10) + if (ts->tracing && ts->c_profilefunc) { +#else + if (ts->use_tracing && ts->c_profilefunc) { +#endif + locals = PyEval_GetLocals(); + if (locals == NULL) { + goto CLEANUP; + } + } + if (self->fold_args) { + if (find_named_args(self, &args, &kws)) + return NULL; + } + else + Py_INCREF(args); + /* Now we own a reference to args */ + + argct = PySequence_Fast_GET_SIZE(args); + + if (argct < (Py_ssize_t) (sizeof(prealloc) / sizeof(int))) + tys = prealloc; + else + tys = new int[argct]; + + for (i = 0; i < argct; ++i) { + tmptype = PySequence_Fast_GET_ITEM(args, i); + tys[i] = typeof_typecode((PyObject *) self, tmptype); + if (tys[i] == -1) { + if (self->can_fallback){ + /* We will clear the exception if fallback is allowed. */ + PyErr_Clear(); + } else { + goto CLEANUP; + } + } + } + + /* We only allow unsafe conversions if compilation of new specializations + has been disabled. */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + + if (matches == 0 && !self->can_compile) { + /* + * If we can't compile a new specialization, look for + * matching signatures for which conversions haven't been + * registered on the C++ TypeManager. 
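+         * search_new_conversions() calls back into the Python-level
+         * _search_new_conversions() hook and reports whether any new
+         * conversions were registered; if so, overload resolution is
+         * retried below.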
+ */ + int res = search_new_conversions((PyObject *) self, args, kws); + if (res < 0) { + retval = NULL; + goto CLEANUP; + } + if (res > 0) { + /* Retry with the newly registered conversions */ + cfunc = self->resolve(tys, matches, !self->can_compile, + exact_match_required); + } + } + + if (matches == 1) { + /* Definition is found */ + retval = cfunc; + Py_INCREF(retval); + } else if (matches == 0) { + /* No matching definition */ + if (self->can_compile) { + retval = cuda_compile_only(self, args, kws, locals); + } else if (self->fallbackdef) { + /* Have object fallback */ + retval = call_cfunc(self, self->fallbackdef, args, kws, locals); + } else { + /* Raise TypeError */ + explain_matching_error((PyObject *) self, args, kws); + retval = NULL; + } + } else if (self->can_compile) { + /* Ambiguous, but are allowed to compile */ + retval = cuda_compile_only(self, args, kws, locals); + } else { + /* Ambiguous */ + explain_ambiguous((PyObject *) self, args, kws); + retval = NULL; + } + +CLEANUP: + if (tys != prealloc) + delete[] tys; + Py_DECREF(args); + + return retval; +} + +static int +import_devicearray(void) +{ + PyObject *devicearray = PyImport_ImportModule(NUMBA_DEVICEARRAY_IMPORT_NAME); + if (devicearray == NULL) { + return -1; + } + + PyObject *d = PyModule_GetDict(devicearray); + if (d == NULL) { + Py_DECREF(devicearray); + return -1; + } + + PyObject *c_api = PyDict_GetItemString(d, "_DEVICEARRAY_API"); + if (PyCapsule_IsValid(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API")) { + DeviceArray_API = (void**)PyCapsule_GetPointer(c_api, NUMBA_DEVICEARRAY_IMPORT_NAME "._DEVICEARRAY_API"); + Py_DECREF(devicearray); + return 0; + } else { + Py_DECREF(devicearray); + return -1; + } +} + +static PyMethodDef Dispatcher_methods[] = { + { "_clear", (PyCFunction)Dispatcher_clear, METH_NOARGS, NULL }, + { "_insert", (PyCFunction)Dispatcher_Insert, METH_VARARGS | METH_KEYWORDS, + "insert new definition"}, + { "_cuda_call", (PyCFunction)Dispatcher_cuda_call, + METH_VARARGS | METH_KEYWORDS, "CUDA call resolution" }, + { NULL }, +}; + +static PyMemberDef Dispatcher_members[] = { + {(char*)"_can_compile", T_BOOL, offsetof(Dispatcher, can_compile), 0, NULL }, + {NULL} /* Sentinel */ +}; + + +static PyTypeObject DispatcherType = { + PyVarObject_HEAD_INIT(NULL, 0) + "_dispatcher.Dispatcher", /* tp_name */ + sizeof(Dispatcher), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Dispatcher_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags*/ + "Dispatcher object", /* tp_doc */ + (traverseproc) Dispatcher_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Dispatcher_methods, /* tp_methods */ + Dispatcher_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Dispatcher_init, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* 
tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + +/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + + + + +static PyObject *compute_fingerprint(PyObject *self, PyObject *args) +{ + PyObject *val; + if (!PyArg_ParseTuple(args, "O:compute_fingerprint", &val)) + return NULL; + return typeof_compute_fingerprint(val); +} + +static PyMethodDef ext_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(typeof_init), + declmethod(compute_fingerprint), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(_dispatcher) { + if (import_devicearray() < 0) { + PyErr_Print(); + PyErr_SetString(PyExc_ImportError, NUMBA_DEVICEARRAY_IMPORT_NAME " failed to import"); + return MOD_ERROR_VAL; + } + + PyObject *m; + MOD_DEF(m, "_dispatcher", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + + DispatcherType.tp_new = PyType_GenericNew; + if (PyType_Ready(&DispatcherType) < 0) { + return MOD_ERROR_VAL; + } + Py_INCREF(&DispatcherType); + PyModule_AddObject(m, "Dispatcher", (PyObject*)(&DispatcherType)); + + return MOD_SUCCESS_VAL(m); +} diff --git a/numba_cuda/numba/cuda/cext/_hashtable.cpp b/numba_cuda/numba/cuda/cext/_hashtable.cpp new file mode 100644 index 000000000..7d5dda962 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_hashtable.cpp @@ -0,0 +1,532 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +/* + * This file and _hashtable.h are from CPython 3.5. The symbols have been + * renamed from _Py_hashxxx to _Numba_hashxxx to avoid name clashes with + * the CPython definitions (including at runtime through dynamic linking). + * Those CPython APIs are private and can change in incompatible ways at + * any time. + * + * Command line used for renaming: + * $ sed -i -r 's/\b_Py_(has[h]table)/_Numba_\1/ig' numba/_hashtable.h numba/_hashtable.c + */ + +/* The implementation of the hash table (_Numba_hashtable_t) is based on the cfuhash + project: + http://sourceforge.net/projects/libcfu/ + + Copyright of cfuhash: + ---------------------------------- + Creation date: 2005-06-24 21:22:40 + Authors: Don + Change log: + + Copyright (c) 2005 Don Owens + All rights reserved. + + This code is released under the BSD license: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the author nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + OF THE POSSIBILITY OF SUCH DAMAGE. + ---------------------------------- +*/ + +#include "_pymodule.h" +#include "_hashtable.h" + +#define HASHTABLE_MIN_SIZE 16 +#define HASHTABLE_HIGH 0.50 +#define HASHTABLE_LOW 0.10 +#define HASHTABLE_REHASH_FACTOR 2.0 / (HASHTABLE_LOW + HASHTABLE_HIGH) + +#define BUCKETS_HEAD(SLIST) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(SLIST))) +#define TABLE_HEAD(HT, BUCKET) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_HEAD(&(HT)->buckets[BUCKET])) +#define ENTRY_NEXT(ENTRY) \ + ((_Numba_hashtable_entry_t *)_Py_SLIST_ITEM_NEXT(ENTRY)) +#define HASHTABLE_ITEM_SIZE(HT) \ + (sizeof(_Numba_hashtable_entry_t) + (HT)->data_size) + +/* Forward declaration */ +static void hashtable_rehash(_Numba_hashtable_t *ht); + +static void +_Py_slist_init(_Py_slist_t *list) +{ + list->head = NULL; +} + +static void +_Py_slist_prepend(_Py_slist_t *list, _Py_slist_item_t *item) +{ + item->next = list->head; + list->head = item; +} + +static void +_Py_slist_remove(_Py_slist_t *list, _Py_slist_item_t *previous, + _Py_slist_item_t *item) +{ + if (previous != NULL) + previous->next = item->next; + else + list->head = item->next; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_int(const void *key) +{ + return (Py_uhash_t)key; +} + +extern "C" Py_uhash_t +_Numba_hashtable_hash_ptr(const void *key) +{ + return (Py_uhash_t)_Py_HashPointer((void *)key); +} + +extern "C" int +_Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry) +{ + return entry->key == key; +} + +/* makes sure the real size of the buckets array is a power of 2 */ +static size_t +round_size(size_t s) +{ + size_t i; + if (s < HASHTABLE_MIN_SIZE) + return HASHTABLE_MIN_SIZE; + i = 1; + while (i < s) + i <<= 1; + return i; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new_full(size_t data_size, size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator) +{ + _Numba_hashtable_t *ht; + size_t buckets_size; + _Numba_hashtable_allocator_t alloc; + + if (allocator == NULL) { + alloc.malloc = PyMem_RawMalloc; + alloc.free = PyMem_RawFree; + } + else + alloc = *allocator; + + ht = (_Numba_hashtable_t *)alloc.malloc(sizeof(_Numba_hashtable_t)); + if (ht == NULL) + return ht; + + ht->num_buckets = round_size(init_size); + ht->entries = 0; + ht->data_size = data_size; + + buckets_size = ht->num_buckets * sizeof(ht->buckets[0]); + ht->buckets = (_Py_slist_t *) alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + alloc.free(ht); + return NULL; + } + memset(ht->buckets, 0, buckets_size); + + ht->hash_func = 
hash_func; + ht->compare_func = compare_func; + ht->copy_data_func = copy_data_func; + ht->free_data_func = free_data_func; + ht->get_data_size_func = get_data_size_func; + ht->alloc = alloc; + return ht; +} + +extern "C" _Numba_hashtable_t * +_Numba_hashtable_new(size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func) +{ + return _Numba_hashtable_new_full(data_size, HASHTABLE_MIN_SIZE, + hash_func, compare_func, + NULL, NULL, NULL, NULL); +} + +extern "C" size_t +_Numba_hashtable_size(_Numba_hashtable_t *ht) +{ + size_t size; + size_t hv; + + size = sizeof(_Numba_hashtable_t); + + /* buckets */ + size += ht->num_buckets * sizeof(_Numba_hashtable_entry_t *); + + /* entries */ + size += ht->entries * HASHTABLE_ITEM_SIZE(ht); + + /* data linked from entries */ + if (ht->get_data_size_func) { + for (hv = 0; hv < ht->num_buckets; hv++) { + _Numba_hashtable_entry_t *entry; + + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + void *data; + + data = _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + size += ht->get_data_size_func(data); + } + } + } + return size; +} + +#ifdef Py_DEBUG +extern "C" void +_Numba_hashtable_print_stats(_Numba_hashtable_t *ht) +{ + size_t size; + size_t chain_len, max_chain_len, total_chain_len, nchains; + _Numba_hashtable_entry_t *entry; + size_t hv; + double load; + + size = _Numba_hashtable_size(ht); + + load = (double)ht->entries / ht->num_buckets; + + max_chain_len = 0; + total_chain_len = 0; + nchains = 0; + for (hv = 0; hv < ht->num_buckets; hv++) { + entry = TABLE_HEAD(ht, hv); + if (entry != NULL) { + chain_len = 0; + for (; entry; entry = ENTRY_NEXT(entry)) { + chain_len++; + } + if (chain_len > max_chain_len) + max_chain_len = chain_len; + total_chain_len += chain_len; + nchains++; + } + } + printf("hash table %p: entries=%" + PY_FORMAT_SIZE_T "u/%" PY_FORMAT_SIZE_T "u (%.0f%%), ", + ht, ht->entries, ht->num_buckets, load * 100.0); + if (nchains) + printf("avg_chain_len=%.1f, ", (double)total_chain_len / nchains); + printf("max_chain_len=%" PY_FORMAT_SIZE_T "u, %" PY_FORMAT_SIZE_T "u kB\n", + max_chain_len, size / 1024); +} +#endif + +/* Get an entry. Return NULL if the key does not exist. 
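+ * The bucket index is computed as key_hash & (num_buckets - 1), which is
+ * valid because round_size() keeps the bucket count a power of two.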
*/ +extern "C" _Numba_hashtable_entry_t * +_Numba_hashtable_get_entry(_Numba_hashtable_t *ht, const void *key) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + } + + return entry; +} + +static int +_hashtable_pop_entry(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry, *previous; + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + previous = NULL; + for (entry = TABLE_HEAD(ht, index); entry != NULL; entry = ENTRY_NEXT(entry)) { + if (entry->key_hash == key_hash && ht->compare_func(key, entry)) + break; + previous = entry; + } + + if (entry == NULL) + return 0; + + _Py_slist_remove(&ht->buckets[index], (_Py_slist_item_t *)previous, + (_Py_slist_item_t *)entry); + ht->entries--; + + if (data != NULL) + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + ht->alloc.free(entry); + + if ((float)ht->entries / (float)ht->num_buckets < HASHTABLE_LOW) + hashtable_rehash(ht); + return 1; +} + +/* Add a new entry to the hash. The key must not be present in the hash table. + Return 0 on success, -1 on memory error. */ +extern "C" int +_Numba_hashtable_set(_Numba_hashtable_t *ht, const void *key, + void *data, size_t data_size) +{ + Py_uhash_t key_hash; + size_t index; + _Numba_hashtable_entry_t *entry; + + assert(data != NULL || data_size == 0); +#ifndef NDEBUG + /* Don't write the assertion on a single line because it is interesting + to know the duplicated entry if the assertion failed. The entry can + be read using a debugger. */ + entry = _Numba_hashtable_get_entry(ht, key); + assert(entry == NULL); +#endif + + key_hash = ht->hash_func(key); + index = key_hash & (ht->num_buckets - 1); + + entry = (_Numba_hashtable_entry_t *) ht->alloc.malloc(HASHTABLE_ITEM_SIZE(ht)); + if (entry == NULL) { + /* memory allocation failed */ + return -1; + } + + entry->key = (void *)key; + entry->key_hash = key_hash; + + assert(data_size == ht->data_size); + memcpy(_Numba_HASHTABLE_ENTRY_DATA(entry), data, data_size); + + _Py_slist_prepend(&ht->buckets[index], (_Py_slist_item_t*)entry); + ht->entries++; + + if ((float)ht->entries / (float)ht->num_buckets > HASHTABLE_HIGH) + hashtable_rehash(ht); + return 0; +} + +/* Get data from an entry. Copy entry data into data and return 1 if the entry + exists, return 0 if the entry does not exist. */ +extern "C" int +_Numba_hashtable_get(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + _Numba_hashtable_entry_t *entry; + + assert(data != NULL); + + entry = _Numba_hashtable_get_entry(ht, key); + if (entry == NULL) + return 0; + _Numba_HASHTABLE_ENTRY_READ_DATA(ht, data, data_size, entry); + return 1; +} + +extern "C" int +_Numba_hashtable_pop(_Numba_hashtable_t *ht, const void *key, void *data, size_t data_size) +{ + assert(data != NULL); + assert(ht->free_data_func == NULL); + return _hashtable_pop_entry(ht, key, data, data_size); +} + +/* Delete an entry. The entry must exist. 
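+ * In debug builds the removal is asserted to have found the key; in release
+ * builds a missing key is silently ignored.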
*/ +extern "C" void +_Numba_hashtable_delete(_Numba_hashtable_t *ht, const void *key) +{ +#ifndef NDEBUG + int found = _hashtable_pop_entry(ht, key, NULL, 0); + assert(found); +#else + (void)_hashtable_pop_entry(ht, key, NULL, 0); +#endif +} + +/* Prototype for a pointer to a function to be called foreach + key/value pair in the hash by hashtable_foreach(). Iteration + stops if a non-zero value is returned. */ +extern "C" int +_Numba_hashtable_foreach(_Numba_hashtable_t *ht, + int (*func) (_Numba_hashtable_entry_t *entry, void *arg), + void *arg) +{ + _Numba_hashtable_entry_t *entry; + size_t hv; + + for (hv = 0; hv < ht->num_buckets; hv++) { + for (entry = TABLE_HEAD(ht, hv); entry; entry = ENTRY_NEXT(entry)) { + int res = func(entry, arg); + if (res) + return res; + } + } + return 0; +} + +static void +hashtable_rehash(_Numba_hashtable_t *ht) +{ + size_t buckets_size, new_size, bucket; + _Py_slist_t *old_buckets = NULL; + size_t old_num_buckets; + + new_size = round_size((size_t)(ht->entries * HASHTABLE_REHASH_FACTOR)); + if (new_size == ht->num_buckets) + return; + + old_num_buckets = ht->num_buckets; + + buckets_size = new_size * sizeof(ht->buckets[0]); + old_buckets = ht->buckets; + ht->buckets = (_Py_slist_t *) ht->alloc.malloc(buckets_size); + if (ht->buckets == NULL) { + /* cancel rehash on memory allocation failure */ + ht->buckets = old_buckets ; + /* memory allocation failed */ + return; + } + memset(ht->buckets, 0, buckets_size); + + ht->num_buckets = new_size; + + for (bucket = 0; bucket < old_num_buckets; bucket++) { + _Numba_hashtable_entry_t *entry, *next; + for (entry = BUCKETS_HEAD(old_buckets[bucket]); entry != NULL; entry = next) { + size_t entry_index; + + assert(ht->hash_func(entry->key) == entry->key_hash); + next = ENTRY_NEXT(entry); + entry_index = entry->key_hash & (new_size - 1); + + _Py_slist_prepend(&ht->buckets[entry_index], (_Py_slist_item_t*)entry); + } + } + + ht->alloc.free(old_buckets); +} + +extern "C" void +_Numba_hashtable_clear(_Numba_hashtable_t *ht) +{ + _Numba_hashtable_entry_t *entry, *next; + size_t i; + + for (i=0; i < ht->num_buckets; i++) { + for (entry = TABLE_HEAD(ht, i); entry != NULL; entry = next) { + next = ENTRY_NEXT(entry); + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + } + _Py_slist_init(&ht->buckets[i]); + } + ht->entries = 0; + hashtable_rehash(ht); +} + +extern "C" void +_Numba_hashtable_destroy(_Numba_hashtable_t *ht) +{ + size_t i; + + for (i = 0; i < ht->num_buckets; i++) { + _Py_slist_item_t *entry = ht->buckets[i].head; + while (entry) { + _Py_slist_item_t *entry_next = entry->next; + if (ht->free_data_func) + ht->free_data_func(_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry)); + ht->alloc.free(entry); + entry = entry_next; + } + } + + ht->alloc.free(ht->buckets); + ht->alloc.free(ht); +} + +/* Return a copy of the hash table */ +extern "C" _Numba_hashtable_t * +_Numba_hashtable_copy(_Numba_hashtable_t *src) +{ + _Numba_hashtable_t *dst; + _Numba_hashtable_entry_t *entry; + size_t bucket; + int err; + void *data, *new_data; + + dst = _Numba_hashtable_new_full(src->data_size, src->num_buckets, + src->hash_func, src->compare_func, + src->copy_data_func, src->free_data_func, + src->get_data_size_func, &src->alloc); + if (dst == NULL) + return NULL; + + for (bucket=0; bucket < src->num_buckets; bucket++) { + entry = TABLE_HEAD(src, bucket); + for (; entry; entry = ENTRY_NEXT(entry)) { + if (src->copy_data_func) { + data = 
_Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(entry); + new_data = src->copy_data_func(data); + if (new_data != NULL) + err = _Numba_hashtable_set(dst, entry->key, + &new_data, src->data_size); + else + err = 1; + } + else { + data = _Numba_HASHTABLE_ENTRY_DATA(entry); + err = _Numba_hashtable_set(dst, entry->key, data, src->data_size); + } + if (err) { + _Numba_hashtable_destroy(dst); + return NULL; + } + } + } + return dst; +} diff --git a/numba_cuda/numba/cuda/cext/_hashtable.h b/numba_cuda/numba/cuda/cext/_hashtable.h new file mode 100644 index 000000000..9fb719472 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_hashtable.h @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +/* + * See _hashtable.c for more information about this file. + */ + +#ifndef Py_HASHTABLE_H +#define Py_HASHTABLE_H + +/* The whole API is private */ +#ifndef Py_LIMITED_API + +typedef struct _Py_slist_item_s { + struct _Py_slist_item_s *next; +} _Py_slist_item_t; + +typedef struct { + _Py_slist_item_t *head; +} _Py_slist_t; + +#define _Py_SLIST_ITEM_NEXT(ITEM) (((_Py_slist_item_t *)ITEM)->next) + +#define _Py_SLIST_HEAD(SLIST) (((_Py_slist_t *)SLIST)->head) + +typedef struct { + /* used by _Numba_hashtable_t.buckets to link entries */ + _Py_slist_item_t _Py_slist_item; + + const void *key; + Py_uhash_t key_hash; + + /* data follows */ +} _Numba_hashtable_entry_t; + +#define _Numba_HASHTABLE_ENTRY_DATA(ENTRY) \ + ((char *)(ENTRY) + sizeof(_Numba_hashtable_entry_t)) + +#define _Numba_HASHTABLE_ENTRY_DATA_AS_VOID_P(ENTRY) \ + (*(void **)_Numba_HASHTABLE_ENTRY_DATA(ENTRY)) + +#define _Numba_HASHTABLE_ENTRY_READ_DATA(TABLE, DATA, DATA_SIZE, ENTRY) \ + do { \ + assert((DATA_SIZE) == (TABLE)->data_size); \ + memcpy(DATA, _Numba_HASHTABLE_ENTRY_DATA(ENTRY), DATA_SIZE); \ + } while (0) + +typedef Py_uhash_t (*_Numba_hashtable_hash_func) (const void *key); +typedef int (*_Numba_hashtable_compare_func) (const void *key, const _Numba_hashtable_entry_t *he); +typedef void* (*_Numba_hashtable_copy_data_func)(void *data); +typedef void (*_Numba_hashtable_free_data_func)(void *data); +typedef size_t (*_Numba_hashtable_get_data_size_func)(void *data); + +typedef struct { + /* allocate a memory block */ + void* (*malloc) (size_t size); + + /* release a memory block */ + void (*free) (void *ptr); +} _Numba_hashtable_allocator_t; + +typedef struct { + size_t num_buckets; + size_t entries; /* Total number of entries in the table. 
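+                                  Kept in sync by set/pop/clear; the
+                                  rehash heuristics in _hashtable.c
+                                  compare it against num_buckets to
+                                  bound the load factor.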
*/ + _Py_slist_t *buckets; + size_t data_size; + + _Numba_hashtable_hash_func hash_func; + _Numba_hashtable_compare_func compare_func; + _Numba_hashtable_copy_data_func copy_data_func; + _Numba_hashtable_free_data_func free_data_func; + _Numba_hashtable_get_data_size_func get_data_size_func; + _Numba_hashtable_allocator_t alloc; +} _Numba_hashtable_t; + +/* hash and compare functions for integers and pointers */ +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_ptr(const void *key); +extern "C" PyAPI_FUNC(Py_uhash_t) _Numba_hashtable_hash_int(const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_compare_direct(const void *key, const _Numba_hashtable_entry_t *entry); + +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new( + size_t data_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_new_full( + size_t data_size, + size_t init_size, + _Numba_hashtable_hash_func hash_func, + _Numba_hashtable_compare_func compare_func, + _Numba_hashtable_copy_data_func copy_data_func, + _Numba_hashtable_free_data_func free_data_func, + _Numba_hashtable_get_data_size_func get_data_size_func, + _Numba_hashtable_allocator_t *allocator); +extern "C" PyAPI_FUNC(_Numba_hashtable_t *) _Numba_hashtable_copy(_Numba_hashtable_t *src); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_clear(_Numba_hashtable_t *ht); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_destroy(_Numba_hashtable_t *ht); + +typedef int (*_Numba_hashtable_foreach_func) (_Numba_hashtable_entry_t *entry, void *arg); + +extern "C" PyAPI_FUNC(int) _Numba_hashtable_foreach( + _Numba_hashtable_t *ht, + _Numba_hashtable_foreach_func func, void *arg); +extern "C" PyAPI_FUNC(size_t) _Numba_hashtable_size(_Numba_hashtable_t *ht); + +extern "C" PyAPI_FUNC(_Numba_hashtable_entry_t*) _Numba_hashtable_get_entry( + _Numba_hashtable_t *ht, + const void *key); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_set( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_get( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(int) _Numba_hashtable_pop( + _Numba_hashtable_t *ht, + const void *key, + void *data, + size_t data_size); +extern "C" PyAPI_FUNC(void) _Numba_hashtable_delete( + _Numba_hashtable_t *ht, + const void *key); + +#define _Numba_HASHTABLE_SET(TABLE, KEY, DATA) \ + _Numba_hashtable_set(TABLE, KEY, &(DATA), sizeof(DATA)) + +#define _Numba_HASHTABLE_GET(TABLE, KEY, DATA) \ + _Numba_hashtable_get(TABLE, KEY, &(DATA), sizeof(DATA)) + +#endif /* Py_LIMITED_API */ + +#endif diff --git a/numba_cuda/numba/cuda/cext/_pymodule.h b/numba_cuda/numba/cuda/cext/_pymodule.h new file mode 100644 index 000000000..4431dfff2 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_pymodule.h @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: BSD-2-Clause + +#ifndef NUMBA_PY_MODULE_H_ +#define NUMBA_PY_MODULE_H_ + +#define PY_SSIZE_T_CLEAN + +#include "Python.h" +#include "structmember.h" +#include "frameobject.h" + +#define MOD_ERROR_VAL NULL +#define MOD_SUCCESS_VAL(val) val +#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void) +#define MOD_DEF(ob, name, doc, methods) { \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, NULL, NULL, NULL, NULL }; \ + ob = PyModule_Create(&moduledef); } +#define MOD_INIT_EXEC(name) PyInit_##name(); + +#define PyString_AsString PyUnicode_AsUTF8 +#define PyString_Check PyUnicode_Check +#define PyString_FromFormat PyUnicode_FromFormat +#define PyString_FromString PyUnicode_FromString +#define PyString_InternFromString PyUnicode_InternFromString +#define PyInt_Type PyLong_Type +#define PyInt_Check PyLong_Check +#define PyInt_CheckExact PyLong_CheckExact +#define SetAttrStringFromVoidPointer(m, name) do { \ + PyObject *tmp = PyLong_FromVoidPtr((void *) &name); \ + PyObject_SetAttrString(m, #name, tmp); \ + Py_DECREF(tmp); } while (0) + + +#define NB_SUPPORTED_PYTHON_MINOR ((PY_MINOR_VERSION == 10) || (PY_MINOR_VERSION == 11) || (PY_MINOR_VERSION == 12) || (PY_MINOR_VERSION == 13)) + +#endif /* NUMBA_PY_MODULE_H_ */ diff --git a/numba_cuda/numba/cuda/cext/_typeof.cpp b/numba_cuda/numba/cuda/cext/_typeof.cpp new file mode 100644 index 000000000..6730b6d6f --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_typeof.cpp @@ -0,0 +1,1159 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#include "_pymodule.h" + +#include +#include +#include + +#include "_typeof.h" +#include "_hashtable.h" +#include "_devicearray.h" +#include "pyerrors.h" + +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include +#if NPY_ABI_VERSION >= 0x02000000 + #include +#endif + +#if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION == 13) + #ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 + #endif + #include "internal/pycore_setobject.h" // _PySet_NextEntry() +#endif + + +/* Cached typecodes for basic scalar types */ +static int tc_int8; +static int tc_int16; +static int tc_int32; +static int tc_int64; +static int tc_uint8; +static int tc_uint16; +static int tc_uint32; +static int tc_uint64; +static int tc_float32; +static int tc_float64; +static int tc_complex64; +static int tc_complex128; +static int BASIC_TYPECODES[12]; + +static int tc_intp; + +/* The type object for the numba .dispatcher.OmittedArg class + * that wraps omitted arguments. + */ +static PyObject *omittedarg_type; + +static PyObject *typecache; +static PyObject *ndarray_typecache; +static PyObject *structured_dtypes; + +static PyObject *str_typeof_pyval = NULL; +static PyObject *str_value = NULL; +static PyObject *str_numba_type = NULL; + +/* CUDA device array API */ +void **DeviceArray_API; + +/* + * Type fingerprint computation. 
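+ *
+ * Each supported value is encoded into a compact byte string using the
+ * opcodes defined below; e.g. the tuple (1, 2.0) encodes as the bytes
+ * '(' 'i' 'f' ')'.  A hypothetical caller can obtain the fingerprint
+ * of a value as a Python bytes object:
+ *
+ *     PyObject *fp = typeof_compute_fingerprint(val);
+ *     if (fp == NULL)
+ *         return NULL;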
+ */ + +typedef struct { + /* A buffer the fingerprint will be written to */ + char *buf; + size_t n; + size_t allocated; + /* A preallocated buffer, sufficient to fit the fingerprint for most types */ + char static_buf[40]; +} string_writer_t; + +static void +string_writer_init(string_writer_t *w) +{ + w->buf = w->static_buf; + w->n = 0; + w->allocated = sizeof(w->static_buf) / sizeof(unsigned char); +} + +static void +string_writer_clear(string_writer_t *w) +{ + if (w->buf != w->static_buf) + free(w->buf); +} + +static void +string_writer_move(string_writer_t *dest, const string_writer_t *src) +{ + dest->n = src->n; + dest->allocated = src->allocated; + if (src->buf == src->static_buf) { + dest->buf = dest->static_buf; + memcpy(dest->buf, src->buf, src->n); + } + else { + dest->buf = src->buf; + } +} + +/* Ensure at least *bytes* can be appended to the string writer's buffer. */ +static int +string_writer_ensure(string_writer_t *w, size_t bytes) +{ + size_t newsize; + bytes += w->n; + if (bytes <= w->allocated) + return 0; + newsize = (w->allocated << 2) + 1; + if (newsize < bytes) + newsize = bytes; + if (w->buf == w->static_buf) { + w->buf = (char *) malloc(newsize); + memcpy(w->buf, w->static_buf, w->allocated); + } + else + w->buf = (char *) realloc(w->buf, newsize); + if (w->buf) { + w->allocated = newsize; + return 0; + } + else { + PyErr_NoMemory(); + return -1; + } +} + +static int +string_writer_put_char(string_writer_t *w, unsigned char c) +{ + if (string_writer_ensure(w, 1)) + return -1; + w->buf[w->n++] = c; + return 0; +} + +static int +string_writer_put_int32(string_writer_t *w, unsigned int v) +{ + if (string_writer_ensure(w, 4)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; + w->n += 4; + return 0; +} + +static int +string_writer_put_intp(string_writer_t *w, npy_intp v) +{ + if (string_writer_ensure(w, NPY_SIZEOF_PY_INTPTR_T)) + return -1; + w->buf[w->n] = v & 0xff; + w->buf[w->n + 1] = (v >> 8) & 0xff; + w->buf[w->n + 2] = (v >> 16) & 0xff; + w->buf[w->n + 3] = (v >> 24) & 0xff; +#if NPY_SIZEOF_PY_INTPTR_T == 8 + w->buf[w->n + 4] = (v >> 32) & 0xff; + w->buf[w->n + 5] = (v >> 40) & 0xff; + w->buf[w->n + 6] = (v >> 48) & 0xff; + w->buf[w->n + 7] = (v >> 56) & 0xff; +#endif + w->n += NPY_SIZEOF_PY_INTPTR_T; + return 0; +} + +static int +string_writer_put_string(string_writer_t *w, const char *s) +{ + if (s == NULL) { + return string_writer_put_char(w, 0); + } + else { + size_t N = strlen(s) + 1; + if (string_writer_ensure(w, N)) + return -1; + memcpy(w->buf + w->n, s, N); + w->n += N; + return 0; + } +} + +enum opcode { + OP_START_TUPLE = '(', + OP_END_TUPLE = ')', + OP_INT = 'i', + OP_FLOAT = 'f', + OP_COMPLEX = 'c', + OP_BOOL = '?', + OP_OMITTED = '!', + + OP_BYTEARRAY = 'a', + OP_BYTES = 'b', + OP_NONE = 'n', + OP_LIST = '[', + OP_SET = '{', + + OP_BUFFER = 'B', + OP_NP_SCALAR = 'S', + OP_NP_ARRAY = 'A', + OP_NP_DTYPE = 'D' +}; + +#define TRY(func, w, arg) \ + do { \ + if (func(w, arg)) return -1; \ + } while (0) + + +static int +fingerprint_unrecognized(void) +{ + PyErr_SetString(PyExc_NotImplementedError, + "cannot compute type fingerprint for value"); + return -1; +} + +static int +compute_dtype_fingerprint(string_writer_t *w, PyArray_Descr *descr) +{ + int typenum = descr->type_num; + if (typenum < NPY_OBJECT) + return string_writer_put_char(w, (char) typenum); + if (typenum == NPY_VOID) { + /* Structured dtype: serialize the dtype pointer. 
Unfortunately, + * some structured dtypes can be ephemeral, so we have to + * intern them to avoid pointer reuse and fingerprint collisions. + * (e.g. np.recarray(dtype=some_dtype) creates a new dtype + * equal to some_dtype) + */ + PyObject *interned = PyDict_GetItem(structured_dtypes, + (PyObject *) descr); + if (interned == NULL) { + interned = (PyObject *) descr; + if (PyDict_SetItem(structured_dtypes, interned, interned)) + return -1; + } + TRY(string_writer_put_char, w, (char) typenum); + return string_writer_put_intp(w, (npy_intp) interned); + } +#if NPY_API_VERSION >= 0x00000007 + if (PyTypeNum_ISDATETIME(typenum)) { + PyArray_DatetimeMetaData *md; +#if NPY_ABI_VERSION >= 0x02000000 + md = &(((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(descr))->meta); +#else + md = &(((PyArray_DatetimeDTypeMetaData *)descr->c_metadata)->meta); +#endif + TRY(string_writer_put_char, w, (char) typenum); + TRY(string_writer_put_char, w, (char) md->base); + return string_writer_put_int32(w, (char) md->num); + } +#endif + + return fingerprint_unrecognized(); +} + +static int +compute_fingerprint(string_writer_t *w, PyObject *val) +{ + /* + * Implementation note: for performance, we start with common + * types that can be tested with fast checks. + */ + if (val == Py_None) + return string_writer_put_char(w, OP_NONE); + if (PyBool_Check(val)) + return string_writer_put_char(w, OP_BOOL); + /* Note we avoid matching int subclasses such as IntEnum */ + if (PyInt_CheckExact(val) || PyLong_CheckExact(val)) + return string_writer_put_char(w, OP_INT); + if (PyFloat_Check(val)) + return string_writer_put_char(w, OP_FLOAT); + if (PyComplex_CheckExact(val)) + return string_writer_put_char(w, OP_COMPLEX); + if (PyTuple_Check(val)) { + if(PyTuple_CheckExact(val)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(val); + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + TRY(string_writer_put_char, w, OP_END_TUPLE); + return 0; + } + /* as per typeof.py, check "_asdict" for namedtuple. */ + else if(PyObject_HasAttrString(val, "_asdict")) + { + /* + * This encodes the class name and field names of a namedtuple into + * the fingerprint on the condition that the number of fields is + * small (<10) and that the class name and field names are encodable + * as ASCII. 
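+ * For illustration (hypothetical): Point(x=1, y=2.0) built with
+ * collections.namedtuple('Point', 'x y') contributes "Point" followed
+ * by '(' "x" <int fingerprint> "y" <float fingerprint> ')'.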
+ */ + PyObject * clazz = NULL; + PyObject * name = NULL; + PyObject * _fields = PyObject_GetAttrString(val, "_fields"); + PyObject * field = NULL; + PyObject * ascii_str = NULL; + Py_ssize_t i, n, j, flen; + char * buf = NULL; + int ret; + + clazz = PyObject_GetAttrString(val, "__class__"); + if (clazz == NULL) + return -1; + + name = PyObject_GetAttrString(clazz, "__name__"); + Py_DECREF(clazz); + if (name == NULL) + return -1; + + ascii_str = PyUnicode_AsEncodedString(name, "ascii", "ignore"); + Py_DECREF(name); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + + if (_fields == NULL) + return -1; + + n = PyTuple_GET_SIZE(val); + + TRY(string_writer_put_char, w, OP_START_TUPLE); + for (i = 0; i < n; i++) { + field = PyTuple_GET_ITEM(_fields, i); + if (field == NULL) + return -1; + ascii_str = PyUnicode_AsEncodedString(field, "ascii", "ignore"); + if (ascii_str == NULL) + return -1; + ret = PyBytes_AsStringAndSize(ascii_str, &buf, &flen); + if (ret == -1) + return -1; + for(j = 0; j < flen; j++) { + TRY(string_writer_put_char, w, buf[j]); + } + Py_DECREF(ascii_str); + TRY(compute_fingerprint, w, PyTuple_GET_ITEM(val, i)); + } + TRY(string_writer_put_char, w, OP_END_TUPLE); + Py_DECREF(_fields); + return 0; + } + } + if (PyBytes_Check(val)) + return string_writer_put_char(w, OP_BYTES); + if (PyByteArray_Check(val)) + return string_writer_put_char(w, OP_BYTEARRAY); + if ((PyObject *) Py_TYPE(val) == omittedarg_type) { + PyObject *default_val = PyObject_GetAttr(val, str_value); + if (default_val == NULL) + return -1; + TRY(string_writer_put_char, w, OP_OMITTED); + TRY(compute_fingerprint, w, default_val); + Py_DECREF(default_val); + return 0; + } + if (PyArray_IsScalar(val, Generic)) { + /* Note: PyArray_DescrFromScalar() may be a bit slow on + non-trivial types. 
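+           E.g. a np.float32 scalar is encoded as OP_NP_SCALAR followed
+           by the dtype fingerprint from compute_dtype_fingerprint().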
*/ + PyArray_Descr *descr = PyArray_DescrFromScalar(val); + if (descr == NULL) + return -1; + TRY(string_writer_put_char, w, OP_NP_SCALAR); + TRY(compute_dtype_fingerprint, w, descr); + Py_DECREF(descr); + return 0; + } + if (PyArray_Check(val)) { + PyArrayObject *ary = (PyArrayObject *) val; + int ndim = PyArray_NDIM(ary); + + TRY(string_writer_put_char, w, OP_NP_ARRAY); + TRY(string_writer_put_int32, w, ndim); + if (PyArray_IS_C_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'C'); + else if (PyArray_IS_F_CONTIGUOUS(ary)) + TRY(string_writer_put_char, w, 'F'); + else + TRY(string_writer_put_char, w, 'A'); + if (PyArray_ISWRITEABLE(ary)) + TRY(string_writer_put_char, w, 'W'); + else + TRY(string_writer_put_char, w, 'R'); + return compute_dtype_fingerprint(w, PyArray_DESCR(ary)); + } + if (PyList_Check(val)) { + Py_ssize_t n = PyList_GET_SIZE(val); + if (n == 0) { + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty list"); + return -1; + } + /* Only the first item is considered, as in typeof.py */ + TRY(string_writer_put_char, w, OP_LIST); + TRY(compute_fingerprint, w, PyList_GET_ITEM(val, 0)); + return 0; + } + /* Note we only accept sets, not frozensets */ + if (Py_TYPE(val) == &PySet_Type) { + Py_hash_t h; + PyObject *item; + Py_ssize_t pos = 0; + /* Only one item is considered, as in typeof.py */ + if (!_PySet_NextEntry(val, &pos, &item, &h)) { + /* Empty set */ + PyErr_SetString(PyExc_ValueError, + "cannot compute fingerprint of empty set"); + return -1; + } + TRY(string_writer_put_char, w, OP_SET); + TRY(compute_fingerprint, w, item); + return 0; + } + if (PyObject_CheckBuffer(val)) { + Py_buffer buf; + int flags = PyBUF_ND | PyBUF_STRIDES | PyBUF_FORMAT; + char contig; + int ndim; + char readonly; + + /* Attempt to get a writable buffer, then fallback on read-only */ + if (PyObject_GetBuffer(val, &buf, flags | PyBUF_WRITABLE)) { + PyErr_Clear(); + if (PyObject_GetBuffer(val, &buf, flags)) + goto _unrecognized; + } + if (PyBuffer_IsContiguous(&buf, 'C')) + contig = 'C'; + else if (PyBuffer_IsContiguous(&buf, 'F')) + contig = 'F'; + else + contig = 'A'; + ndim = buf.ndim; + readonly = buf.readonly ? 'R' : 'W'; + if (string_writer_put_char(w, OP_BUFFER) || + string_writer_put_int32(w, ndim) || + string_writer_put_char(w, contig) || + string_writer_put_char(w, readonly) || + string_writer_put_string(w, buf.format) || + /* We serialize the object's Python type as well, to + distinguish between types which have Numba specializations + (e.g. array.array() vs. memoryview) + */ + string_writer_put_intp(w, (npy_intp) Py_TYPE(val))) { + PyBuffer_Release(&buf); + return -1; + } + PyBuffer_Release(&buf); + return 0; + } + if (PyObject_TypeCheck(val, &PyArrayDescr_Type)) { + TRY(string_writer_put_char, w, OP_NP_DTYPE); + return compute_dtype_fingerprint(w, (PyArray_Descr *) val); + } + +_unrecognized: + /* Type not recognized */ + return fingerprint_unrecognized(); +} + +PyObject * +typeof_compute_fingerprint(PyObject *val) +{ + PyObject *res; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) + goto error; + res = PyBytes_FromStringAndSize(w.buf, w.n); + + string_writer_clear(&w); + return res; + +error: + string_writer_clear(&w); + return NULL; +} + +/* + * Getting the typecode from a Type object. 
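+ *
+ * Numba Type instances expose their integer typecode as a `_code`
+ * attribute; -1 is returned (with a Python exception set) on failure.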
+ */ +static int +_typecode_from_type_object(PyObject *tyobj) { + int typecode; + PyObject *tmpcode = PyObject_GetAttrString(tyobj, "_code"); + if (tmpcode == NULL) { + return -1; + } + typecode = PyLong_AsLong(tmpcode); + Py_DECREF(tmpcode); + return typecode; +} + +/* When we want to cache the type's typecode for later lookup, we need to + keep a reference to the returned type object so that it cannot be + deleted. This is because of the following events occurring when first + using a @jit function for a given set of types: + + 1. typecode_fallback requests a new typecode for an arbitrary Python value; + this implies creating a Numba type object (on the first dispatcher call); + the typecode cache is then populated. + 2. matching of the typecode list in _dispatcherimpl.cpp fails, since the + typecode is new. + 3. we have to compile: compile_and_invoke() is called, it will invoke + Dispatcher_Insert to register the new signature. + + The reference to the Numba type object returned in step 1 is deleted as + soon as we call Py_DECREF() on it, since we are holding the only + reference. If this happens and we use the typecode we got to populate the + cache, then the cache won't ever return the correct typecode, and the + dispatcher will never successfully match the typecodes with those of + some already-compiled instance. So we need to make sure that we don't + call Py_DECREF() on objects whose typecode will be used to populate the + cache. This is ensured by calling _typecode_fallback with + retain_reference == 0. + + Note that technically we are leaking the reference, since we do not continue + to hold a pointer to the type object that we get back from typeof_pyval. + However, we don't need to refer to it again, we just need to make sure that + it is never deleted. +*/ +static int +_typecode_fallback(PyObject *dispatcher, PyObject *val, + int retain_reference) { + PyObject *numba_type; + int typecode; + + /* + * For values that define "_numba_type_", which holds a numba Type + * instance that should be used as the type of the value. + * Note this is done here, not in typeof_typecode(), so that + * some values can still benefit from fingerprint caching. + */ + if (PyObject_HasAttr(val, str_numba_type)) { + numba_type = PyObject_GetAttrString(val, "_numba_type_"); + if (!numba_type) + return -1; + } + else { + // Go back to the interpreter + numba_type = PyObject_CallMethodObjArgs((PyObject *) dispatcher, + str_typeof_pyval, val, NULL); + } + if (!numba_type) + return -1; + typecode = _typecode_from_type_object(numba_type); + if (!retain_reference) + Py_DECREF(numba_type); + return typecode; +} + +/* Variations on _typecode_fallback for convenience */ + +static +int typecode_fallback(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 0); +} + +static +int typecode_fallback_keep_ref(PyObject *dispatcher, PyObject *val) { + return _typecode_fallback(dispatcher, val, 1); +} + + +/* A cache mapping fingerprints (string_writer_t *) to typecodes (int). 
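+
+   Keys are heap-allocated string_writer_t values, hashed over their
+   byte contents by hash_writer() and compared by compare_writer()
+   below.  The probe pattern used in typecode_using_fingerprint() is:
+
+       int typecode;
+       if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0)
+           return typecode;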
*/ +static _Numba_hashtable_t *fingerprint_hashtable = NULL; + +static Py_uhash_t +hash_writer(const void *key) +{ + string_writer_t *writer = (string_writer_t *) key; + Py_uhash_t x = 0; + + /* The old FNV algorithm used by Python 2 */ + if (writer->n > 0) { + unsigned char *p = (unsigned char *) writer->buf; + Py_ssize_t len = writer->n; + x ^= *p << 7; + while (--len >= 0) + x = (1000003*x) ^ *p++; + x ^= writer->n; + if (x == (Py_uhash_t) -1) + x = -2; + } + return x; +} + +static int +compare_writer(const void *key, const _Numba_hashtable_entry_t *entry) +{ + string_writer_t *v = (string_writer_t *) key; + string_writer_t *w = (string_writer_t *) entry->key; + if (v->n != w->n) + return 0; + return memcmp(v->buf, w->buf, v->n) == 0; +} + +/* Try to compute *val*'s typecode using its fingerprint and the + * fingerprint->typecode cache. + */ +static int +typecode_using_fingerprint(PyObject *dispatcher, PyObject *val) +{ + int typecode; + string_writer_t w; + + string_writer_init(&w); + + if (compute_fingerprint(&w, val)) { + string_writer_clear(&w); + if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) { + /* Can't compute a type fingerprint for the given value, + fall back on typeof() without caching. */ + PyErr_Clear(); + return typecode_fallback(dispatcher, val); + } + return -1; + } + if (_Numba_HASHTABLE_GET(fingerprint_hashtable, &w, typecode) > 0) { + /* Cache hit */ + string_writer_clear(&w); + return typecode; + } + + /* Not found in cache: invoke pure Python typeof() and cache result. + * Note we have to keep the type alive forever as explained + * above in _typecode_fallback(). + */ + typecode = typecode_fallback_keep_ref(dispatcher, val); + if (typecode >= 0) { + string_writer_t *key = (string_writer_t *) malloc(sizeof(string_writer_t)); + if (key == NULL) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + /* Ownership of the string writer's buffer will be transferred + * to the hash table. + */ + string_writer_move(key, &w); + if (_Numba_HASHTABLE_SET(fingerprint_hashtable, key, typecode)) { + string_writer_clear(&w); + PyErr_NoMemory(); + return -1; + } + } + return typecode; +} + + +/* + * Direct lookup table for extra-fast typecode resolution of simple array types. + */ + +#define N_DTYPES 12 +#define N_NDIM 5 /* Fast path for up to 5D array */ +#define N_LAYOUT 3 +static int cached_arycode[N_NDIM][N_LAYOUT][N_DTYPES]; + +/* Convert a Numpy dtype number to an internal index into cached_arycode. + The returned value must also be a valid index into BASIC_TYPECODES. 
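+   E.g. NPY_FLOAT64 maps to index 9, so BASIC_TYPECODES[9] must hold
+   tc_float64; dtypes outside the table yield -1.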
*/ +static int dtype_num_to_typecode(int type_num) { + int dtype; + switch(type_num) { + case NPY_INT8: + dtype = 0; + break; + case NPY_INT16: + dtype = 1; + break; + case NPY_INT32: + dtype = 2; + break; + case NPY_INT64: + dtype = 3; + break; + case NPY_UINT8: + dtype = 4; + break; + case NPY_UINT16: + dtype = 5; + break; + case NPY_UINT32: + dtype = 6; + break; + case NPY_UINT64: + dtype = 7; + break; + case NPY_FLOAT32: + dtype = 8; + break; + case NPY_FLOAT64: + dtype = 9; + break; + case NPY_COMPLEX64: + dtype = 10; + break; + case NPY_COMPLEX128: + dtype = 11; + break; + default: + /* Type not included in the global lookup table */ + dtype = -1; + } + return dtype; +} + +static +int get_cached_typecode(PyArray_Descr* descr) { + PyObject* tmpobject = PyDict_GetItem(typecache, (PyObject*)descr); + if (tmpobject == NULL) + return -1; + + return PyLong_AsLong(tmpobject); +} + +static +void cache_typecode(PyArray_Descr* descr, int typecode) { + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(typecache, (PyObject*)descr, value); + Py_DECREF(value); +} + +static +PyObject* ndarray_key(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* tmpndim = PyLong_FromLong(ndim); + PyObject* tmplayout = PyLong_FromLong(layout); + PyObject* tmpreadonly = PyLong_FromLong(readonly); + PyObject* key = PyTuple_Pack(4, tmpndim, tmplayout, tmpreadonly, descr); + Py_DECREF(tmpndim); + Py_DECREF(tmplayout); + Py_DECREF(tmpreadonly); + return key; +} + +static +int get_cached_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject *tmpobject = PyDict_GetItem(ndarray_typecache, key); + if (tmpobject == NULL) + return -1; + + Py_DECREF(key); + return PyLong_AsLong(tmpobject); +} + +static +void cache_ndarray_typecode(int ndim, int layout, int readonly, PyArray_Descr* descr, + int typecode) { + PyObject* key = ndarray_key(ndim, layout, readonly, descr); + PyObject* value = PyLong_FromLong(typecode); + PyDict_SetItem(ndarray_typecache, key, value); + Py_DECREF(key); + Py_DECREF(value); +} + +static +int typecode_ndarray(PyObject *dispatcher, PyArrayObject *ary) { + int typecode; + int dtype; + int ndim = PyArray_NDIM(ary); + int layout = 0; + int readonly = 0; + + /* The order in which we check for the right contiguous-ness is important. + The order must match the order by numba.numpy_support.map_layout. + Further, only *contiguous-ness* is checked, not alignment, byte order or + write permissions. 
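+      A 1-D contiguous array is both C- and F-contiguous; the C check
+      runs first, so such arrays get layout 1 (C).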
+ */ + if (PyArray_IS_C_CONTIGUOUS(ary)){ + layout = 1; + } else if (PyArray_IS_F_CONTIGUOUS(ary)) { + layout = 2; + } + + /* the typecode cache by convention is for "behaved" arrays (aligned and + * writeable), all others must be forced to the fall back */ + if (!PyArray_ISBEHAVED(ary)) goto FALLBACK; + + if (ndim <= 0 || ndim > N_NDIM) goto FALLBACK; + + dtype = dtype_num_to_typecode(PyArray_TYPE(ary)); + if (dtype == -1) goto FALLBACK; + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + + typecode = cached_arycode[ndim - 1][layout][dtype]; + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types */ + + /* If this isn't a structured array then we can't use the cache */ + if (PyArray_TYPE(ary) != NPY_VOID) + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); + + /* Check type cache */ + readonly = !PyArray_ISWRITEABLE(ary); + typecode = get_cached_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary)); + if (typecode == -1) { + /* First use of this type, use fallback and populate the cache */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cache_ndarray_typecode(ndim, layout, readonly, PyArray_DESCR(ary), typecode); + } + return typecode; +} + +static +int typecode_arrayscalar(PyObject *dispatcher, PyObject* aryscalar) { + int typecode; + PyArray_Descr *descr; + descr = PyArray_DescrFromScalar(aryscalar); + if (!descr) + return typecode_using_fingerprint(dispatcher, aryscalar); + + /* Is it a structured scalar? */ + if (descr->type_num == NPY_VOID) { + typecode = get_cached_typecode(descr); + if (typecode == -1) { + /* Resolve through fallback then populate cache */ + typecode = typecode_fallback_keep_ref(dispatcher, aryscalar); + cache_typecode(descr, typecode); + } + Py_DECREF(descr); + return typecode; + } + + /* Is it one of the well-known basic types? */ + typecode = dtype_num_to_typecode(descr->type_num); + Py_DECREF(descr); + if (typecode == -1) + return typecode_using_fingerprint(dispatcher, aryscalar); + return BASIC_TYPECODES[typecode]; +} + +static +int typecode_devicendarray(PyObject *dispatcher, PyObject *ary) +{ + int typecode; + int dtype; + int ndim; + int layout = 0; + PyObject *ndim_obj = nullptr; + PyObject *num_obj = nullptr; + PyObject *dtype_obj = nullptr; + int dtype_num = 0; + + PyObject* flags = PyObject_GetAttrString(ary, "flags"); + if (flags == NULL) + { + PyErr_Clear(); + goto FALLBACK; + } + + if (PyDict_GetItemString(flags, "C_CONTIGUOUS") == Py_True) { + layout = 1; + } else if (PyDict_GetItemString(flags, "F_CONTIGUOUS") == Py_True) { + layout = 2; + } + + Py_DECREF(flags); + + ndim_obj = PyObject_GetAttrString(ary, "ndim"); + if (ndim_obj == NULL) { + /* If there's no ndim, try to proceed by clearing the error and using the + * fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + ndim = PyLong_AsLong(ndim_obj); + Py_DECREF(ndim_obj); + + if (PyErr_Occurred()) { + /* ndim wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + if (ndim <= 0 || ndim > N_NDIM) + goto FALLBACK; + + dtype_obj = PyObject_GetAttrString(ary, "dtype"); + if (dtype_obj == NULL) { + /* No dtype: try the fallback. 
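+          Device arrays are duck-typed throughout this function: ndim,
+          dtype and flags are read as Python attributes rather than via
+          the NumPy C API.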
*/ + PyErr_Clear(); + goto FALLBACK; + } + + num_obj = PyObject_GetAttrString(dtype_obj, "num"); + Py_DECREF(dtype_obj); + + if (num_obj == NULL) { + /* This strange dtype has no num - try the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype_num = PyLong_AsLong(num_obj); + Py_DECREF(num_obj); + + if (PyErr_Occurred()) { + /* num wasn't an integer for some reason - unlikely to happen, but try + * the fallback. */ + PyErr_Clear(); + goto FALLBACK; + } + + dtype = dtype_num_to_typecode(dtype_num); + if (dtype == -1) { + /* Not a dtype we have in the global lookup table. */ + goto FALLBACK; + } + + /* Fast path, using direct table lookup */ + assert(layout < N_LAYOUT); + assert(ndim <= N_NDIM); + assert(dtype < N_DTYPES); + typecode = cached_arycode[ndim - 1][layout][dtype]; + + if (typecode == -1) { + /* First use of this table entry, so it requires populating */ + typecode = typecode_fallback_keep_ref(dispatcher, (PyObject*)ary); + cached_arycode[ndim - 1][layout][dtype] = typecode; + } + + return typecode; + +FALLBACK: + /* Slower path, for non-trivial array types. At present this always uses + the fingerprinting to get the typecode. Future optimization might + implement a cache, but this would require some fast equivalent of + PyArray_DESCR for a device array. */ + + return typecode_using_fingerprint(dispatcher, (PyObject *) ary); +} + +extern "C" int +typeof_typecode(PyObject *dispatcher, PyObject *val) +{ + PyTypeObject *tyobj = Py_TYPE(val); + int subtype_attr; + /* This needs to be kept in sync with Dispatcher.typeof_pyval(), + * otherwise funny things may happen. + */ + if (tyobj == &PyInt_Type || tyobj == &PyLong_Type) { +#if SIZEOF_VOID_P < 8 + /* On 32-bit platforms, choose between tc_intp (32-bit) and tc_int64 */ + PY_LONG_LONG ll = PyLong_AsLongLong(val); + if (ll == -1 && PyErr_Occurred()) { + /* The integer is too large, let us truncate it */ + PyErr_Clear(); + return tc_int64; + } + if ((ll & 0xffffffff) != ll) + return tc_int64; +#endif + return tc_intp; + } + else if (tyobj == &PyFloat_Type) + return tc_float64; + else if (tyobj == &PyComplex_Type) + return tc_complex128; + /* Array scalar handling */ + else if (PyArray_CheckScalar(val)) { + return typecode_arrayscalar(dispatcher, val); + } + /* Array handling */ + else if (tyobj == &PyArray_Type) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + /* Subtype of CUDA device array */ + else if (PyType_IsSubtype(tyobj, &DeviceArrayType)) { + return typecode_devicendarray(dispatcher, val); + } + /* Subtypes of Array handling */ + else if (PyType_IsSubtype(tyobj, &PyArray_Type)) { + /* By default, Numba will treat all numpy.ndarray subtypes as if they + were the base numpy.ndarray type. In this way, ndarray subtypes + can easily use all of the support that Numba has for ndarray + methods. + EXPERIMENTAL: There may be cases where a programmer would NOT want + ndarray subtypes to be treated exactly like the base numpy.ndarray. + For this purpose, a currently experimental feature allows a + programmer to add an attribute named + __numba_array_subtype_dispatch__ to their ndarray subtype. This + attribute can have any value as Numba only checks for the presence + of the attribute and not its value. When present, a ndarray subtype + will NOT be typed by Numba as a regular ndarray but this code will + fallthrough to the typecode_using_fingerprint call, which will + create a new unique Numba typecode for this ndarray subtype. This + behavior has several significant effects. 
First, since this + ndarray subtype will be treated as a different type by Numba, + the Numba dispatcher would then specialize on this type. So, if + there was a function that had several parameters that were + expected to be either numpy.ndarray or a subtype of ndarray, then + Numba would compile a custom version of this function for each + combination of base and subtypes that were actually passed to the + function. Second, because this subtype would now be treated as + a totally separate type, it will cease to function in Numba unless + an implementation of that type is provided to Numba through the + Numba type extension mechanisms (e.g., overload). This would + typically start with defining a Numba type corresponding to the + ndarray subtype. This is the same concept as how Numba has a + corollary of numpy.ndarray in its type system as types.Array. + Next, one would typically defining boxing and unboxing routines + and the associated memory model. Then, overloads for NumPy + functions on that type would be created. However, + if the same default array memory model is used then there are tricks + one can do to look at Numba's internal types.Array registries and + to quickly apply those to the subtype as well. In this manner, + only those cases where the base ndarray and the ndarray subtype + behavior differ would new custom functions need to be written for + the subtype. Finally, + after adding support for the new type, you would have a separate + ndarray subtype that could operate with other objects of the same + subtype but would not support interoperation with regular NumPy + ndarrays. In standard Python, this interoperation is provided + through the __array_ufunc__ magic method in the ndarray subtype + class and in that case the function operates on ndarrays or their + subtypes. This idea is extended into Numba such that + __array_ufunc__ can be present in a Numba array type object. + In this case, this function is consulted during Numba typing and + so the arguments to __array_ufunc__ are Numba types instead of + ndarray subtypes. The array type __array_ufunc__ returns the + type of the output of the given ufunc. + */ + subtype_attr = PyObject_HasAttrString(val, "__numba_array_subtype_dispatch__"); + if (!subtype_attr) { + return typecode_ndarray(dispatcher, (PyArrayObject*)val); + } + } + + return typecode_using_fingerprint(dispatcher, val); +} + + +static +void* wrap_import_array(void) { + import_array(); /* import array returns NULL on failure */ + return (void*)1; +} + + +static +int init_numpy(void) { + return wrap_import_array() != NULL; +} + + +/* + * typeof_init(omittedarg_type, typecode_dict) + * (called from dispatcher.py to fill in missing information) + */ +extern "C" PyObject * +typeof_init(PyObject *self, PyObject *args) +{ + PyObject *tmpobj; + PyObject *dict; + int index = 0; + + if (!PyArg_ParseTuple(args, "O!O!:typeof_init", + &PyType_Type, &omittedarg_type, + &PyDict_Type, &dict)) + return NULL; + + /* Initialize Numpy API */ + if ( ! 
init_numpy() ) { + return NULL; + } + + #define UNWRAP_TYPE(S) \ + if(!(tmpobj = PyDict_GetItemString(dict, #S))) return NULL; \ + else { tc_##S = PyLong_AsLong(tmpobj); \ + BASIC_TYPECODES[index++] = tc_##S; } + + UNWRAP_TYPE(int8) + UNWRAP_TYPE(int16) + UNWRAP_TYPE(int32) + UNWRAP_TYPE(int64) + + UNWRAP_TYPE(uint8) + UNWRAP_TYPE(uint16) + UNWRAP_TYPE(uint32) + UNWRAP_TYPE(uint64) + + UNWRAP_TYPE(float32) + UNWRAP_TYPE(float64) + + UNWRAP_TYPE(complex64) + UNWRAP_TYPE(complex128) + + switch(sizeof(void*)) { + case 4: + tc_intp = tc_int32; + break; + case 8: + tc_intp = tc_int64; + break; + default: + PyErr_SetString(PyExc_AssertionError, "sizeof(void*) != {4, 8}"); + return NULL; + } + + #undef UNWRAP_TYPE + + typecache = PyDict_New(); + ndarray_typecache = PyDict_New(); + structured_dtypes = PyDict_New(); + if (typecache == NULL || ndarray_typecache == NULL || + structured_dtypes == NULL) { + PyErr_SetString(PyExc_RuntimeError, "failed to create type cache"); + return NULL; + } + + fingerprint_hashtable = _Numba_hashtable_new(sizeof(int), + hash_writer, + compare_writer); + if (fingerprint_hashtable == NULL) { + PyErr_NoMemory(); + return NULL; + } + + /* initialize cached_arycode to all ones (in bits) */ + memset(cached_arycode, 0xFF, sizeof(cached_arycode)); + + str_typeof_pyval = PyString_InternFromString("typeof_pyval"); + str_value = PyString_InternFromString("value"); + str_numba_type = PyString_InternFromString("_numba_type_"); + if (!str_value || !str_typeof_pyval || !str_numba_type) + return NULL; + + Py_RETURN_NONE; +} diff --git a/numba_cuda/numba/cuda/cext/_typeof.h b/numba_cuda/numba/cuda/cext/_typeof.h new file mode 100644 index 000000000..54e39ff80 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/_typeof.h @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef NUMBA_TYPEOF_H_ +#define NUMBA_TYPEOF_H_ + +#ifdef __cplusplus + extern "C" { +#endif + +extern PyObject *typeof_init(PyObject *self, PyObject *args); +extern int typeof_typecode(PyObject *dispatcher, PyObject *val); +extern PyObject *typeof_compute_fingerprint(PyObject *val); + +#ifdef __cplusplus + } +#endif + +#endif /* NUMBA_TYPEOF_H_ */ diff --git a/numba_cuda/numba/cuda/cext/mviewbuf.c b/numba_cuda/numba/cuda/cext/mviewbuf.c new file mode 100644 index 000000000..eb5075d0b --- /dev/null +++ b/numba_cuda/numba/cuda/cext/mviewbuf.c @@ -0,0 +1,385 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#include "_pymodule.h" + +static int get_writable_buffer(PyObject* obj, Py_buffer *buf, int force) +{ + Py_buffer read_buf; + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + int ret; + + /* Attempt to get a writable buffer */ + if (!PyObject_GetBuffer(obj, buf, flags|PyBUF_WRITABLE)) + return 0; + if (!force) + return -1; + + /* Make a writable buffer from a read-only buffer */ + PyErr_Clear(); + if(-1 == PyObject_GetBuffer(obj, &read_buf, flags)) + return -1; + ret = PyBuffer_FillInfo(buf, NULL, read_buf.buf, read_buf.len, 0, + flags|PyBUF_WRITABLE); + PyBuffer_Release(&read_buf); + return ret; +} + +static int get_readonly_buffer(PyObject* obj, Py_buffer *buf) +{ + int flags = PyBUF_ND|PyBUF_STRIDES|PyBUF_FORMAT; + + return PyObject_GetBuffer(obj, buf, flags); +} + + +static void free_buffer(Py_buffer * buf) +{ + PyBuffer_Release(buf); +} + +/** + * Return a pointer to the data of a writable buffer from obj. 
If only a + * read-only buffer is available and force is True, a read-write buffer based on + * the read-only buffer is obtained. Note that this may have some surprising + * effects on buffers which expect the data from their read-only buffer not to + * be modified. + */ +static PyObject* +memoryview_get_buffer(PyObject *self, PyObject *args){ + PyObject *obj = NULL; + int force = 0; + int readonly = 0; + PyObject *ret = NULL; + Py_buffer buf; + + if (!PyArg_ParseTuple(args, "O|ii", &obj, &force, &readonly)) + return NULL; + + if (readonly) { + if (get_readonly_buffer(obj, &buf)) + return NULL; + } else { + if (get_writable_buffer(obj, &buf, force)) + return NULL; + } + + ret = PyLong_FromVoidPtr(buf.buf); + free_buffer(&buf); + return ret; +} + +/** + * Gets a half-open range [start, end) which contains the array data + * Modified from numpy/core/src/multiarray/array_assign.c + */ +static PyObject* +get_extents(Py_ssize_t *shape, Py_ssize_t *strides, int ndim, + Py_ssize_t itemsize, Py_ssize_t ptr) +{ + Py_ssize_t start, end; + int idim; + Py_ssize_t *dimensions = shape; + PyObject *ret = NULL; + + if (ndim < 0 ){ + PyErr_SetString(PyExc_ValueError, "buffer ndim < 0"); + return NULL; + } + + if (!dimensions) { + if (ndim == 0) { + start = end = ptr; + end += itemsize; + return Py_BuildValue("nn", start, end); + } + PyErr_SetString(PyExc_ValueError, "buffer shape is not defined"); + return NULL; + } + + if (!strides) { + PyErr_SetString(PyExc_ValueError, "buffer strides is not defined"); + return NULL; + } + + /* Calculate with a closed range [start, end] */ + start = end = ptr; + for (idim = 0; idim < ndim; ++idim) { + Py_ssize_t stride = strides[idim], dim = dimensions[idim]; + /* If the array size is zero, return an empty range */ + if (dim == 0) { + start = end = ptr; + ret = Py_BuildValue("nn", start, end); + break; + } + /* Expand either upwards or downwards depending on stride */ + else { + if (stride > 0) { + end += stride * (dim - 1); + } + else if (stride < 0) { + start += stride * (dim - 1); + } + } + } + + if (!ret) { + /* Return a half-open range */ + Py_ssize_t out_start = start; + Py_ssize_t out_end = end + itemsize; + + ret = Py_BuildValue("nn", out_start, out_end); + } + + return ret; +} + +static PyObject* +memoryview_get_extents(PyObject *self, PyObject *args) +{ + PyObject *obj = NULL; + PyObject *ret = NULL; + Py_buffer b; + if (!PyArg_ParseTuple(args, "O", &obj)) + return NULL; + + if (get_readonly_buffer(obj, &b)) + return NULL; + + ret = get_extents(b.shape, b.strides, b.ndim, b.itemsize, + (Py_ssize_t)b.buf); + free_buffer(&b); + return ret; +} + +static PyObject* +memoryview_get_extents_info(PyObject *self, PyObject *args) +{ + int i; + Py_ssize_t *shape_ary = NULL; + Py_ssize_t *strides_ary = NULL; + PyObject *shape_tuple = NULL; + PyObject *strides_tuple = NULL; + PyObject *shape = NULL, *strides = NULL; + Py_ssize_t itemsize = 0; + int ndim = 0; + PyObject* res = NULL; + + if (!PyArg_ParseTuple(args, "OOin", &shape, &strides, &ndim, &itemsize)) + goto cleanup; + + if (ndim < 0) { + PyErr_SetString(PyExc_ValueError, "ndim is negative"); + goto cleanup; + } + + if (itemsize <= 0) { + PyErr_SetString(PyExc_ValueError, "ndim <= 0"); + goto cleanup; + } + + shape_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + strides_ary = malloc(sizeof(Py_ssize_t) * ndim + 1); + + shape_tuple = PySequence_Fast(shape, "shape is not a sequence"); + if (!shape_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + shape_ary[i] = PyNumber_AsSsize_t( + 
PySequence_Fast_GET_ITEM(shape_tuple, i), + PyExc_OverflowError); + } + + strides_tuple = PySequence_Fast(strides, "strides is not a sequence"); + if (!strides_tuple) goto cleanup; + + for (i = 0; i < ndim; ++i) { + strides_ary[i] = PyNumber_AsSsize_t( + PySequence_Fast_GET_ITEM(strides_tuple, i), + PyExc_OverflowError); + } + + res = get_extents(shape_ary, strides_ary, ndim, itemsize, 0); +cleanup: + free(shape_ary); + free(strides_ary); + Py_XDECREF(shape_tuple); + Py_XDECREF(strides_tuple); + return res; +} + + +/* new type to expose buffer interface */ +typedef struct { + PyObject_HEAD + /* Type-specific fields go here. */ +} MemAllocObject; + + +static int +get_bufinfo(PyObject *self, Py_ssize_t *psize, void **pptr) +{ + PyObject *buflen = NULL; + PyObject *bufptr = NULL; + Py_ssize_t size = 0; + void* ptr = NULL; + int ret = -1; + + buflen = PyObject_GetAttrString(self, "_buflen_"); + if (!buflen) goto cleanup; + + bufptr = PyObject_GetAttrString(self, "_bufptr_"); + if (!bufptr) goto cleanup; + + size = PyNumber_AsSsize_t(buflen, PyExc_OverflowError); + if (size == -1 && PyErr_Occurred()) goto cleanup; + else if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative buffer size"); + goto cleanup; + } + + ptr = PyLong_AsVoidPtr(PyNumber_Long(bufptr)); + if (PyErr_Occurred()) + goto cleanup; + else if (!ptr) { + PyErr_SetString(PyExc_ValueError, "null buffer pointer"); + goto cleanup; + } + + *psize = size; + *pptr = ptr; + ret = 0; +cleanup: + Py_XDECREF(buflen); + Py_XDECREF(bufptr); + return ret; +} + + +static int +MemAllocObject_getbuffer(PyObject *self, Py_buffer *view, int flags) +{ + Py_ssize_t size = 0; + void *ptr = 0; + int readonly; + + if(-1 == get_bufinfo(self, &size, &ptr)) + return -1; + + readonly = (PyBUF_WRITABLE & flags) != PyBUF_WRITABLE; + + /* fill buffer */ + if (-1 == PyBuffer_FillInfo(view, self, (void*)ptr, size, readonly, flags)) + return -1; + + return 0; +} + +static void +MemAllocObject_releasebuffer(PyObject *self, Py_buffer *view) +{ + /* Do nothing */ +} + +static PyBufferProcs MemAlloc_as_buffer = { + MemAllocObject_getbuffer, + MemAllocObject_releasebuffer, +}; + + +static PyTypeObject MemAllocType = { + PyVarObject_HEAD_INIT(NULL, 0) + "mviewbuf.MemAlloc", /* tp_name */ + sizeof(MemAllocObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_as_async */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + &MemAlloc_as_buffer, /* tp_as_buffer */ + (Py_TPFLAGS_DEFAULT| Py_TPFLAGS_BASETYPE), /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0, /* tp_finalize */ + 0, /* tp_vectorcall */ +#if (PY_MAJOR_VERSION == 3) && (PY_MINOR_VERSION == 12) +/* This was introduced first in 3.12 + * https://github.com/python/cpython/issues/91051 + */ + 0, /* tp_watched */ +#endif + 
+/* WARNING: Do not remove this, only modify it! It is a version guard to + * act as a reminder to update this struct on Python version update! */ +#if (PY_MAJOR_VERSION == 3) +#if ! (NB_SUPPORTED_PYTHON_MINOR) +#error "Python minor version is not supported." +#endif +#else +#error "Python major version is not supported." +#endif +/* END WARNING*/ +}; + + +static PyMethodDef core_methods[] = { +#define declmethod(func) { #func , ( PyCFunction )func , METH_VARARGS , NULL } + declmethod(memoryview_get_buffer), + declmethod(memoryview_get_extents), + declmethod(memoryview_get_extents_info), + { NULL }, +#undef declmethod +}; + + +MOD_INIT(mviewbuf) { + PyObject *module; + MOD_DEF(module, "mviewbuf", "No docs", core_methods) + if (module == NULL) + return MOD_ERROR_VAL; + + MemAllocType.tp_new = PyType_GenericNew; + if (PyType_Ready(&MemAllocType) < 0){ + return MOD_ERROR_VAL; + } + + Py_INCREF(&MemAllocType); + PyModule_AddObject(module, "MemAlloc", (PyObject*)&MemAllocType); + + return MOD_SUCCESS_VAL(module); +} diff --git a/numba_cuda/numba/cuda/cext/typeconv.cpp b/numba_cuda/numba/cuda/cext/typeconv.cpp new file mode 100644 index 000000000..5af7d16b3 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/typeconv.cpp @@ -0,0 +1,212 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include + +#include "typeconv.hpp" + + +// ------ TypeManager ------ + +TCCMap::TCCMap() + : nb_records(0) +{ +} + +size_t TCCMap::hash(const TypePair &key) const { + return std::hash()(std::hash()(key.first)) ^ + std::hash()(key.second); +} + +void TCCMap::insert(const TypePair &key, TypeCompatibleCode val) { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + bin[j].val = val; + return; + } + } + bin.push_back({key, val}); + nb_records++; +} + +TypeCompatibleCode TCCMap::find(const TypePair &key) const { + size_t i = hash(key) & (TCCMAP_SIZE - 1); + const TCCMapBin &bin = records[i]; + for (unsigned int j = 0; j < bin.size(); ++j) { + if (bin[j].key == key) { + return bin[j].val; + } + } + return TCC_FALSE; +} + +// ----- Ratings ----- +Rating::Rating() : promote(0), safe_convert(0), unsafe_convert(0) { } + +inline bool Rating::operator < (const Rating &other) const { + if (unsafe_convert < other.unsafe_convert) + return true; + else if (unsafe_convert > other.unsafe_convert) + return false; + if (safe_convert < other.safe_convert) + return true; + else if (safe_convert > other.safe_convert) + return false; + return (promote < other.promote); +} + +inline bool Rating::operator == (const Rating &other) const { + return promote == other.promote && safe_convert == other.safe_convert && + unsafe_convert == other.unsafe_convert; +} + +// ------ TypeManager ------ + +bool TypeManager::canPromote(Type from, Type to) const { + return isCompatible(from, to) == TCC_PROMOTE; +} + +bool TypeManager::canSafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_SAFE; +} + +bool TypeManager::canUnsafeConvert(Type from, Type to) const { + return isCompatible(from, to) == TCC_CONVERT_UNSAFE; +} + +void TypeManager::addPromotion(Type from, Type to) { + return addCompatibility(from, to, TCC_PROMOTE); +} + +void TypeManager::addUnsafeConversion(Type from, Type to) { + return addCompatibility(from, to, TCC_CONVERT_UNSAFE); +} + +void TypeManager::addSafeConversion(Type from, Type 
to) { + return addCompatibility(from, to, TCC_CONVERT_SAFE); +} + +void TypeManager::addCompatibility(Type from, Type to, TypeCompatibleCode tcc) { + TypePair pair(from, to); + tccmap.insert(pair, tcc); +} + +TypeCompatibleCode TypeManager::isCompatible(Type from, Type to) const { + if (from == to) + return TCC_EXACT; + TypePair pair(from, to); + return tccmap.find(pair); +} + + +int TypeManager::selectOverload(const Type sig[], const Type ovsigs[], + int &selected, + int sigsz, int ovct, bool allow_unsafe, + bool exact_match_required + ) const { + int count; + if (ovct <= 16) { + Rating ratings[16]; + int candidates[16]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + } + else { + Rating *ratings = new Rating[ovct]; + int *candidates = new int[ovct]; + count = _selectOverload(sig, ovsigs, selected, sigsz, ovct, + allow_unsafe, exact_match_required, ratings, + candidates); + delete [] ratings; + delete [] candidates; + } + return count; +} + +int TypeManager::_selectOverload(const Type sig[], const Type ovsigs[], + int &selected, int sigsz, int ovct, + bool allow_unsafe, bool exact_match_required, + Rating ratings[], int candidates[]) const { + // Generate rating table + // Use a penalize scheme. + int nb_candidates = 0; + + for (int i = 0; i < ovct; ++i) { + const Type *entry = &ovsigs[i * sigsz]; + Rating rate; + + for (int j = 0; j < sigsz; ++j) { + TypeCompatibleCode tcc = isCompatible(sig[j], entry[j]); + if (tcc == TCC_FALSE || + (tcc == TCC_CONVERT_UNSAFE && !allow_unsafe) || + (tcc != TCC_EXACT && exact_match_required)) { + // stop the loop early + goto _incompatible; + } + switch(tcc) { + case TCC_PROMOTE: + rate.promote += 1; + break; + case TCC_CONVERT_SAFE: + rate.safe_convert += 1; + break; + case TCC_CONVERT_UNSAFE: + rate.unsafe_convert += 1; + break; + default: + break; + } + } + ratings[nb_candidates] = rate; + candidates[nb_candidates] = i; + nb_candidates++; + _incompatible: + ; + } + + // Bail if no match + if (nb_candidates == 0) + return 0; + + // Find lowest rating + Rating best = ratings[0]; + selected = candidates[0]; + + int matchcount = 1; + for (int i = 1; i < nb_candidates; ++i) { + if (ratings[i] < best) { + best = ratings[i]; + selected = candidates[i]; + matchcount = 1; + } + else if (ratings[i] == best) { + matchcount += 1; + } + } + return matchcount; +} + +// ----- utils ----- + +const char* TCCString(TypeCompatibleCode tcc) { + switch(tcc) { + case TCC_EXACT: + return "exact"; + case TCC_SUBTYPE: + return "subtype"; + case TCC_PROMOTE: + return "promote"; + case TCC_CONVERT_SAFE: + return "safe_convert"; + case TCC_CONVERT_UNSAFE: + return "unsafe_convert"; + default: + return "false"; + } +} diff --git a/numba_cuda/numba/cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp new file mode 100644 index 000000000..da5d87a77 --- /dev/null +++ b/numba_cuda/numba/cuda/cext/typeconv.hpp @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef NUMBA_TYPECONV_HPP_ +#define NUMBA_TYPECONV_HPP_ +#include +#include + + +typedef int Type; + +enum TypeCompatibleCode{ + // No match + TCC_FALSE = 0, + // Exact match + TCC_EXACT, + // Subtype is UNUSED + TCC_SUBTYPE, + // Promotion with no precision loss + TCC_PROMOTE, + // Conversion with no precision loss + // e.g. int32 to double + TCC_CONVERT_SAFE, + // Conversion with precision loss + // e.g. 
diff --git a/numba_cuda/numba/cuda/cext/typeconv.hpp b/numba_cuda/numba/cuda/cext/typeconv.hpp
new file mode 100644
index 000000000..da5d87a77
--- /dev/null
+++ b/numba_cuda/numba/cuda/cext/typeconv.hpp
@@ -0,0 +1,101 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause
+
+#ifndef NUMBA_TYPECONV_HPP_
+#define NUMBA_TYPECONV_HPP_
+#include <utility>
+#include <vector>
+
+
+typedef int Type;
+
+enum TypeCompatibleCode{
+    // No match
+    TCC_FALSE = 0,
+    // Exact match
+    TCC_EXACT,
+    // Subtype is UNUSED
+    TCC_SUBTYPE,
+    // Promotion with no precision loss
+    TCC_PROMOTE,
+    // Conversion with no precision loss
+    // e.g. int32 to double
+    TCC_CONVERT_SAFE,
+    // Conversion with precision loss
+    // e.g. int64 to double (53 bits precision)
+    TCC_CONVERT_UNSAFE,
+};
+
+typedef std::pair<Type, Type> TypePair;
+
+struct TCCRecord {
+    TypePair key;
+    TypeCompatibleCode val;
+};
+
+typedef std::vector<TCCRecord> TCCMapBin;
+
+class TCCMap {
+public:
+    TCCMap();
+
+    void insert(const TypePair &key, TypeCompatibleCode val);
+    TypeCompatibleCode find(const TypePair &key) const;
+private:
+    size_t hash(const TypePair &key) const;
+
+    /* Must be a power of two */
+    static const size_t TCCMAP_SIZE = 512;
+    TCCMapBin records[TCCMAP_SIZE];
+    int nb_records;
+};
+
+struct Rating {
+    unsigned int promote;
+    unsigned int safe_convert;
+    unsigned int unsafe_convert;
+
+    Rating();
+
+    bool operator < (const Rating &other) const;
+    bool operator == (const Rating &other) const;
+};
+
+
+class TypeManager {
+public:
+    bool canPromote(Type from, Type to) const;
+    bool canUnsafeConvert(Type from, Type to) const;
+    bool canSafeConvert(Type from, Type to) const;
+
+    void addPromotion(Type from, Type to);
+    void addUnsafeConversion(Type from, Type to);
+    void addSafeConversion(Type from, Type to);
+    void addCompatibility(Type from, Type to, TypeCompatibleCode by);
+
+    TypeCompatibleCode isCompatible(Type from, Type to) const;
+
+    /**
+       The index of the selected overload is stored in `selected`.
+       Returns the number of matches.
+    */
+    int selectOverload(const Type sig[], const Type ovsigs[], int &selected,
+                       int sigsz, int ovct, bool allow_unsafe,
+                       bool exact_match_required
+                       ) const;
+
+private:
+    int _selectOverload(const Type sig[], const Type ovsigs[], int &selected,
+                        int sigsz, int ovct, bool allow_unsafe,
+                        bool exact_match_required,
+                        Rating ratings[], int candidates[]) const;
+
+    TCCMap tccmap;
+};
+
+
+const char* TCCString(TypeCompatibleCode tcc);
+
+
+#endif // NUMBA_TYPECONV_HPP_
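Note (illustration, not part of the patch): TCCMap is a fixed-capacity hash map keyed on (from, to) pairs, bucketed into a power-of-two number of bins so the hash can be masked rather than divided; absent pairs read as TCC_FALSE. A toy Python equivalent for orientation:

    TCC_FALSE = 0
    TCCMAP_SIZE = 512  # must stay a power of two so `& (TCCMAP_SIZE - 1)` masks the hash

    class TCCMap:
        def __init__(self):
            self.bins = [[] for _ in range(TCCMAP_SIZE)]

        def _bucket(self, key):
            return self.bins[hash(key) & (TCCMAP_SIZE - 1)]

        def insert(self, key, val):
            bucket = self._bucket(key)
            for i, (k, _) in enumerate(bucket):
                if k == key:
                    bucket[i] = (key, val)  # overwrite the existing record
                    return
            bucket.append((key, val))

        def find(self, key):
            for k, v in self._bucket(key):
                if k == key:
                    return v
            return TCC_FALSE  # unknown pairs are incompatible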
diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py
index e48e66ac7..5597308d7 100644
--- a/numba_cuda/numba/cuda/cudadrv/devicearray.py
+++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py
@@ -16,7 +16,7 @@
 import numpy as np
 
 import numba
-from numba import _devicearray
+from numba.cuda.cext import _devicearray
 from numba.cuda.cudadrv import devices, dummyarray
 from numba.cuda.cudadrv import driver as _driver
 from numba.core import types, config
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 7668f8578..9a6988b57 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -46,7 +46,7 @@
 
 from collections import namedtuple, deque
 
-from numba import mviewbuf
+from numba.cuda.cext import mviewbuf
 from numba.core import config
 from numba.cuda import utils, serialize
 from .error import CudaSupportError, CudaDriverError
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index aeac72e5e..ad3b57a85 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -13,18 +13,20 @@
 import re
 from warnings import warn
 
-from numba import cuda, _dispatcher
+from numba.core import types, config, errors, entrypoints
+from numba.cuda import serialize, utils
+from numba import cuda
 
-from numba.core import types, config
 from numba.core.compiler_lock import global_compiler_lock
-from numba.core.dispatcher import _DispatcherBase
-from numba.core.errors import NumbaPerformanceWarning, TypingError
+from numba.core.typeconv.rules import default_type_manager
+from numba.cuda.typing.templates import fold_arguments
 from numba.core.typing.typeof import Purpose, typeof
-from numba.cuda import serialize, utils, typing
+from numba.cuda import typing
 from numba.cuda import types as cuda_types
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
+from numba.core.bytecode import get_code_object
 from numba.cuda.compiler import (
     compile_cuda,
     CUDACompiler,
@@ -41,11 +43,12 @@
     missing_launch_config_msg,
     normalize_kernel_dimensions,
 )
-from numba.cuda.typing.templates import fold_arguments
 from numba.cuda.cudadrv.linkable_code import LinkableCode
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.memory_management.nrt import rtsys, NRT_LIBRARY
 
+from numba.cuda.cext import _dispatcher
+
 
 cuda_fp16_math_funcs = [
     "hsin",
@@ -694,7 +697,7 @@ def __init__(self, dispatcher, griddim, blockdim, stream, sharedmem):
             f"Grid size {grid_size} will likely result in GPU "
             "under-utilization due to low occupancy."
         )
-        warn(NumbaPerformanceWarning(msg))
+        warn(errors.NumbaPerformanceWarning(msg))
 
     def __call__(self, *args):
         return self.dispatcher.call(
@@ -737,6 +740,601 @@ def load_overload(self, sig, target_context):
         return super().load_overload(sig, target_context)
 
 
+class OmittedArg(object):
+    """
+    A placeholder for omitted arguments with a default value.
+    """
+
+    def __init__(self, value):
+        self.value = value
+
+    def __repr__(self):
+        return "omitted arg(%r)" % (self.value,)
+
+    @property
+    def _numba_type_(self):
+        return types.Omitted(self.value)
+
+
+class CompilingCounter(object):
+    """
+    A simple counter that increments in __enter__ and decrements in __exit__.
+    """
+
+    def __init__(self):
+        self.counter = 0
+
+    def __enter__(self):
+        assert self.counter >= 0
+        self.counter += 1
+
+    def __exit__(self, *args, **kwargs):
+        self.counter -= 1
+        assert self.counter >= 0
+
+    def __bool__(self):
+        return self.counter > 0
+
+    __nonzero__ = __bool__
+
+
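+# Illustration (editor's sketch, hypothetical session): OmittedArg defers
+# typing of defaulted arguments so the dispatcher can specialize on the
+# default value:
+#
+#     >>> arg = OmittedArg(42)
+#     >>> arg
+#     omitted arg(42)
+#     >>> arg._numba_type_   # roughly types.Omitted(42)
+
+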
+class _DispatcherBase(_dispatcher.Dispatcher):
+    """
+    Common base class for dispatcher implementations.
+    """
+
+    __numba__ = "py_func"
+
+    def __init__(
+        self, arg_count, py_func, pysig, can_fallback, exact_match_required
+    ):
+        self._tm = default_type_manager
+
+        # A mapping of signatures to compile results
+        self.overloads = collections.OrderedDict()
+
+        self.py_func = py_func
+        # other parts of Numba assume the old Python 2 name for the code object
+        self.func_code = get_code_object(py_func)
+        # but newer Python uses a different name
+        self.__code__ = self.func_code
+        # a place to keep an active reference to the types of the active call
+        self._types_active_call = set()
+        # Default argument values match the py_func
+        self.__defaults__ = py_func.__defaults__
+
+        argnames = tuple(pysig.parameters)
+        default_values = self.py_func.__defaults__ or ()
+        defargs = tuple(OmittedArg(val) for val in default_values)
+        try:
+            lastarg = list(pysig.parameters.values())[-1]
+        except IndexError:
+            has_stararg = False
+        else:
+            has_stararg = lastarg.kind == lastarg.VAR_POSITIONAL
+        _dispatcher.Dispatcher.__init__(
+            self,
+            self._tm.get_pointer(),
+            arg_count,
+            self._fold_args,
+            argnames,
+            defargs,
+            can_fallback,
+            has_stararg,
+            exact_match_required,
+        )
+
+        self.doc = py_func.__doc__
+        self._compiling_counter = CompilingCounter()
+        weakref.finalize(self, self._make_finalizer())
+
+    def _compilation_chain_init_hook(self):
+        """
+        This will be called ahead of any part of compilation taking place (this
+        even includes being ahead of working out the types of the arguments).
+        This permits activities such as initialising extension entry points so
+        that the compiler knows about additional externally defined types etc.
+        before it does anything.
+        """
+        entrypoints.init_all()
+
+    def _reset_overloads(self):
+        self._clear()
+        self.overloads.clear()
+
+    def _make_finalizer(self):
+        """
+        Return a finalizer function that will release references to
+        related compiled functions.
+        """
+        overloads = self.overloads
+        targetctx = self.targetctx
+
+        # Early-bind utils.shutting_down() into the function's local namespace
+        # (see issue #689)
+        def finalizer(shutting_down=utils.shutting_down):
+            # The finalizer may crash at shutdown, skip it (resources
+            # will be cleared by the process exiting, anyway).
+            if shutting_down():
+                return
+            # This function must *not* hold any reference to self:
+            # we take care to bind the necessary objects in the closure.
+            for cres in overloads.values():
+                try:
+                    targetctx.remove_user_function(cres.entry_point)
+                except KeyError:
+                    pass
+
+        return finalizer
+
+    @property
+    def signatures(self):
+        """
+        Returns a list of compiled function signatures.
+        """
+        return list(self.overloads)
+
+    @property
+    def nopython_signatures(self):
+        return [
+            cres.signature
+            for cres in self.overloads.values()
+            if not cres.objectmode
+        ]
+
+    def disable_compile(self, val=True):
+        """Disable the compilation of new signatures at call time."""
+        # If disabling compilation then there must be at least one signature
+        assert (not val) or len(self.signatures) > 0
+        self._can_compile = not val
+
+    def add_overload(self, cres):
+        args = tuple(cres.signature.args)
+        sig = [a._code for a in args]
+        self._insert(sig, cres.entry_point, cres.objectmode)
+        self.overloads[args] = cres
+
+    def fold_argument_types(self, args, kws):
+        return self._compiler.fold_argument_types(args, kws)
+
+    def get_call_template(self, args, kws):
+        """
+        Get a typing.ConcreteTemplate for this dispatcher and the given
+        *args* and *kws* types. This allows the return type to be resolved.
+
+        A (template, pysig, args, kws) tuple is returned.
+        """
+        # XXX how about a dispatcher template class automating the
+        # following?
+
+        # Fold keyword arguments and resolve default values
+        pysig, args = self._compiler.fold_argument_types(args, kws)
+        kws = {}
+        # Ensure an overload is available
+        if self._can_compile:
+            self.compile(tuple(args))
+
+        # Create function type for typing
+        func_name = self.py_func.__name__
+        name = "CallTemplate({0})".format(func_name)
+        # The `key` isn't really used except for diagnosis here,
+        # so avoid keeping a reference to `cfunc`.
+        call_template = typing.make_concrete_template(
+            name, key=func_name, signatures=self.nopython_signatures
+        )
+        return call_template, pysig, args, kws
+
+    def get_overload(self, sig):
+        """
+        Return the compiled function for the given signature.
+        """
+        args, return_type = sigutils.normalize_signature(sig)
+        return self.overloads[tuple(args)].entry_point
+
+    @property
+    def is_compiling(self):
+        """
+        Whether a specialization is currently being compiled.
+        """
+        return self._compiling_counter
+
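+    # Illustration (editor's sketch): these pieces surface through the public
+    # dispatcher API; with an eagerly compiled kernel (signature illustrative):
+    #
+    #     @cuda.jit("void(float32[:])")
+    #     def scale(a):
+    #         a[cuda.grid(1)] *= 2.0
+    #
+    #     scale.signatures        # -> [(array(float32, 1d, A),)]
+    #     scale.disable_compile() # freeze the compiled overloads
+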
+    def _compile_for_args(self, *args, **kws):
+        """
+        For internal use. Compile a specialized version of the function
+        for the given *args* and *kws*, and return the resulting callable.
+        """
+        assert not kws
+        # call any initialisation required for the compilation chain (e.g.
+        # extension point registration).
+        self._compilation_chain_init_hook()
+
+        def error_rewrite(e, issue_type):
+            """
+            Rewrite and raise Exception `e` with help supplied based on the
+            specified issue_type.
+            """
+            if config.SHOW_HELP:
+                help_msg = errors.error_extras[issue_type]
+                e.patch_message("\n".join((str(e).rstrip(), help_msg)))
+            if config.FULL_TRACEBACKS:
+                raise e
+            else:
+                raise e.with_traceback(None)
+
+        argtypes = []
+        for a in args:
+            if isinstance(a, OmittedArg):
+                argtypes.append(types.Omitted(a.value))
+            else:
+                argtypes.append(self.typeof_pyval(a))
+
+        return_val = None
+        try:
+            return_val = self.compile(tuple(argtypes))
+        except errors.ForceLiteralArg as e:
+            # Received request for compiler re-entry with the list of arguments
+            # indicated by e.requested_args.
+            # First, check if any of these args are already Literal-ized
+            already_lit_pos = [
+                i
+                for i in e.requested_args
+                if isinstance(args[i], types.Literal)
+            ]
+            if already_lit_pos:
+                # Abort compilation if any argument is already a Literal.
+                # Letting this continue will cause an infinite compilation
+                # loop.
+                m = (
+                    "Repeated literal typing request.\n"
+                    "{}.\n"
+                    "This is likely caused by an error in typing. "
+                    "Please see nested and suppressed exceptions."
+                )
+                info = ", ".join(
+                    "Arg #{} is {}".format(i, args[i])
+                    for i in sorted(already_lit_pos)
+                )
+                raise errors.CompilerError(m.format(info))
+            # Convert requested arguments into a Literal.
+            args = [
+                (types.literal if i in e.requested_args else lambda x: x)(
+                    args[i]
+                )
+                for i, v in enumerate(args)
+            ]
+            # Re-enter compilation with the Literal-ized arguments
+            return_val = self._compile_for_args(*args)
+
+        except errors.TypingError as e:
+            # Intercept typing error that may be due to an argument
+            # that failed to be inferred as a Numba type
+            failed_args = []
+            for i, arg in enumerate(args):
+                val = arg.value if isinstance(arg, OmittedArg) else arg
+                try:
+                    tp = typeof(val, Purpose.argument)
+                except (errors.NumbaValueError, ValueError) as typeof_exc:
+                    failed_args.append((i, str(typeof_exc)))
+                else:
+                    if tp is None:
+                        failed_args.append(
+                            (i, f"cannot determine Numba type of value {val}")
+                        )
+            if failed_args:
+                # Patch error message to ease debugging
+                args_str = "\n".join(
+                    f"- argument {i}: {err}" for i, err in failed_args
+                )
+                msg = (
+                    f"{str(e).rstrip()} \n\nThis error may have been caused "
+                    f"by the following argument(s):\n{args_str}\n"
+                )
+                e.patch_message(msg)
+
+            error_rewrite(e, "typing")
+        except errors.UnsupportedError as e:
+            # Something unsupported is present in the user code, add help info
+            error_rewrite(e, "unsupported_error")
+        except (
+            errors.NotDefinedError,
+            errors.RedefinedError,
+            errors.VerificationError,
+        ) as e:
+            # These errors are probably from an issue with either the code
+            # supplied being syntactically or otherwise invalid
+            error_rewrite(e, "interpreter")
+        except errors.ConstantInferenceError as e:
+            # this is from trying to infer something as constant when it isn't
+            # or isn't supported as a constant
+            error_rewrite(e, "constant_inference")
+        except Exception as e:
+            if config.SHOW_HELP:
+                if hasattr(e, "patch_message"):
+                    help_msg = errors.error_extras["reportable"]
+                    e.patch_message("\n".join((str(e).rstrip(), help_msg)))
+            # ignore the FULL_TRACEBACKS config, this needs reporting!
+            raise e
+        finally:
+            self._types_active_call.clear()
+        return return_val
+
+    def inspect_llvm(self, signature=None):
+        """Get the LLVM intermediate representation generated by compilation.
+
+        Parameters
+        ----------
+        signature : tuple of numba types, optional
+            Specify a signature for which to obtain the LLVM IR. If None, the
+            IR is returned for all available signatures.
+
+        Returns
+        -------
+        llvm : dict[signature, str] or str
+            Either the LLVM IR string for the specified signature, or, if no
+            signature was given, a dictionary mapping signatures to LLVM IR
+            strings.
+        """
+        if signature is not None:
+            lib = self.overloads[signature].library
+            return lib.get_llvm_str()
+
+        return dict((sig, self.inspect_llvm(sig)) for sig in self.signatures)
+
+    def inspect_asm(self, signature=None):
+        """Get the generated assembly code.
+
+        Parameters
+        ----------
+        signature : tuple of numba types, optional
+            Specify a signature for which to obtain the assembly code. If
+            None, the assembly code is returned for all available signatures.
+
+        Returns
+        -------
+        asm : dict[signature, str] or str
+            Either the assembly code for the specified signature, or, if no
+            signature was given, a dictionary mapping signatures to assembly
+            code.
+        """
+        if signature is not None:
+            lib = self.overloads[signature].library
+            return lib.get_asm_str()
+
+        return dict((sig, self.inspect_asm(sig)) for sig in self.signatures)
+
+    def inspect_types(
+        self, file=None, signature=None, pretty=False, style="default", **kwargs
+    ):
+        """Print/return Numba intermediate representation (IR)-annotated code.
+
+        Parameters
+        ----------
+        file : file-like object, optional
+            File to which to print. Defaults to sys.stdout if None. Must be
+            None if ``pretty=True``.
+        signature : tuple of numba types, optional
+            Print/return the intermediate representation for only the given
+            signature. If None, the IR is printed for all available signatures.
+        pretty : bool, optional
+            If True, an Annotate object will be returned that can render the
+            IR with color highlighting in Jupyter and IPython. ``file`` must
+            be None if ``pretty`` is True. Additionally, the ``pygments``
+            library must be installed for ``pretty=True``.
+        style : str, optional
+            Choose a style for rendering. Ignored if ``pretty`` is ``False``.
+            This is directly consumed by ``pygments`` formatters. To see a
+            list of available styles, import ``pygments`` and run
+            ``list(pygments.styles.get_all_styles())``.
+
+        Returns
+        -------
+        annotated : Annotate object, optional
+            Only returned if ``pretty=True``, otherwise this function is only
+            used for its printing side effect. If ``pretty=True``, an Annotate
+            object is returned that can render itself in Jupyter and IPython.
+        """
+        overloads = self.overloads
+        if signature is not None:
+            overloads = {signature: self.overloads[signature]}
+
+        if not pretty:
+            if file is None:
+                file = sys.stdout
+
+            for ver, res in overloads.items():
+                print("%s %s" % (self.py_func.__name__, ver), file=file)
+                print("-" * 80, file=file)
+                print(res.type_annotation, file=file)
+                print("=" * 80, file=file)
+        else:
+            if file is not None:
+                raise ValueError("`file` must be None if `pretty=True`")
+            from numba.core.annotations.pretty_annotate import Annotate
+
+            return Annotate(self, signature=signature, style=style)
+
+    def inspect_cfg(self, signature=None, show_wrapper=None, **kwargs):
+        """
+        For inspecting the CFG of the function.
+
+        By default the CFG of the user function is shown. The *show_wrapper*
+        option can be set to "python" or "cfunc" to show the python wrapper
+        function or the *cfunc* wrapper function, respectively.
+
+        Parameters accepted in kwargs
+        -----------------------------
+        filename : string, optional
+            the name of the output file; if given, the output is written to
+            this file
+        view : bool, optional
+            whether to immediately view the optional output file
+        highlight : bool, set, dict, optional
+            what, if anything, to highlight, options are:
+            { incref : bool, # highlight NRT_incref calls
+              decref : bool, # highlight NRT_decref calls
+              returns : bool, # highlight exits which are normal returns
+              raises : bool, # highlight exits which are from raise
+              meminfo : bool, # highlight calls to NRT*meminfo
+              branches : bool, # highlight true/false branches
+            }
+            Default is True, which sets all of the above to True. Supplying a
+            set of strings is also accepted; these are interpreted as key:True
+            with respect to the above dictionary. e.g. {'incref', 'decref'}
+            would switch on highlighting of increfs and decrefs.
+        interleave: bool, set, dict, optional
+            what, if anything, to interleave in the LLVM IR, options are:
+            { python: bool # interleave python source code with the LLVM IR
+              lineinfo: bool # interleave line information markers with the
+                             # LLVM IR
+            }
+            Default is True, which sets all of the above to True. Supplying a
+            set of strings is also accepted; these are interpreted as key:True
+            with respect to the above dictionary. e.g. {'python',} would
+            switch on interleaving of python source code in the LLVM IR.
+        strip_ir : bool, optional
+            Default is False. If set to True all LLVM IR that is superfluous to
+            that requested in kwarg `highlight` will be removed.
+        show_key : bool, optional
+            Default is True. Create a "key" for the highlighting in the
+            rendered CFG.
+        fontsize : int, optional
+            Default is 8. Set the fontsize in the output to this value.
+        """
+        if signature is not None:
+            cres = self.overloads[signature]
+            lib = cres.library
+            if show_wrapper == "python":
+                fname = cres.fndesc.llvm_cpython_wrapper_name
+            elif show_wrapper == "cfunc":
+                fname = cres.fndesc.llvm_cfunc_wrapper_name
+            else:
+                fname = cres.fndesc.mangled_name
+            return lib.get_function_cfg(fname, py_func=self.py_func, **kwargs)
+
+        return dict(
+            (sig, self.inspect_cfg(sig, show_wrapper=show_wrapper))
+            for sig in self.signatures
+        )
+
+    def inspect_disasm_cfg(self, signature=None):
+        """
+        For inspecting the CFG of the disassembly of the function.
+
+        Requires python package: r2pipe
+        Requires radare2 binary on $PATH.
+        Notebook rendering requires python package: graphviz
+
+        signature : tuple of Numba types, optional
+            Print/return the disassembly CFG for only the given signatures.
+            If None, the CFG is printed for all available signatures.
+        """
+        if signature is not None:
+            cres = self.overloads[signature]
+            lib = cres.library
+            return lib.get_disasm_cfg(cres.fndesc.mangled_name)
+
+        return dict(
+            (sig, self.inspect_disasm_cfg(sig)) for sig in self.signatures
+        )
+
+    def get_annotation_info(self, signature=None):
+        """
+        Gets the annotation information for the function specified by
+        signature. If no signature is supplied a dictionary of signature to
+        annotation information is returned.
+        """
+        signatures = self.signatures if signature is None else [signature]
+        out = collections.OrderedDict()
+        for sig in signatures:
+            cres = self.overloads[sig]
+            ta = cres.type_annotation
+            key = (
+                ta.func_id.filename + ":" + str(ta.func_id.firstlineno + 1),
+                ta.signature,
+            )
+            out[key] = ta.annotate_raw()[key]
+        return out
+
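+    # Illustration (editor's sketch): typical use of the inspection helpers
+    # above, assuming `disp` is a dispatcher with compiled overloads:
+    #
+    #     for sig, ir in disp.inspect_llvm().items():  # dict when signature=None
+    #         print(sig, ir.splitlines()[0])
+    #     disp.inspect_types()                         # annotated IR to stdout
+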
+ """ + assert not kws, "kwargs not handled" + args = tuple([self.typeof_pyval(a) for a in args]) + # The order here must be deterministic for testing purposes, which + # is ensured by the OrderedDict. + sigs = self.nopython_signatures + # This will raise + self.typingctx.resolve_overload( + self.py_func, sigs, args, kws, allow_ambiguous=False + ) + + def _explain_matching_error(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + msg = "No matching definition for argument type(s) %s" % ", ".join( + map(str, args) + ) + raise TypeError(msg) + + def _search_new_conversions(self, *args, **kws): + """ + Callback for the C _Dispatcher object. + Search for approximately matching signatures for the given arguments, + and ensure the corresponding conversions are registered in the C++ + type manager. + """ + assert not kws, "kwargs not handled" + args = [self.typeof_pyval(a) for a in args] + found = False + for sig in self.nopython_signatures: + conv = self.typingctx.install_possible_conversions(args, sig.args) + if conv: + found = True + return found + + def __repr__(self): + return "%s(%s)" % (type(self).__name__, self.py_func) + + def typeof_pyval(self, val): + """ + Resolve the Numba type of Python value *val*. + This is called from numba._dispatcher as a fallback if the native code + cannot decide the type. + """ + try: + tp = typeof(val, Purpose.argument) + except (errors.NumbaValueError, ValueError): + tp = types.pyobject + else: + if tp is None: + tp = types.pyobject + self._types_active_call.add(tp) + return tp + + def _callback_add_timer(self, duration, cres, lock_name): + md = cres.metadata + # md can be None when code is loaded from cache + if md is not None: + timers = md.setdefault("timers", {}) + if lock_name not in timers: + # Only write if the metadata does not exist + timers[lock_name] = duration + else: + msg = f"'{lock_name} metadata is already defined." + raise AssertionError(msg) + + def _callback_add_compiler_timer(self, duration, cres): + return self._callback_add_timer( + duration, cres, lock_name="compiler_lock" + ) + + def _callback_add_llvm_timer(self, duration, cres): + return self._callback_add_timer(duration, cres, lock_name="llvm_lock") + + class _MemoMixin: __uuid = None # A {uuid -> instance} mapping, for deserialization @@ -828,7 +1426,7 @@ def _compile_cached(self, args, return_type): try: retval = self._compile_core(args, return_type) - except TypingError as e: + except errors.TypingError as e: self._failed_cache[key] = e return False, e else: @@ -1316,7 +1914,7 @@ def get_compile_result(self, sig): self.compile(atypes) else: msg = f"{sig} not available and compilation disabled" - raise TypingError(msg) + raise errors.TypingError(msg) return self.overloads[atypes] def recompile(self): @@ -1523,3 +2121,9 @@ def _reduce_states(self): Compiled definitions are discarded. 
""" return dict(py_func=self.py_func, targetoptions=self.targetoptions) + + +# Initialize typeof machinery +_dispatcher.typeof_init( + OmittedArg, dict((str(t), t._code) for t in types.number_domain) +) diff --git a/pyproject.toml b/pyproject.toml index 550a30057..e81e53950 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ build-backend = "setuptools.build_meta" requires = [ "setuptools", "wheel", + "numpy", ] [project] diff --git a/setup.py b/setup.py index 607822221..cb5d4b4db 100644 --- a/setup.py +++ b/setup.py @@ -2,16 +2,127 @@ # SPDX-License-Identifier: BSD-2-Clause import pathlib +import sys -from setuptools import setup +from setuptools import setup, Extension from setuptools.command.build_py import build_py from setuptools.command.editable_wheel import editable_wheel, _TopLevelFinder +from setuptools.command.build_ext import build_ext REDIRECTOR_PTH = "_numba_cuda_redirector.pth" REDIRECTOR_PY = "_numba_cuda_redirector.py" SITE_PACKAGES = pathlib.Path("site-packages") +def get_version(): + """Read version from VERSION file.""" + version_file = pathlib.Path(__file__).parent / "numba_cuda" / "VERSION" + return version_file.read_text().strip() + + +def get_ext_modules(): + """ + Return a list of Extension instances for the setup() call. + """ + # Note we don't import NumPy at the toplevel, since setup.py + # should be able to run without NumPy for pip to discover the + # build dependencies. Need NumPy headers and libm linkage. + import numpy as np + + np_compile_args = { + "include_dirs": [ + np.get_include(), + ], + } + if sys.platform != "win32": + np_compile_args["libraries"] = [ + "m", + ] + + ext_devicearray = Extension( + name="numba_cuda.numba.cuda.cext._devicearray", + sources=["numba_cuda/numba/cuda/cext/_devicearray.cpp"], + depends=[ + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_devicearray.h", + ], + include_dirs=["numba_cuda/numba/cuda/cext"], + extra_compile_args=["-std=c++11"], + ) + + install_name_tool_fixer = [] + if sys.platform == "darwin": + install_name_tool_fixer = ["-headerpad_max_install_names"] + + ext_mviewbuf = Extension( + name="numba_cuda.numba.cuda.cext.mviewbuf", + extra_link_args=install_name_tool_fixer, + sources=["numba_cuda/numba/cuda/cext/mviewbuf.c"], + ) + + dispatcher_sources = [ + "numba_cuda/numba/cuda/cext/_dispatcher.cpp", + "numba_cuda/numba/cuda/cext/_typeof.cpp", + "numba_cuda/numba/cuda/cext/_hashtable.cpp", + "numba_cuda/numba/cuda/cext/typeconv.cpp", + ] + ext_dispatcher = Extension( + name="numba_cuda.numba.cuda.cext._dispatcher", + sources=dispatcher_sources, + depends=[ + "numba_cuda/numba/cuda/cext/_pymodule.h", + "numba_cuda/numba/cuda/cext/_typeof.h", + "numba_cuda/numba/cuda/cext/_hashtable.h", + ], + extra_compile_args=["-std=c++11"], + **np_compile_args, + ) + + # Append our cext dir to include_dirs + ext_dispatcher.include_dirs.append("numba_cuda/numba/cuda/cext") + + return [ext_dispatcher, ext_mviewbuf, ext_devicearray] + + +def is_building(): + """ + Parse the setup.py command and return whether a build is requested. + If False is returned, only an informational command is run. + If True is returned, information about C extensions will have to + be passed to the setup() function. + """ + if len(sys.argv) < 2: + # User forgot to give an argument probably, let setuptools handle that. 
+        return True
+
+    build_commands = [
+        "build",
+        "build_py",
+        "build_ext",
+        "build_clib",
+        "build_scripts",
+        "install",
+        "install_lib",
+        "install_headers",
+        "install_scripts",
+        "install_data",
+        "sdist",
+        "bdist",
+        "bdist_dumb",
+        "bdist_rpm",
+        "bdist_wininst",
+        "check",
+        "build_docs",
+        "bdist_wheel",
+        "bdist_egg",
+        "develop",
+        "easy_install",
+        "test",
+        "editable_wheel",
+    ]
+    return any(bc in sys.argv[1:] for bc in build_commands)
+
+
 # Adapted from https://stackoverflow.com/a/71137790
 class build_py_with_redirector(build_py):  # noqa: N801
     """Include the redirector files in the generated wheel."""
@@ -72,9 +183,52 @@ def _select_strategy(self, name, tag, build_lib):
         return TopLevelFinderWithRedirector(self.distribution, name)
 
 
+cmdclass = {}
+
+numba_be_user_options = [
+    ("werror", None, "Build extensions with -Werror"),
+    ("wall", None, "Build extensions with -Wall"),
+    ("noopt", None, "Build extensions without optimization"),
+]
+
+
+class NumbaBuildExt(build_ext):
+    user_options = build_ext.user_options + numba_be_user_options
+    boolean_options = build_ext.boolean_options + ["werror", "wall", "noopt"]
+
+    def initialize_options(self):
+        super().initialize_options()
+        self.werror = 0
+        self.wall = 0
+        self.noopt = 0
+
+    def run(self):
+        extra_compile_args = []
+        if self.noopt:
+            if sys.platform == "win32":
+                extra_compile_args.append("/Od")
+            else:
+                extra_compile_args.append("-O0")
+        if self.werror:
+            extra_compile_args.append("-Werror")
+        if self.wall:
+            extra_compile_args.append("-Wall")
+        for ext in self.extensions:
+            ext.extra_compile_args.extend(extra_compile_args)
+
+        super().run()
+
+
+cmdclass["build_ext"] = NumbaBuildExt
+cmdclass["build_py"] = build_py_with_redirector
+cmdclass["editable_wheel"] = editable_wheel_with_redirector
+
+if is_building():
+    ext_modules = get_ext_modules()
+else:
+    ext_modules = []
+
 setup(
-    cmdclass={
-        "build_py": build_py_with_redirector,
-        "editable_wheel": editable_wheel_with_redirector,
-    }
+    cmdclass=cmdclass,
+    ext_modules=ext_modules,
 )
diff --git a/site-packages/_numba_cuda_redirector.py b/site-packages/_numba_cuda_redirector.py
index 3c1301ba3..a35894baf 100644
--- a/site-packages/_numba_cuda_redirector.py
+++ b/site-packages/_numba_cuda_redirector.py
@@ -34,7 +34,7 @@ def ensure_initialized(self):
 
         numba_cuda_spec = importlib.util.find_spec("numba_cuda")
 
-    if numba_spec is None:
+    if numba_cuda_spec is None:
         warnings.warn(no_spec_msg.format("numba_cuda"))
         self.initialized = False
         return False
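Note (illustration, not part of the patch): after building with these changes, a quick smoke test is to import the relocated C extensions from their new package (a minimal check, assuming an installed or in-place build):

    from numba.cuda.cext import _devicearray, _dispatcher, mviewbuf

    for mod in (_devicearray, _dispatcher, mviewbuf):
        print(mod.__name__)  # the extensions should resolve under numba.cuda.cext

The custom flags defined in numba_be_user_options above follow standard setuptools option handling, e.g. `python setup.py build_ext --inplace --wall --werror`.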