Skip to content

Commit 0490a55

Browse files
committed
initial
1 parent e7125bf commit 0490a55

File tree

8 files changed

+232
-25
lines changed

8 files changed

+232
-25
lines changed

numba_cuda/numba/cuda/cudadrv/nvrtc.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,11 @@ def compile(src, name, cc):
240240
cudadrv_path = os.path.dirname(os.path.abspath(__file__))
241241
numba_cuda_path = os.path.dirname(cudadrv_path)
242242
numba_include = f'-I{numba_cuda_path}'
243-
options = [arch, *cuda_include, numba_include, '-rdc', 'true']
243+
244+
nrt_path = os.path.join(numba_cuda_path, "runtime")
245+
nrt_include = f'-I{nrt_path}'
246+
247+
options = [arch, *cuda_include, numba_include, nrt_include, '-rdc', 'true']
244248

245249
if nvrtc.get_version() < (12, 0):
246250
options += ["-std=c++17"]

numba_cuda/numba/cuda/dispatcher.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from numba.cuda.errors import (missing_launch_config_msg,
2222
normalize_kernel_dimensions)
2323
from numba.cuda import types as cuda_types
24+
from numba.cuda.runtime.nrt import rtsys
2425

2526
from numba import cuda
2627
from numba import _dispatcher
@@ -340,6 +341,8 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
340341
# Prepare kernel
341342
cufunc = self._codelibrary.get_cufunc()
342343

344+
rtsys.allocate()
345+
343346
if self.debug:
344347
excname = cufunc.name + "__errcode__"
345348
excmem, excsz = cufunc.module.get_global_symbol(excname)
@@ -361,6 +364,9 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
361364

362365
stream_handle = stream and stream.handle or zero_stream
363366

367+
rtsys.set_memsys_to_module(cufunc.module, stream_handle)
368+
rtsys.initialize()
369+
364370
# Invoke kernel
365371
driver.launch_kernel(cufunc.handle,
366372
*griddim,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from numba.cuda.runtime.nrt import rtsys # noqa: F401
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#include "memsys.cuh"
2+
3+
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
4+
{
5+
TheMSys = memsys_ptr;
6+
}
7+
8+
extern "C" __global__ void NRT_MemSys_read(uint64_t *managed_memsys)
9+
{
10+
managed_memsys[0] = TheMSys->stats.alloc;
11+
managed_memsys[1] = TheMSys->stats.free;
12+
managed_memsys[2] = TheMSys->stats.mi_alloc;
13+
managed_memsys[3] = TheMSys->stats.mi_free;
14+
}
15+
16+
extern "C" __global__ void NRT_MemSys_init(void)
17+
{
18+
TheMSys->stats.enabled = false;
19+
TheMSys->stats.alloc = 0;
20+
TheMSys->stats.free = 0;
21+
TheMSys->stats.mi_alloc = 0;
22+
TheMSys->stats.mi_free = 0;
23+
}
24+
25+
extern "C" __global__ void NRT_MemSys_enable(void)
26+
{
27+
TheMSys->stats.enabled = true;
28+
}
29+
30+
extern "C" __global__ void NRT_MemSys_disable(void)
31+
{
32+
TheMSys->stats.enabled = false;
33+
}
34+
35+
extern "C" __global__ void NRT_MemSys_print(void)
36+
{
37+
if (TheMSys != nullptr)
38+
{
39+
printf("TheMSys->stats.enabled %d\n", TheMSys->stats.enabled);
40+
printf("TheMSys->stats.alloc %d\n", TheMSys->stats.alloc);
41+
printf("TheMSys->stats.free %d\n", TheMSys->stats.free);
42+
printf("TheMSys->stats.mi_alloc %d\n", TheMSys->stats.mi_alloc);
43+
printf("TheMSys->stats.mi_free %d\n", TheMSys->stats.mi_free);
44+
} else {
45+
printf("TheMsys is null.\n");
46+
}
47+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#include <cuda/atomic>
2+
3+
// Globally needed variables
4+
struct NRT_MemSys {
5+
struct {
6+
bool enabled;
7+
cuda::atomic<size_t, cuda::thread_scope_device> alloc;
8+
cuda::atomic<size_t, cuda::thread_scope_device> free;
9+
cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
10+
cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
11+
} stats;
12+
};
13+
14+
/* The Memory System object */
15+
__device__ NRT_MemSys* TheMSys;
16+
17+
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr);

numba_cuda/numba/cuda/runtime/nrt.cu

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
#include <cuda/atomic>
55

6+
#include "memsys.cuh"
7+
68
typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info);
79
typedef void (*NRT_dealloc_func)(void* ptr, void* dealloc_info);
810

@@ -18,29 +20,20 @@ struct MemInfo {
1820
};
1921
}
2022

21-
// Globally needed variables
22-
struct NRT_MemSys {
23-
struct {
24-
bool enabled;
25-
cuda::atomic<size_t, cuda::thread_scope_device> alloc;
26-
cuda::atomic<size_t, cuda::thread_scope_device> free;
27-
cuda::atomic<size_t, cuda::thread_scope_device> mi_alloc;
28-
cuda::atomic<size_t, cuda::thread_scope_device> mi_free;
29-
} stats;
30-
};
23+
extern "C" __global__ void NRT_MemSys_set(NRT_MemSys *memsys_ptr)
24+
{
25+
TheMSys = memsys_ptr;
26+
}
3127

3228
static __device__ void *nrt_allocate_meminfo_and_data_align(size_t size, unsigned align, NRT_MemInfo **mi);
3329
static __device__ void *nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo **mi_out);
3430
extern "C" __device__ void* NRT_Allocate_External(size_t size);
3531

36-
/* The Memory System object */
37-
__device__ NRT_MemSys* TheMSys;
38-
3932
extern "C" __device__ void* NRT_Allocate(size_t size)
4033
{
4134
void* ptr = NULL;
4235
ptr = malloc(size);
43-
// if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
36+
if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
4437
return ptr;
4538
}
4639

@@ -49,14 +42,13 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
4942
size_t size,
5043
NRT_dtor_function dtor,
5144
void* dtor_info)
52-
// NRT_MemSys* TheMSys)
5345
{
5446
mi->refct = 1; /* starts with 1 refct */
5547
mi->dtor = dtor;
5648
mi->dtor_info = dtor_info;
5749
mi->data = data;
5850
mi->size = size;
59-
// if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
51+
if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
6052
}
6153

6254
extern "C"
@@ -71,7 +63,7 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
7163
extern "C" __device__ void NRT_Free(void* ptr)
7264
{
7365
free(ptr);
74-
//if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
66+
if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
7567
}
7668

7769
extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
@@ -82,8 +74,9 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
8274
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
8375
{
8476
NRT_dealloc(mi);
85-
//if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
77+
if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
8678
}
79+
8780
extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
8881
{
8982
if (mi->dtor) /* We have a destructor */
@@ -158,10 +151,10 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
158151
ptr = malloc(size);
159152
//NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
160153

161-
//if (TheMSys.stats.enabled)
162-
//{
163-
// TheMSys.stats.alloc++;
164-
//}
154+
if (TheMSys->stats.enabled)
155+
{
156+
TheMSys->stats.alloc++;
157+
}
165158
return ptr;
166159
}
167160

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
import os
2+
import numpy as np
3+
4+
from numba import cuda
5+
from numba.core.runtime.nrt import _nrt_mstats
6+
from numba.cuda.cudadrv.driver import Linker, launch_kernel
7+
from numba.cuda.cudadrv import devices
8+
from numba.cuda.api import get_current_device
9+
10+
11+
class _Runtime:
12+
_instance = None
13+
14+
def __new__(cls, *args, **kwargs):
15+
if cls._instance is None:
16+
cls._instance = super(_Runtime, cls).__new__(cls, *args, **kwargs)
17+
return cls._instance
18+
19+
def __init__(self):
20+
self._memsys_module = None
21+
self._memsys = None
22+
23+
self._initialized = False
24+
25+
def _compile_memsys_module(self):
26+
memsys_mod = os.path.join(
27+
os.path.dirname(os.path.abspath(__file__)),
28+
"memsys.cu"
29+
)
30+
cc = get_current_device().compute_capability
31+
32+
linker = Linker.new(cc=cc)
33+
linker.add_cu_file(memsys_mod)
34+
cubin = linker.complete()
35+
36+
ctx = devices.get_context()
37+
module = ctx.create_module_image(cubin)
38+
39+
self._memsys_module = module
40+
41+
def _ensure_allocate(self):
42+
if self._memsys is not None:
43+
return
44+
45+
self.allocate()
46+
47+
def allocate(self):
48+
from numba.cuda import device_array
49+
50+
if self._memsys_module is None:
51+
self._compile_memsys_module()
52+
53+
if self._memsys is None:
54+
# Allocate space for NRT_MemSys
55+
# TODO: determine the size of NRT_MemSys at runtime
56+
self._memsys = device_array((40,), dtype="i1")
57+
58+
def _single_thread_launch(self, module, stream, name, params=()):
59+
func = module.get_function(name)
60+
launch_kernel(
61+
func.handle,
62+
1, 1, 1,
63+
1, 1, 1,
64+
0,
65+
stream,
66+
params,
67+
cooperative=False
68+
)
69+
70+
def _ensure_initialize(self, stream):
71+
if self._initialized:
72+
return
73+
74+
self.initialize(stream)
75+
76+
def initialize(self, stream):
77+
if self._memsys is None:
78+
raise RuntimeError(
79+
"Please allocate NRT Memsys first before initializing.")
80+
81+
self._single_thread_launch(
82+
self._memsys_module, stream, "NRT_MemSys_init")
83+
self._initialized = True
84+
85+
def enable(self, stream):
86+
self._single_thread_launch(
87+
self._memsys_module, stream, "NR_MemSys_enable")
88+
89+
def disable(self, stream):
90+
self._single_thread_launch(
91+
self._memsys_module, stream, "NR_MemSys_disable")
92+
93+
def _copy_memsys_to_host(self, stream=0):
94+
self._ensure_allocate()
95+
self._ensure_initialize(stream)
96+
97+
# Q: What stream should we execute this on?
98+
# read the stats
99+
dt = np.dtype([
100+
('alloc', np.uint64),
101+
('free', np.uint64),
102+
('mi_alloc', np.uint64),
103+
('mi_free', np.uint64)
104+
])
105+
106+
stats_for_read = cuda.managed_array(1, dt)
107+
108+
self._single_thread_launch(
109+
self._memsys_module,
110+
stream,
111+
"NRT_MemSys_read",
112+
[stats_for_read.device_ctypes_pointer]
113+
)
114+
cuda.synchronize()
115+
116+
return stats_for_read[0]
117+
118+
def get_allocation_stats(self):
119+
memsys = self._copy_memsys_to_host()
120+
return _nrt_mstats(
121+
alloc=memsys.alloc,
122+
free=memsys.free,
123+
mi_alloc=memsys.mi_alloc,
124+
mi_free=memsys.mi_free
125+
)
126+
127+
def set_memsys_to_module(self, module, stream):
128+
if self._memsys is None:
129+
raise RuntimeError(
130+
"Please allocate NRT Memsys first before initializing.")
131+
132+
self._single_thread_launch(
133+
module,
134+
stream,
135+
"NRT_MemSys_set",
136+
[self._memsys.device_ctypes_pointer,]
137+
)
138+
139+
140+
rtsys = _Runtime()

numba_cuda/numba/cuda/tests/nrt/test_nrt.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import numpy as np
44
import unittest
55
from unittest.mock import patch
6-
from numba.core.runtime import rtsys
6+
from numba.cuda.runtime import rtsys
77
from numba.tests.support import EnableNRTStatsMixin
88
from numba.cuda.testing import CUDATestCase
99

@@ -19,7 +19,6 @@ def setUp(self):
1919
gc.collect()
2020
super(TestNrtRefCt, self).setUp()
2121

22-
@unittest.expectedFailure
2322
def test_no_return(self):
2423
"""
2524
Test issue #1291

0 commit comments

Comments
 (0)