|
| 1 | +import os |
| 2 | +import numpy as np |
| 3 | + |
| 4 | +from numba import cuda |
| 5 | +from numba.core.runtime.nrt import _nrt_mstats |
| 6 | +from numba.cuda.cudadrv.driver import Linker, launch_kernel |
| 7 | +from numba.cuda.cudadrv import devices |
| 8 | +from numba.cuda.api import get_current_device |
| 9 | + |
| 10 | + |
| 11 | +class _Runtime: |
| 12 | + _instance = None |
| 13 | + |
| 14 | + def __new__(cls, *args, **kwargs): |
| 15 | + if cls._instance is None: |
| 16 | + cls._instance = super(_Runtime, cls).__new__(cls, *args, **kwargs) |
| 17 | + return cls._instance |
| 18 | + |
| 19 | + def __init__(self): |
| 20 | + self._memsys_module = None |
| 21 | + self._memsys = None |
| 22 | + |
| 23 | + self._initialized = False |
| 24 | + |
| 25 | + def _compile_memsys_module(self): |
| 26 | + memsys_mod = os.path.join( |
| 27 | + os.path.dirname(os.path.abspath(__file__)), |
| 28 | + "memsys.cu" |
| 29 | + ) |
| 30 | + cc = get_current_device().compute_capability |
| 31 | + |
| 32 | + linker = Linker.new(cc=cc) |
| 33 | + linker.add_cu_file(memsys_mod) |
| 34 | + cubin = linker.complete() |
| 35 | + |
| 36 | + ctx = devices.get_context() |
| 37 | + module = ctx.create_module_image(cubin) |
| 38 | + |
| 39 | + self._memsys_module = module |
| 40 | + |
| 41 | + def _ensure_allocate(self): |
| 42 | + if self._memsys is not None: |
| 43 | + return |
| 44 | + |
| 45 | + self.allocate() |
| 46 | + |
| 47 | + def allocate(self): |
| 48 | + from numba.cuda import device_array |
| 49 | + |
| 50 | + if self._memsys_module is None: |
| 51 | + self._compile_memsys_module() |
| 52 | + |
| 53 | + if self._memsys is None: |
| 54 | + # Allocate space for NRT_MemSys |
| 55 | + # TODO: determine the size of NRT_MemSys at runtime |
| 56 | + self._memsys = device_array((40,), dtype="i1") |
| 57 | + |
| 58 | + def _single_thread_launch(self, module, stream, name, params=()): |
| 59 | + func = module.get_function(name) |
| 60 | + launch_kernel( |
| 61 | + func.handle, |
| 62 | + 1, 1, 1, |
| 63 | + 1, 1, 1, |
| 64 | + 0, |
| 65 | + stream, |
| 66 | + params, |
| 67 | + cooperative=False |
| 68 | + ) |
| 69 | + |
| 70 | + def _ensure_initialize(self, stream): |
| 71 | + if self._initialized: |
| 72 | + return |
| 73 | + |
| 74 | + self.initialize(stream) |
| 75 | + |
| 76 | + def initialize(self, stream): |
| 77 | + if self._memsys is None: |
| 78 | + raise RuntimeError( |
| 79 | + "Please allocate NRT Memsys first before initializing.") |
| 80 | + |
| 81 | + self._single_thread_launch( |
| 82 | + self._memsys_module, stream, "NRT_MemSys_init") |
| 83 | + self._initialized = True |
| 84 | + |
| 85 | + def enable(self, stream): |
| 86 | + self._single_thread_launch( |
| 87 | + self._memsys_module, stream, "NR_MemSys_enable") |
| 88 | + |
| 89 | + def disable(self, stream): |
| 90 | + self._single_thread_launch( |
| 91 | + self._memsys_module, stream, "NR_MemSys_disable") |
| 92 | + |
| 93 | + def _copy_memsys_to_host(self, stream=0): |
| 94 | + self._ensure_allocate() |
| 95 | + self._ensure_initialize(stream) |
| 96 | + |
| 97 | + # Q: What stream should we execute this on? |
| 98 | + # read the stats |
| 99 | + dt = np.dtype([ |
| 100 | + ('alloc', np.uint64), |
| 101 | + ('free', np.uint64), |
| 102 | + ('mi_alloc', np.uint64), |
| 103 | + ('mi_free', np.uint64) |
| 104 | + ]) |
| 105 | + |
| 106 | + stats_for_read = cuda.managed_array(1, dt) |
| 107 | + |
| 108 | + self._single_thread_launch( |
| 109 | + self._memsys_module, |
| 110 | + stream, |
| 111 | + "NRT_MemSys_read", |
| 112 | + [stats_for_read.device_ctypes_pointer] |
| 113 | + ) |
| 114 | + cuda.synchronize() |
| 115 | + |
| 116 | + return stats_for_read[0] |
| 117 | + |
| 118 | + def get_allocation_stats(self): |
| 119 | + memsys = self._copy_memsys_to_host() |
| 120 | + return _nrt_mstats( |
| 121 | + alloc=memsys.alloc, |
| 122 | + free=memsys.free, |
| 123 | + mi_alloc=memsys.mi_alloc, |
| 124 | + mi_free=memsys.mi_free |
| 125 | + ) |
| 126 | + |
| 127 | + def set_memsys_to_module(self, module, stream): |
| 128 | + if self._memsys is None: |
| 129 | + raise RuntimeError( |
| 130 | + "Please allocate NRT Memsys first before initializing.") |
| 131 | + |
| 132 | + self._single_thread_launch( |
| 133 | + module, |
| 134 | + stream, |
| 135 | + "NRT_MemSys_set", |
| 136 | + [self._memsys.device_ctypes_pointer,] |
| 137 | + ) |
| 138 | + |
| 139 | + |
| 140 | +rtsys = _Runtime() |
0 commit comments