|
| 1 | +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | +// SPDX-License-Identifier: BSD-2-Clause |
| 3 | + |
| 4 | +/* |
| 5 | + * Helper functions used by Numba CUDA at runtime. |
| 6 | + * This C file is meant to be included after defining the |
| 7 | + * NUMBA_EXPORT_FUNC() and NUMBA_EXPORT_DATA() macros. |
| 8 | + */ |
| 9 | + |
| 10 | +#include "_pymodule.h" |
| 11 | +#include <stddef.h> |
| 12 | + |
| 13 | +/* |
| 14 | + * Unicode helpers |
| 15 | + */ |
| 16 | + |
| 17 | +/* Developer note: |
| 18 | + * |
| 19 | + * The hash value of unicode objects is obtained via: |
| 20 | + * ((PyASCIIObject *)(obj))->hash; |
| 21 | + * The use comes from this definition: |
| 22 | + * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Objects/unicodeobject.c#L119-L120 |
| 23 | + * and it's used extensively throughout the `cpython/Objects/unicodeobject.c` |
| 24 | + * source, not least in `unicode_hash` itself: |
| 25 | + * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Objects/unicodeobject.c#L11662-L11679 |
| 26 | + * |
| 27 | + * The Unicode string struct layouts are described here: |
| 28 | + * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Include/cpython/unicodeobject.h#L82-L161 |
| 29 | + * essentially, all the unicode string layouts start with a `PyASCIIObject` at |
| 30 | + * offset 0 (as of commit 6d43f6f081023b680d9db4542d19b9e382149f0a, somewhere |
| 31 | + * in the 3.8 development cycle). |
| 32 | + * |
| 33 | + * For safety against future CPython internal changes, the code checks that the |
| 34 | + * _base members of the unicode structs are what is expected in 3.7, and that |
| 35 | + * their offset is 0. It then walks the struct to the hash location to make sure |
| 36 | + * the offset is indeed the same as PyASCIIObject->hash. |
| 37 | + * Note: The large condition in the if should evaluate to a compile time |
| 38 | + * constant. |
| 39 | + */ |
| 40 | + |
/* Size in bytes of `member` inside the struct type `structure`.
 * The null pointer is only an operand of sizeof, so it is never dereferenced.
 * The expansion is fully parenthesized so that it always acts as a single
 * operand, whatever expression it is pasted into. */
#define MEMBER_SIZE(structure, member) (sizeof(((structure *)0)->member))
| 42 | + |
| 43 | +NUMBA_EXPORT_FUNC(void *) |
| 44 | +numba_extract_unicode(PyObject *obj, Py_ssize_t *length, int *kind, |
| 45 | + unsigned int *ascii, Py_ssize_t *hash) { |
| 46 | + if (!PyUnicode_READY(obj)) { |
| 47 | + *length = PyUnicode_GET_LENGTH(obj); |
| 48 | + *kind = PyUnicode_KIND(obj); |
| 49 | + /* could also use PyUnicode_IS_ASCII but it is not publicly advertised in https://docs.python.org/3/c-api/unicode.html */ |
| 50 | + *ascii = (unsigned int)(PyUnicode_MAX_CHAR_VALUE(obj) == (0x7f)); |
| 51 | + /* this is here as a crude check for safe casting of all unicode string |
| 52 | + * structs to a PyASCIIObject */ |
| 53 | + if (MEMBER_SIZE(PyCompactUnicodeObject, _base) == sizeof(PyASCIIObject) && |
| 54 | + MEMBER_SIZE(PyUnicodeObject, _base) == sizeof(PyCompactUnicodeObject) && |
| 55 | + offsetof(PyCompactUnicodeObject, _base) == 0 && |
| 56 | + offsetof(PyUnicodeObject, _base) == 0 && |
| 57 | + offsetof(PyCompactUnicodeObject, _base.hash) == offsetof(PyASCIIObject, hash) && |
| 58 | + offsetof(PyUnicodeObject, _base._base.hash) == offsetof(PyASCIIObject, hash) |
| 59 | + ) { |
| 60 | + /* Grab the hash from the type object cache, do not compute it. */ |
| 61 | + *hash = ((PyASCIIObject *)(obj))->hash; |
| 62 | + } |
| 63 | + else { |
| 64 | + /* cast is not safe, fail */ |
| 65 | + return NULL; |
| 66 | + } |
| 67 | + return PyUnicode_DATA(obj); |
| 68 | + } else { |
| 69 | + return NULL; |
| 70 | + } |
| 71 | +} |
0 commit comments