Skip to content

Commit c19aa4f

Browse files
authored
Check and fix chararray string dimension names (#10395)
* MIN: forward variable name down to coders for CharacterArrayCoder * FIX: check character arrays for possible changes in dimension name and length, issue appropriate warnings * add whats-new.rst entry * fix subclass in test * only rewrite dimension name, if new length differs from original length * Refactor out validation logic
1 parent f1738a0 commit c19aa4f

File tree

10 files changed

+105
-28
lines changed

10 files changed

+105
-28
lines changed

doc/whats-new.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ Bug fixes
2828
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
2929
- Fix the SciPy backend for netCDF3 files . (:issue:`8909`, :pull:`10376`)
3030
By `Deepak Cherian <https://github.com/dcherian>`_.
31+
- Check and fix character array string dimension names, issue warnings as needed (:issue:`6352`, :pull:`10395`).
32+
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
33+
3134

3235

3336
Documentation
@@ -36,6 +39,8 @@ Documentation
3639

3740
Internal Changes
3841
~~~~~~~~~~~~~~~~
42+
- Forward variable name down to coders for AbstractWritableDataStore.encode_variable and subclasses. (:pull:`10395`).
43+
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
3944

4045
.. _whats-new.2025.06.1:
4146

xarray/backends/common.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -389,11 +389,11 @@ def encode(self, variables, attributes):
389389
attributes : dict-like
390390
391391
"""
392-
variables = {k: self.encode_variable(v) for k, v in variables.items()}
392+
variables = {k: self.encode_variable(v, name=k) for k, v in variables.items()}
393393
attributes = {k: self.encode_attribute(v) for k, v in attributes.items()}
394394
return variables, attributes
395395

396-
def encode_variable(self, v):
396+
def encode_variable(self, v, name=None):
397397
"""encode one variable"""
398398
return v
399399

@@ -641,7 +641,7 @@ def encode(self, variables, attributes):
641641
variables = {
642642
k: ensure_dtype_not_object(v, name=k) for k, v in variables.items()
643643
}
644-
variables = {k: self.encode_variable(v) for k, v in variables.items()}
644+
variables = {k: self.encode_variable(v, name=k) for k, v in variables.items()}
645645
attributes = {k: self.encode_attribute(v) for k, v in attributes.items()}
646646
return variables, attributes
647647

xarray/backends/h5netcdf_.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,8 @@ def set_dimension(self, name, length, is_unlimited=False):
286286
def set_attribute(self, key, value):
287287
self.ds.attrs[key] = value
288288

289-
def encode_variable(self, variable):
290-
return _encode_nc4_variable(variable)
289+
def encode_variable(self, variable, name=None):
290+
return _encode_nc4_variable(variable, name=name)
291291

292292
def prepare_variable(
293293
self, name, variable, check_encoding=False, unlimited_dims=None

xarray/backends/netCDF4_.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
import numpy as np
1111

12-
from xarray import coding
1312
from xarray.backends.common import (
1413
BACKEND_ENTRYPOINTS,
1514
BackendArray,
@@ -30,6 +29,12 @@
3029
)
3130
from xarray.backends.netcdf3 import encode_nc3_attr_value, encode_nc3_variable
3231
from xarray.backends.store import StoreBackendEntrypoint
32+
from xarray.coding.strings import (
33+
CharacterArrayCoder,
34+
EncodedStringCoder,
35+
create_vlen_dtype,
36+
is_unicode_dtype,
37+
)
3338
from xarray.coding.variables import pop_to
3439
from xarray.core import indexing
3540
from xarray.core.utils import (
@@ -73,7 +78,7 @@ def __init__(self, variable_name, datastore):
7378
# check vlen string dtype in further steps
7479
# it also prevents automatic string concatenation via
7580
# conventions.decode_cf_variable
76-
dtype = coding.strings.create_vlen_dtype(str)
81+
dtype = create_vlen_dtype(str)
7782
self.dtype = dtype
7883

7984
def __setitem__(self, key, value):
@@ -127,12 +132,12 @@ def _getitem(self, key):
127132
return array
128133

129134

130-
def _encode_nc4_variable(var):
135+
def _encode_nc4_variable(var, name=None):
131136
for coder in [
132-
coding.strings.EncodedStringCoder(allows_unicode=True),
133-
coding.strings.CharacterArrayCoder(),
137+
EncodedStringCoder(allows_unicode=True),
138+
CharacterArrayCoder(),
134139
]:
135-
var = coder.encode(var)
140+
var = coder.encode(var, name=name)
136141
return var
137142

138143

@@ -164,7 +169,7 @@ def _nc4_dtype(var):
164169
if "dtype" in var.encoding:
165170
dtype = var.encoding.pop("dtype")
166171
_check_encoding_dtype_is_vlen_string(dtype)
167-
elif coding.strings.is_unicode_dtype(var.dtype):
172+
elif is_unicode_dtype(var.dtype):
168173
dtype = str
169174
elif var.dtype.kind in ["i", "u", "f", "c", "S"]:
170175
dtype = var.dtype
@@ -535,12 +540,12 @@ def set_attribute(self, key, value):
535540
else:
536541
self.ds.setncattr(key, value)
537542

538-
def encode_variable(self, variable):
543+
def encode_variable(self, variable, name=None):
539544
variable = _force_native_endianness(variable)
540545
if self.format == "NETCDF4":
541-
variable = _encode_nc4_variable(variable)
546+
variable = _encode_nc4_variable(variable, name=name)
542547
else:
543-
variable = encode_nc3_variable(variable)
548+
variable = encode_nc3_variable(variable, name=name)
544549
return variable
545550

546551
def prepare_variable(

xarray/backends/netcdf3.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,12 @@ def _maybe_prepare_times(var):
118118
return data
119119

120120

121-
def encode_nc3_variable(var):
121+
def encode_nc3_variable(var, name=None):
122122
for coder in [
123123
coding.strings.EncodedStringCoder(allows_unicode=False),
124124
coding.strings.CharacterArrayCoder(),
125125
]:
126-
var = coder.encode(var)
126+
var = coder.encode(var, name=name)
127127
data = _maybe_prepare_times(var)
128128
data = coerce_nc3_dtype(data)
129129
attrs = encode_nc3_attrs(var.attrs)

xarray/backends/scipy_.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,8 @@ def set_attribute(self, key, value):
227227
value = encode_nc3_attr_value(value)
228228
setattr(self.ds, key, value)
229229

230-
def encode_variable(self, variable):
231-
variable = encode_nc3_variable(variable)
230+
def encode_variable(self, variable, name=None):
231+
variable = encode_nc3_variable(variable, name=name)
232232
return variable
233233

234234
def prepare_variable(

xarray/backends/zarr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -855,8 +855,8 @@ def set_dimensions(self, variables, unlimited_dims=None):
855855
def set_attributes(self, attributes):
856856
_put_attrs(self.zarr_group, attributes)
857857

858-
def encode_variable(self, variable):
859-
variable = encode_zarr_variable(variable)
858+
def encode_variable(self, variable, name=None):
859+
variable = encode_zarr_variable(variable, name=name)
860860
return variable
861861

862862
def encode_attribute(self, a):

xarray/coding/strings.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import re
56
from functools import partial
67

78
import numpy as np
@@ -15,7 +16,7 @@
1516
unpack_for_encoding,
1617
)
1718
from xarray.core import indexing
18-
from xarray.core.utils import module_available
19+
from xarray.core.utils import emit_user_level_warning, module_available
1920
from xarray.core.variable import Variable
2021
from xarray.namedarray.parallelcompat import get_chunked_array_type
2122
from xarray.namedarray.pycompat import is_chunked_array
@@ -113,6 +114,36 @@ def ensure_fixed_length_bytes(var: Variable) -> Variable:
113114
return var
114115

115116

117+
def validate_char_dim_name(strlen, encoding, name) -> str:
118+
"""Check character array dimension naming and size and return it."""
119+
120+
if (char_dim_name := encoding.pop("char_dim_name", None)) is not None:
121+
# 1 - extract all characters up to last number sequence
122+
# 2 - extract last number sequence
123+
match = re.search(r"^(.*?)(\d+)(?!.*\d)", char_dim_name)
124+
if match:
125+
new_dim_name = match.group(1)
126+
if int(match.group(2)) != strlen:
127+
emit_user_level_warning(
128+
f"String dimension naming mismatch on variable {name!r}. {char_dim_name!r} provided by encoding, but data has length of '{strlen}'. Using '{new_dim_name}{strlen}' instead of {char_dim_name!r} to prevent possible naming clash.\n"
129+
"To silence this warning either remove 'char_dim_name' from encoding or provide a fitting name."
130+
)
131+
char_dim_name = f"{new_dim_name}{strlen}"
132+
else:
133+
if (
134+
original_shape := encoding.get("original_shape", [-1])[-1]
135+
) != -1 and original_shape != strlen:
136+
emit_user_level_warning(
137+
f"String dimension length mismatch on variable {name!r}. '{original_shape}' provided by encoding, but data has length of '{strlen}'. Using '{char_dim_name}{strlen}' instead of {char_dim_name!r} to prevent possible naming clash.\n"
138+
f"To silence this warning remove 'original_shape' from encoding."
139+
)
140+
char_dim_name = f"{char_dim_name}{strlen}"
141+
else:
142+
char_dim_name = f"string{strlen}"
143+
144+
return char_dim_name
145+
146+
116147
class CharacterArrayCoder(VariableCoder):
117148
"""Transforms between arrays containing bytes and character arrays."""
118149

@@ -122,10 +153,7 @@ def encode(self, variable, name=None):
122153
dims, data, attrs, encoding = unpack_for_encoding(variable)
123154
if data.dtype.kind == "S" and encoding.get("dtype") is not str:
124155
data = bytes_to_char(data)
125-
if "char_dim_name" in encoding.keys():
126-
char_dim_name = encoding.pop("char_dim_name")
127-
else:
128-
char_dim_name = f"string{data.shape[-1]}"
156+
char_dim_name = validate_char_dim_name(data.shape[-1], encoding, name)
129157
dims = dims + (char_dim_name,)
130158
return Variable(dims, data, attrs, encoding)
131159

xarray/tests/test_coding_strings.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,45 @@ def test_CharacterArrayCoder_char_dim_name(original, expected_char_dim_name) ->
139139
assert roundtripped.dims[-1] == original.dims[-1]
140140

141141

142+
@pytest.mark.parametrize(
143+
[
144+
"original",
145+
"expected_char_dim_name",
146+
"expected_char_dim_length",
147+
"warning_message",
148+
],
149+
[
150+
(
151+
Variable(("x",), [b"ab", b"cde"], encoding={"char_dim_name": "foo4"}),
152+
"foo3",
153+
3,
154+
"String dimension naming mismatch",
155+
),
156+
(
157+
Variable(
158+
("x",),
159+
[b"ab", b"cde"],
160+
encoding={"original_shape": (2, 4), "char_dim_name": "foo"},
161+
),
162+
"foo3",
163+
3,
164+
"String dimension length mismatch",
165+
),
166+
],
167+
)
168+
def test_CharacterArrayCoder_dim_mismatch_warnings(
169+
original, expected_char_dim_name, expected_char_dim_length, warning_message
170+
) -> None:
171+
coder = strings.CharacterArrayCoder()
172+
with pytest.warns(UserWarning, match=warning_message):
173+
encoded = coder.encode(original)
174+
roundtripped = coder.decode(encoded)
175+
assert encoded.dims[-1] == expected_char_dim_name
176+
assert encoded.sizes[expected_char_dim_name] == expected_char_dim_length
177+
assert roundtripped.encoding["char_dim_name"] == expected_char_dim_name
178+
assert roundtripped.dims[-1] == original.dims[-1]
179+
180+
142181
def test_StackedBytesArray() -> None:
143182
array = np.array([[b"a", b"b", b"c"], [b"d", b"e", b"f"]], dtype="S")
144183
actual = strings.StackedBytesArray(array)

xarray/tests/test_conventions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -555,10 +555,10 @@ def test_decode_cf_time_kwargs(self, time_unit) -> None:
555555

556556

557557
class CFEncodedInMemoryStore(WritableCFDataStore, InMemoryDataStore):
558-
def encode_variable(self, var):
558+
def encode_variable(self, var, name=None):
559559
"""encode one variable"""
560560
coder = coding.strings.EncodedStringCoder(allows_unicode=True)
561-
var = coder.encode(var)
561+
var = coder.encode(var, name=name)
562562
return var
563563

564564

0 commit comments

Comments
 (0)