-
Notifications
You must be signed in to change notification settings - Fork 51
Open
Milestone
Description
What I did
I created kerchunk reference files following #381 and tried to combine them via xr.merge.
import xarray as xr
import tempfile
import json
from pathlib import Path
import virtualizarr as vz
import os
# Build two small example datasets. Both use the same "y" labels, while
# their "x" labels do not overlap.
dims = ("x", "y")
ds1 = xr.Dataset(
    data_vars={
        "a": (dims, [[1, 2], [3, 4]]),
        "b": (dims, [[10, 20], [30, 40]]),
    },
    coords={"x": [10, 20], "y": [1, 2]},
)
ds2 = xr.Dataset(
    data_vars={
        "c": (dims, [[5, 6], [7, 8]]),
        "d": (dims, [[50, 60], [70, 80]]),
    },
    coords={"x": [1, 2], "y": [1, 2]},
)

# Turn each dataset into a kerchunk reference dict via the virtualizarr accessor.
ref1, ref2 = (ds.virtualize.to_kerchunk() for ds in (ds1, ds2))
# 'Outlining' all references, as inlined data is not yet supported:
#   NotImplementedError: Outlining all references as inlined data is not yet supported. [ToDo]
#
# Use mkdtemp() rather than TemporaryDirectory().name: a TemporaryDirectory
# object that is not kept alive removes its directory as soon as it is
# finalized, leaving only a dangling path behind. mkdtemp() directories
# persist until they are removed explicitly.
tempdir1 = Path(tempfile.mkdtemp())
tempdir2 = Path(tempfile.mkdtemp())
def outline_references(ref: dict, folder: Path = None) -> dict:
    """
    Write the inlined entries of a kerchunk reference dict to files.

    Virtualizarr currently does not support inlined references.
    To open references with virtualizarr, the references must be written to a file.
    Except the .zarray, .zattrs and .zgroup files, all references are written to disk.

    Parameters
    ----------
    ref : dict
        Reference dict holding a ``"refs"`` mapping. It is modified in place.
    folder : Path, optional
        Directory that receives one file per inlined key (sub-directories are
        created as needed). When omitted, a new temporary directory is created.

    Returns
    -------
    dict
        The same ``ref`` object; every inlined string entry whose key does not
        name a dot-file is replaced by ``[path, 0, length]``, where ``length``
        is the number of bytes written to ``path``.
    """
    import base64  # local import: not part of the file's top-level imports

    if folder is None:
        folder = tempfile.mkdtemp()
    folder = Path(folder)

    refs = ref["refs"]
    # Only values of existing keys are reassigned, so the dict does not change
    # size while it is being iterated.
    for k, v in refs.items():
        # Metadata entries (.zarray, .zattrs, .zgroup) stay inlined.
        if os.path.basename(k).startswith('.'):
            continue
        # Non-string values are already [path, offset, length] style references.
        if not isinstance(v, str):
            continue

        # Inline values are either plain text or "base64:"-prefixed binary;
        # the file must contain the raw bytes the reference stands for.
        if v.startswith("base64:"):
            payload = base64.b64decode(v[len("base64:"):])
        else:
            payload = v.encode("utf-8")

        file = folder / k
        file.parent.mkdir(parents=True, exist_ok=True)
        file.write_bytes(payload)
        # The length must be the on-disk byte count, not the size of the
        # Python string object.
        refs[k] = [str(file), 0, len(payload)]
    return ref
# Replace the inlined entries of each reference dict with on-disk files.
ref1 = outline_references(ref1, tempdir1)
ref2 = outline_references(ref2, tempdir2)

# Write references to disk (open_virtual_dataset expects a string)
for json_name, ref in (("ref1.json", ref1), ("ref2.json", ref2)):
    with open(json_name, "w") as f:
        json.dump(ref, f)

# Note this section requires the modification in #381
vds1, vds2 = (
    vz.open_virtual_dataset(json_name, filetype='kerchunk')
    for json_name in ("ref1.json", "ref2.json")
)
xr.merge([vds1, vds2])

What happened
TypeError Traceback (most recent call last)
Cell In[20], line 66
63 vds1 = vz.open_virtual_dataset("ref1.json", filetype='kerchunk')
64 vds2 = vz.open_virtual_dataset("ref2.json", filetype='kerchunk')
---> 66 xr.merge([vds1, vds2])
File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:976, in merge(objects, compat, join, fill_value, combine_attrs)
973 obj = obj.to_dataset()
974 dict_like_objects.append(obj)
--> 976 merge_result = merge_core(
977 dict_like_objects,
978 compat,
979 join,
980 combine_attrs=combine_attrs,
981 fill_value=fill_value,
982 )
983 return Dataset._construct_direct(**merge_result._asdict())
File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:701, in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value, skip_align_args)
699 collected = collect_variables_and_indexes(aligned, indexes=indexes)
700 prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
--> 701 variables, out_indexes = merge_collected(
702 collected, prioritized, compat=compat, combine_attrs=combine_attrs
703 )
705 dims = calculate_dimensions(variables)
707 coord_names, noncoord_names = determine_coords(coerced)
File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:290, in merge_collected(grouped, prioritized, compat, combine_attrs, equals)
288 variables = [variable for variable, _ in elements_list]
289 try:
--> 290 merged_vars[name] = unique_variable(
291 name, variables, compat, equals.get(name, None)
292 )
293 except MergeError:
294 if compat != "minimal":
295 # we need more than "minimal" compatibility (for which
296 # we drop conflicting coordinates)
File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:137, in unique_variable(name, variables, compat, equals)
133 break
135 if equals is None:
136 # now compare values with minimum number of computes
--> 137 out = out.compute()
138 for var in variables[1:]:
139 equals = getattr(out, compat)(var)
File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/variable.py:1026, in Variable.compute(self, **kwargs)
1008 """Manually trigger loading of this variable's data from disk or a
1009 remote source into memory and return a new variable. The original is
1010 left unaltered.
(...)
1023 dask.array.compute
1024 """
1025 new = self.copy(deep=False)
-> 1026 return new.load(**kwargs)
File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/variable.py:1004, in Variable.load(self, **kwargs)
987 def load(self, **kwargs):
988 """Manually trigger loading of this variable's data from disk or a
989 remote source into memory and return this variable.
990
(...)
1002 dask.array.compute
1003 """
-> 1004 self._data = to_duck_array(self._data, **kwargs)
1005 return self
File ~/virtualizarr/lib/python3.10/site-packages/xarray/namedarray/pycompat.py:129, in to_duck_array(data, **kwargs)
126 from xarray.namedarray.parallelcompat import get_chunked_array_type
128 if is_chunked_array(data):
--> 129 chunkmanager = get_chunked_array_type(data)
130 loaded_data, *_ = chunkmanager.compute(data, **kwargs) # type: ignore[var-annotated]
131 return loaded_data
File ~/virtualizarr/lib/python3.10/site-packages/xarray/namedarray/parallelcompat.py:158, in get_chunked_array_type(*args)
152 selected = [
153 chunkmanager
154 for chunkmanager in chunkmanagers.values()
155 if chunkmanager.is_chunked_array(chunked_arr)
156 ]
157 if not selected:
--> 158 raise TypeError(
159 f"Could not find a Chunk Manager which recognises type {type(chunked_arr)}"
160 )
161 elif len(selected) >= 2:
162 raise TypeError(f"Multiple ChunkManagers recognise type {type(chunked_arr)}")
TypeError: Could not find a Chunk Manager which recognises type <class 'virtualizarr.manifests.array.ManifestArray'>

What I expected
I expected the virtual equivalent to the "real" datasets to return:
>>> xr.merge([ds1, ds2])
<xarray.Dataset> Size: 304B
Dimensions: (x: 4, y: 2)
Coordinates:
* x (x) int64 32B 1 2 10 20
* y (y) int64 16B 1 2
Data variables:
a (x, y) float64 64B nan nan nan nan 1.0 2.0 3.0 4.0
b (x, y) float64 64B nan nan nan nan 10.0 20.0 30.0 40.0
c (x, y) float64 64B 5.0 6.0 7.0 8.0 nan nan nan nan
d (x, y) float64 64B 50.0 60.0 70.0 80.0 nan nan nan nan
Attributes:
coordinates: x y

Environment
vz.__version__ = 1.2.1.dev19+g0d2d6ab
norlandrhagen
Metadata
Metadata
Assignees
Labels
No labels