Commit 3b39252

Expose DiskArrays.cache (#417)

* forward DiskArrays.cache
* add some docs
* bump version

1 parent 664590f · commit 3b39252

File tree: 5 files changed, +179 −133 lines

Project.toml (1 addition, 1 deletion)

````diff
@@ -1,7 +1,7 @@
 name = "YAXArrays"
 uuid = "c21b50f5-aa40-41ea-b809-c0f5e47bfa5c"
 authors = ["Fabian Gans <[email protected]>"]
-version = "0.5.9"
+version = "0.5.10"

 [deps]
 CFTime = "179af706-886a-5703-950a-314cd64e0468"
````

docs/src/UserGuide/cache.md (18 additions, 0 deletions)

`````diff
@@ -0,0 +1,18 @@
+# Caching YAXArrays
+
+For some applications, such as interactive plotting of large datasets, accessing the same data several times cannot be avoided. In these cases it can be useful to store recently accessed data in a cache. In YAXArrays this is easily achieved with the `cache` function. For example, if we open a large dataset from a remote source and want to keep data in a cache of size 500 MB, one can use:
+
+````julia
+using YAXArrays, Zarr
+ds = open_dataset("path/to/source")
+cachesize = 500 #MB
+cache(ds,maxsize = cachesize)
+````
+
+The above wraps every array in the dataset into its own cache, with the 500 MB distributed equally across the arrays.
+Alternatively, individual caches can be applied to single `YAXArray`s:
+
+````julia
+yax = ds.avariable
+cache(yax,maxsize = 1000)
+````
`````
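The diff above only exposes the wrapper; the actual caching lives inside DiskArrays. As a rough intuition for what such a cache buys you, here is a self-contained sketch of an LRU chunk cache in plain Julia (the `load_chunk`/`cached_chunk` names and the chunk-count capacity are illustrative, not part of the YAXArrays or DiskArrays API):

````julia
# Minimal sketch of the chunk-caching idea (illustrative only; YAXArrays
# delegates the real work to DiskArrays.cache).
reads = Ref(0)                              # counts simulated backend reads
load_chunk(i) = (reads[] += 1; fill(i, 4))  # stand-in for a slow chunk read

cache_store = Dict{Int,Vector{Int}}()
order = Int[]                               # least recently used chunk first
maxchunks = 2                               # capacity in chunks, not bytes

function cached_chunk(i)
    if haskey(cache_store, i)
        deleteat!(order, findfirst(==(i), order))  # hit: mark recently used
    else
        length(order) >= maxchunks && delete!(cache_store, popfirst!(order))
        cache_store[i] = load_chunk(i)             # miss: read from backend
    end
    push!(order, i)
    cache_store[i]
end

cached_chunk(1); cached_chunk(1)   # second access is served from the cache
@assert reads[] == 1
cached_chunk(2); cached_chunk(3)   # chunk 1 is evicted (capacity 2)
cached_chunk(1)                    # must be re-read after eviction
@assert reads[] == 4
````

Repeated accesses to the same region then cost one backend read instead of many, which is exactly the interactive-plotting scenario the docs describe.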

src/Cubes/Cubes.jl (3 additions, 2 deletions)

````diff
@@ -3,7 +3,7 @@ The functions provided by YAXArrays are supposed to work on different types of c
 Data types that
 """
 module Cubes
-using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset, GridChunks
+using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset, GridChunks, cache
 using Distributed: myid
 using Dates: TimeType, Date
 using IntervalSets: Interval, (..)
@@ -17,7 +17,7 @@ using Tables: istable, schema, columns
 using DimensionalData: DimensionalData as DD, AbstractDimArray, NoName
 import DimensionalData: name

-export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray, setchunks
+export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray, setchunks, cache

 """
 This function calculates a subset of a cube's data
@@ -179,6 +179,7 @@ function Base.permutedims(c::YAXArray, p)
     newchunks = DiskArrays.GridChunks(eachchunk(c).chunks[collect(dimnums)])
     YAXArray(newdims, newdata, c.properties, newchunks, c.cleaner)
 end
+DiskArrays.cache(a::YAXArray;maxsize=1000) = DD.rebuild(a,cache(a.data;maxsize))

 # DimensionalData overloads
````
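The new one-line method delegates to `DD.rebuild`: it swaps the wrapped data for a cached version while keeping dimensions and properties intact. A minimal sketch of that rebuild pattern, using a hypothetical `MiniArray` type rather than the real `YAXArray` (so it runs without DimensionalData installed):

````julia
# Sketch of the rebuild pattern (hypothetical MiniArray, not the real
# YAXArray): only the data field is swapped, metadata survives unchanged.
struct MiniArray{T}
    data::T
    dims::Tuple
    properties::Dict{String,Any}
end

# Analogue of DD.rebuild: same metadata, new data container.
rebuild(a::MiniArray, newdata) = MiniArray(newdata, a.dims, a.properties)

# Analogue of DiskArrays.cache for this toy type; a plain copy stands in
# for wrapping the data in a caching layer.
cache_toy(a::MiniArray; maxsize = 1000) = rebuild(a, copy(a.data))

a = MiniArray(rand(3, 4), (:lon, :lat), Dict{String,Any}("units" => "K"))
b = cache_toy(a)
@assert b.dims === a.dims && b.properties === a.properties
@assert b.data !== a.data   # the data container was replaced
````

Because only the data field changes, the cached array keeps behaving like the original in every axis- and metadata-aware operation.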

src/DatasetAPI/Datasets.jl (9 additions, 0 deletions)

````diff
@@ -145,6 +145,15 @@ function Base.getindex(x::Dataset, i::Vector{Symbol})
     cubesnew = [j => x.cubes[j] for j in i]
     Dataset(; cubesnew...)
 end
+function DiskArrays.cache(ds::Dataset;maxsize=1000)
+    #Distribute cache size equally across cubes
+    maxsize = maxsize ÷ length(ds.cubes)
+    cachedcubes = OrderedDict{Symbol,YAXArray}(
+        k => DiskArrays.cache(ds.cubes[k];maxsize) for k in keys(ds.cubes)
+    )
+    Dataset(cachedcubes,ds.axes,ds.properties)
+end

 function fuzzyfind(s::String, comp::Vector{String})
     sl = lowercase(s)
````
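The `maxsize = maxsize ÷ length(ds.cubes)` line splits the budget with integer division, so the summed per-cube budgets can never exceed the requested total (a little headroom may be left over). A quick check of that arithmetic, assuming a hypothetical three-variable dataset and the 500 MB budget from the docs example:

````julia
# Per-cube budget split as in DiskArrays.cache(ds; maxsize); the dataset
# size here (3 cubes) is an assumption for illustration.
maxsize = 500                 # MB, as in the docs example
ncubes = 3                    # hypothetical number of variables
percube = maxsize ÷ ncubes    # integer division, rounds down
@assert percube == 166
@assert percube * ncubes <= maxsize   # never more than 500 MB in total
````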
