diff --git a/Project.toml b/Project.toml
index fe43535..cfe36f7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.9.6"
 [deps]
 Blosc = "a74b3585-a348-5f62-a45c-50e91977d574"
+CRC32c = "8bf52ea8-c179-5cab-976a-9e18b702a9bc"
 ChunkCodecCore = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1"
 ChunkCodecLibZlib = "4c0bbee4-addc-4d73-81a0-b6caacae83c8"
 ChunkCodecLibZstd = "55437552-ac27-4d47-9aa3-63184e8fd398"
@@ -32,6 +33,7 @@ Blosc = "0.5, 0.6, 0.7"
 ChunkCodecCore = "1"
 ChunkCodecLibZlib = "1"
 ChunkCodecLibZstd = "1"
+CRC32c = "1.10, 1.11"
 DataStructures = "0.17, 0.18, 0.19"
 DateTimes64 = "1"
 DiskArrays = "0.4.2"
diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md
index 6b9f6b3..b42c202 100644
--- a/docs/src/tutorial.md
+++ b/docs/src/tutorial.md
@@ -197,7 +197,7 @@ Order : C
 Read-Only : false
 Compressor : Zarr.BloscCompressor(0, 3, "zstd", 1)
 Filters : nothing
-Store type : Dictionary Storage
+Store type : Zarr.FormattedStore{2, '.', Zarr.DictStore}(Dictionary Storage)
 No. bytes : 400000000
 No. bytes stored : 2412289
 Storage ratio : 165.81761140559857
diff --git a/src/Codecs/Codecs.jl b/src/Codecs/Codecs.jl
new file mode 100644
index 0000000..ec6e620
--- /dev/null
+++ b/src/Codecs/Codecs.jl
@@ -0,0 +1,49 @@
+module Codecs
+
+using JSON: JSON
+
+"""
+    abstract type Codec
+
+The abstract supertype for all Zarr codecs.
+
+## Interface
+
+All subtypes of `Codec` SHALL implement the following methods:
+
+- `zencode(a, c::Codec)`: encode the array `a` using the codec `c`.
+- `zdecode(a, c::Codec, T)`: decode the array `a` using the codec `c`
+  and return an array of type `T`.
+- `JSON.lower(c::Codec)`: return a JSON representation of the codec `c`, which
+  follows the Zarr specification for that codec.
+- `getCodec(::Type{<:Codec}, d::Dict)`: return a codec object from a given
+  dictionary `d` which contains the codec's parameters according to the Zarr spec.
+
+Subtypes of `Codec` MAY also implement the following methods:
+
+- `zencode!(encoded, data, c::Codec)`: encode the array `data` using the
+  codec `c` and store the result in the array `encoded`.
+- `zdecode!(data, encoded, c::Codec)`: decode the array `encoded`
+  using the codec `c` and store the result in the array `data`.
+
+Finally, an entry MUST be added to the `VN.codectypes` dictionary for each codec type, where `N` is the
+Zarr format version.
+The key must be the Zarr specification's name for that codec, and the value is the codec type
+(e.g. `BloscCodec` or `NoCodec`).
+
+For example, the Blosc codec is named "blosc" in the Zarr spec, so the entry for [`BloscCodec`](@ref)
+must be added to `codectypes` as `codectypes["blosc"] = BloscCodec`.
+"""
+abstract type Codec end
+
+zencode(a, c::Codec) = error("Unimplemented")
+zencode!(encoded, data, c::Codec) = error("Unimplemented")
+zdecode(a, c::Codec, T::Type) = error("Unimplemented")
+zdecode!(data, encoded, c::Codec) = error("Unimplemented")
+JSON.lower(c::Codec) = error("Unimplemented")
+getCodec(::Type{<:Codec}, d::Dict) = error("Unimplemented")
+
+include("V3/V3.jl")
+
+end
diff --git a/src/Codecs/V3/V3.jl b/src/Codecs/V3/V3.jl
new file mode 100644
index 0000000..2cf3c58
--- /dev/null
+++ b/src/Codecs/V3/V3.jl
@@ -0,0 +1,531 @@
+module V3Codecs
+
+import ..Codecs: zencode, zdecode, zencode!, zdecode!
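+# Extend the generic codec interface defined in the parent `Codecs` module
+# with Zarr v3 codec implementations.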
+using CRC32c: CRC32c
+using JSON: JSON
+
+abstract type V3Codec{In,Out} end
+# Maps the Zarr v3 name of each codec to its type, e.g. codectypes["blosc"] = BloscCodec
+const codectypes = Dict{String, Type{<:V3Codec}}()
+
+@enum BloscCompressor begin
+    lz4
+    lz4hc
+    blosclz
+    zstd
+    snappy
+    zlib
+end
+
+@enum BloscShuffle begin
+    noshuffle
+    shuffle
+    bitshuffle
+end
+
+struct BloscCodec <: V3Codec{:bytes, :bytes}
+    cname::BloscCompressor
+    clevel::Int64
+    shuffle::BloscShuffle
+    typesize::UInt8
+    blocksize::UInt
+end
+name(::BloscCodec) = "blosc"
+
+struct BytesCodec <: V3Codec{:array, :bytes}
+end
+name(::BytesCodec) = "bytes"
+
+struct CRC32cCodec <: V3Codec{:bytes, :bytes}
+end
+name(::CRC32cCodec) = "crc32c"
+
+struct GzipCodec <: V3Codec{:bytes, :bytes}
+end
+name(::GzipCodec) = "gzip"
+
+function crc32c_stream!(output::IO, input::IO; buffer = Vector{UInt8}(undef, 1024*32))
+    hash::UInt32 = 0x00000000
+    while bytesavailable(input) > 0
+        sized_buffer = @view(buffer[1:min(length(buffer), bytesavailable(input))])
+        read!(input, sized_buffer)
+        write(output, sized_buffer)
+        hash = CRC32c.crc32c(sized_buffer, hash)
+    end
+    return hash
+end
+function zencode!(encoded::Vector{UInt8}, data::Vector{UInt8}, c::CRC32cCodec)
+    output = IOBuffer(encoded, read=false, write=true)
+    input = IOBuffer(data, read=true, write=false)
+    zencode!(output, input, c)
+    return take!(output)
+end
+function zencode!(output::IO, input::IO, c::CRC32cCodec)
+    hash = crc32c_stream!(output, input)
+    write(output, hash)
+    return output
+end
+function zdecode!(data::Vector{UInt8}, encoded::Vector{UInt8}, c::CRC32cCodec)
+    output = IOBuffer(data, read=false, write=true)
+    input = IOBuffer(encoded, read=true, write=false)
+    zdecode!(output, input, c)
+    return take!(output)
+end
+function zdecode!(output::IOBuffer, input::IOBuffer, c::CRC32cCodec)
+    input_vec = take!(input)
+    truncated_input = IOBuffer(@view(input_vec[1:end-4]); read=true, write=false)
+    hash = crc32c_stream!(output, truncated_input)
+    if input_vec[end-3:end] != reinterpret(UInt8, [hash])
+        error("CRC32c hash does not match")
+    end
+    return output
+end
+
+"""
+    ShardingCodec{N}
+
+Sharding codec for Zarr v3. Sharding stores many smaller "inner chunks" together
+in a single shard, along with an index mapping inner chunk coordinates to byte
+ranges within the shard.
+
+# Fields
+- `chunk_shape`: Shape of each inner chunk (NTuple{N,Int})
+- `codecs`: Vector of codecs to apply to inner chunk data (e.g., [BytesCodec(), GzipCodec()])
+- `index_codecs`: Vector of codecs to apply to the index (e.g., [BytesCodec()])
+- `index_location`: Location of index in shard file, either `:start` or `:end`
+
+# Implementation Notes
+Sharding works by:
+1. Splitting a shard's data into inner chunks based on `chunk_shape`
+2. Encoding each inner chunk using the `codecs` pipeline
+3. Creating an index that maps inner chunk coordinates -> (offset, nbytes) within the shard
+4. Encoding the index using `index_codecs`
+5. Writing the shard with the index at `index_location` (start or end)
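+
+Schematically, a shard with `index_location == :end` is laid out as:
+
+```
+| inner chunk 0 bytes | inner chunk 1 bytes | ... | encoded index |
+```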
+
+"""
+struct ShardingCodec{N} <: V3Codec{:array, :bytes}
+    chunk_shape::NTuple{N,Int}    # Shape of each inner chunk
+    codecs::Vector{V3Codec}       # Codecs to apply to inner chunk data
+    index_codecs::Vector{V3Codec} # Codecs to apply to the index
+    index_location::Symbol        # :start or :end
+end
+name(::ShardingCodec) = "sharding_indexed"
+
+"""
+    JSON.lower(c::ShardingCodec)
+
+Serialize ShardingCodec to JSON format for Zarr v3 metadata.
+"""
+function JSON.lower(c::ShardingCodec)
+    return Dict(
+        "name" => "sharding_indexed",
+        "configuration" => Dict(
+            "chunk_shape" => collect(c.chunk_shape),
+            "codecs" => [JSON.lower(codec) for codec in c.codecs],
+            "index_codecs" => [JSON.lower(codec) for codec in c.index_codecs],
+            "index_location" => string(c.index_location)
+        )
+    )
+end
+
+"""
+    getCodec(::Type{ShardingCodec}, d::Dict)
+
+Deserialize ShardingCodec from JSON configuration dict.
+"""
+function getCodec(::Type{ShardingCodec}, d::Dict)
+    config = d["configuration"]
+    N = length(config["chunk_shape"])
+    chunk_shape = NTuple{N,Int}(config["chunk_shape"])
+    codecs = [getCodec(codectypes[codec_dict["name"]], codec_dict) for codec_dict in config["codecs"]]
+    index_codecs = [getCodec(codectypes[codec_dict["name"]], codec_dict) for codec_dict in config["index_codecs"]]
+    index_location = Symbol(get(config, "index_location", "end"))
+    return ShardingCodec{N}(chunk_shape, codecs, index_codecs, index_location)
+end
+
+const MAX_UINT64 = typemax(UInt64)
+
+"""
+    ChunkShardInfo
+
+Information about a chunk's location within a shard.
+"""
+struct ChunkShardInfo
+    offset::UInt64 # Byte offset within shard where chunk begins
+    nbytes::UInt64 # Number of bytes the chunk occupies
+end
+
+ChunkShardInfo() = ChunkShardInfo(MAX_UINT64, MAX_UINT64) # Empty chunk marker
+
+"""
+    ShardIndex{N}
+
+Internal structure representing the shard index.
+Stores chunk location info for an N-dimensional grid of chunks.
+Empty chunks are marked with ChunkShardInfo(MAX_UINT64, MAX_UINT64)
+"""
+struct ShardIndex{N}
+    chunks::Array{ChunkShardInfo, N} # N-dimensional array of chunk info
+end
+
+"""
+    ShardIndex(chunks_per_shard::NTuple{N,Int})
+
+Create an empty shard index with all chunks marked as empty.
+"""
+function ShardIndex(chunks_per_shard::NTuple{N,Int}) where N
+    chunks = fill(ChunkShardInfo(), chunks_per_shard)
+    return ShardIndex{N}(chunks)
+end
+
+"""
+    get_chunk_slice(idx::ShardIndex, chunk_coords::NTuple{N,Int})
+
+Get the byte range (offset, offset+nbytes) for a chunk, or nothing if empty.
+"""
+function get_chunk_slice(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N
+    info = idx.chunks[chunk_coords...]
+
+    if info.offset == MAX_UINT64 && info.nbytes == MAX_UINT64
+        return nothing
+    end
+
+    return (Int(info.offset), Int(info.offset + info.nbytes))
+end
+
+"""
+    set_chunk_slice!(idx::ShardIndex, chunk_coords::NTuple{N,Int}, offset::Int, nbytes::Int)
+
+Set the byte range for a chunk in the index.
+"""
+function set_chunk_slice!(idx::ShardIndex, chunk_coords::NTuple{N,Int}, offset::Int, nbytes::Int) where N
+    idx.chunks[chunk_coords...] = ChunkShardInfo(UInt64(offset), UInt64(nbytes))
+end
+
+"""
+    set_chunk_empty!(idx::ShardIndex, chunk_coords::NTuple{N,Int})
+
+Mark a chunk as empty in the index.
+"""
+function set_chunk_empty!(idx::ShardIndex, chunk_coords::NTuple{N,Int}) where N
+    idx.chunks[chunk_coords...] = ChunkShardInfo()
+end
+
+"""
+    calculate_chunks_per_shard(shard_shape::NTuple{N,Int}, chunk_shape::NTuple{N,Int})
+
+Calculate how many inner chunks fit in each shard dimension.
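+
+For example, assuming the inner chunk shape evenly divides the shard, a 64×64
+shard with 32×32 inner chunks yields a 2×2 grid of inner chunks:
+
+```
+julia> calculate_chunks_per_shard((64, 64), (32, 32))
+(2, 2)
+```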
+""" +function calculate_chunks_per_shard(shard_shape::NTuple{N,Int}, chunk_shape::NTuple{N,Int}) where N + return ntuple(i -> div(shard_shape[i], chunk_shape[i]), N) +end + +""" + get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, shard_shape::NTuple{N,Int}) + +Get the array slice ranges for a chunk within a shard. +chunk_coords are 1-based indices. +""" +function get_chunk_slice_in_shard(chunk_coords::NTuple{N,Int}, chunk_shape::NTuple{N,Int}, shard_shape::NTuple{N,Int}) where N + return ntuple(N) do i + start_idx = (chunk_coords[i] - 1) * chunk_shape[i] + 1 + end_idx = min(chunk_coords[i] * chunk_shape[i], shard_shape[i]) + start_idx:end_idx + end +end + +""" + apply_codec_chain(data, codecs::Vector{V3Codec}) + +Apply codec pipeline in forward order (encoding). +""" +function apply_codec_chain(data, codecs::Vector{V3Codec}) + result = data + for codec in codecs + result = zencode(result, codec) + end + return result +end + +""" + reverse_codec_chain(data, codecs::Vector{V3Codec}) + +Apply codec pipeline in reverse order (decoding). +""" +function reverse_codec_chain(data, codecs::Vector{V3Codec}) + result = data + for codec in reverse(codecs) + result = zdecode(result, codec) + end + return result +end + +""" + encode_shard_index(index::ShardIndex, index_codecs::Vector{V3Codec}) + +Encode the shard index using the index codec pipeline. + +Per Zarr v3 spec, the index is linearized in C-order (row-major) with alternating +offset/nbytes values: [chunk_0_offset, chunk_0_nbytes, chunk_1_offset, chunk_1_nbytes, ...] +``` +""" +function encode_shard_index(index::ShardIndex{N}, index_codecs::Vector{V3Codec}) where N + # Pre-allocate buffer for index data + n_chunks = length(index.chunks) + index_data = Vector{UInt64}(undef, 2 * n_chunks) + + # Iterate in C-order (row-major) and interleave offset/nbytes + idx = 1 + for cart_idx in CartesianIndices(index.chunks) + info = index.chunks[cart_idx] + index_data[idx] = info.offset + index_data[idx + 1] = info.nbytes + idx += 2 + end + + # Convert to bytes + index_bytes = reinterpret(UInt8, index_data) + + # Apply index codecs + encoded = apply_codec_chain(index_bytes, index_codecs) + + return encoded +end + +""" + decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) + +Decode the shard index from bytes. + +The bytes are in C-order with alternating offset/nbytes: +[offset0, nbytes0, offset1, nbytes1, ...] +""" +function decode_shard_index(index_bytes::Vector{UInt8}, chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) where N + # Decode using index codecs (in reverse order) + decoded_bytes = reverse_codec_chain(index_bytes, index_codecs) + + # Expected size: 16 bytes (2 * UInt64) per chunk + n_chunks = prod(chunks_per_shard) + expected_length = n_chunks * 2 * sizeof(UInt64) + + if length(decoded_bytes) != expected_length + throw(DimensionMismatch("Index size mismatch: expected $expected_length, got $(length(decoded_bytes))")) + end + + # Reinterpret as UInt64 array: [offset1, nbytes1, offset1, nbytes1, ...] 
+    index_data = reinterpret(UInt64, decoded_bytes)
+
+    # Reconstruct the N-dimensional array of ChunkShardInfo
+    chunks = Array{ChunkShardInfo, N}(undef, chunks_per_shard)
+
+    idx = 1
+    for cart_idx in CartesianIndices(chunks)
+        offset = index_data[idx]
+        nbytes = index_data[idx + 1]
+        chunks[cart_idx] = ChunkShardInfo(offset, nbytes)
+        idx += 2
+    end
+
+    return ShardIndex{N}(chunks)
+end
+
+"""
+    compute_encoded_index_size(chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec})
+
+Compute the byte size of the encoded shard index.
+Per spec: "The size of the index can be determined by applying c.compute_encoded_size
+for each index codec recursively. The initial size is the byte size of the index array,
+i.e. 16 * chunks per shard."
+"""
+function compute_encoded_index_size(chunks_per_shard::NTuple{N,Int}, index_codecs::Vector{V3Codec}) where N
+    # The raw index occupies 16 bytes (2 * UInt64) per chunk. Rather than
+    # tracking each codec's size transformation, encode an empty index and
+    # measure it. This is only valid for size-deterministic index codecs
+    # such as `bytes` and `crc32c`.
+    index = ShardIndex(chunks_per_shard)
+    encoded = encode_shard_index(index, index_codecs)
+
+    return length(encoded)
+end
+
+"""
+    zencode!(encoded::Vector{UInt8}, data::AbstractArray, c::ShardingCodec)
+
+Encode array data using sharding codec following Zarr v3 spec.
+
+Per spec: "In the sharding_indexed binary format, inner chunks are written successively
+in a shard, where unused space between them is allowed, followed by an index referencing them."
+"""
+function zencode!(encoded::Vector{UInt8}, data::AbstractArray, c::ShardingCodec{N}) where N
+    shard_shape = size(data)
+    chunks_per_shard = calculate_chunks_per_shard(shard_shape, c.chunk_shape)
+
+    # Create empty index
+    index = ShardIndex(chunks_per_shard)
+
+    # Buffers for encoded chunks
+    chunk_buffers = Vector{UInt8}[]
+    current_offset = 0
+
+    # Process chunks in a fixed traversal order.
+    # Per spec: "The actual order of the chunk content is not fixed"
+    for cart_idx in CartesianIndices(chunks_per_shard)
+        chunk_coords = Tuple(cart_idx)
+
+        # Extract chunk data from shard
+        slice_ranges = get_chunk_slice_in_shard(chunk_coords, c.chunk_shape, shard_shape)
+        chunk_data = data[slice_ranges...]
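+        # NOTE: indexing with `slice_ranges...` copies the inner chunk out of
+        # the shard; a `@view` would avoid the copy, but some codecs may
+        # require a contiguous `Array` input.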
+ + # Encode chunk using codec pipeline + encoded_chunk = apply_codec_chain(chunk_data, c.codecs) + + # Skip if chunk is empty (no bytes) + if isempty(encoded_chunk) + set_chunk_empty!(index, chunk_coords) + continue + end + + nbytes = length(encoded_chunk) + + # Record offset and length in index + set_chunk_slice!(index, chunk_coords, current_offset, nbytes) + + push!(chunk_buffers, encoded_chunk) + current_offset += nbytes + end + + # Encode the index + encoded_index = encode_shard_index(index, c.index_codecs) + index_size = length(encoded_index) + + # If index is at start, adjust all offsets to account for index size + if c.index_location == :start + # Add index_size to all non-empty chunk offsets + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + info = index.chunks[cart_idx] + if info.offset != MAX_UINT64 + index.chunks[cart_idx] = ChunkShardInfo(info.offset + index_size, info.nbytes) + end + end + # Re-encode index with corrected offsets + encoded_index = encode_shard_index(index, c.index_codecs) + end + + # If all chunks are empty, return empty buffer (no shard) + if isempty(chunk_buffers) + resize!(encoded, 0) + return encoded + end + + # Assemble final shard: [index] + chunks or chunks + [index] + total_size = (c.index_location == :start ? index_size : 0) + + current_offset + + (c.index_location == :end ? index_size : 0) + + resize!(encoded, total_size) + output = IOBuffer(encoded, write=true) + + if c.index_location == :start + write(output, encoded_index) + for buf in chunk_buffers + write(output, buf) + end + else # :end + for buf in chunk_buffers + write(output, buf) + end + write(output, encoded_index) + end + + return encoded +end + +""" + zdecode!(data::AbstractArray, encoded::Vector{UInt8}, c::ShardingCodec) + +Decode sharded data back to array following Zarr v3 spec. + +Per spec: "A simple implementation to decode inner chunks in a shard would +(a) read the entire value from the store into a byte buffer, +(b) parse the shard index from the beginning or end of the buffer and +(c) cut out the relevant bytes that belong to the requested chunk." +""" +function zdecode!(data::AbstractArray, encoded::Vector{UInt8}, c::ShardingCodec{N}) where N + # Handle empty shard (no data) + if isempty(encoded) + fill!(data, zero(eltype(data))) # Fill with zeros (or should use fill_value from spec) + return data + end + + shard_shape = size(data) + chunks_per_shard = calculate_chunks_per_shard(shard_shape, c.chunk_shape) + + # Compute encoded index size + index_size = compute_encoded_index_size(chunks_per_shard, c.index_codecs) + + # Extract index bytes based on location + if c.index_location == :start + index_bytes = encoded[1:index_size] + chunk_data_offset = index_size + else # :end + index_bytes = encoded[end-index_size+1:end] + chunk_data_offset = 0 + end + + # Decode the index + index = decode_shard_index(index_bytes, chunks_per_shard, c.index_codecs) + + # Decode each chunk and place into output array + for cart_idx in CartesianIndices(chunks_per_shard) + chunk_coords = Tuple(cart_idx) + + # Get chunk byte range from index + chunk_slice = get_chunk_slice(index, chunk_coords) + + # Get array slice for this chunk + array_slice = get_chunk_slice_in_shard(chunk_coords, c.chunk_shape, shard_shape) + + if chunk_slice === nothing + # Empty chunk - fill with zeros (or fill_value) + # Per spec: "Empty inner chunks are interpreted as being filled with the fill value" + data[array_slice...] 
.= zero(eltype(data))
+            continue
+        end
+
+        # Extract chunk bytes
+        # Offsets in index are relative to start of chunk data
+        offset_start, offset_end = chunk_slice
+
+        # Adjust for where chunk data begins in the shard
+        byte_start = chunk_data_offset + offset_start + 1 # Julia 1-based indexing
+        byte_end = chunk_data_offset + offset_end
+
+        encoded_chunk = encoded[byte_start:byte_end]
+
+        # Decode chunk using codec pipeline (in reverse)
+        decoded_chunk = reverse_codec_chain(encoded_chunk, c.codecs)
+
+        # Place decoded chunk into output array
+        expected_shape = length.(array_slice)
+        data[array_slice...] = reshape(decoded_chunk, expected_shape)
+    end
+
+    return data
+end
+
+struct TransposeCodec <: V3Codec{:array, :array}
+end
+name(::TransposeCodec) = "transpose"
+
+
+end
diff --git a/src/Compressors/Compressors.jl b/src/Compressors/Compressors.jl
index 1854128..c647eff 100644
--- a/src/Compressors/Compressors.jl
+++ b/src/Compressors/Compressors.jl
@@ -49,10 +49,13 @@ const compressortypes = Dict{Union{String,Nothing}, Type{<: Compressor}}()
 include("blosc.jl")
 include("zlib.jl")
 include("zstd.jl")
+include("v3.jl")
 
 # ## Fallback definitions for the compressor interface
 # Define fallbacks and generic methods for the compressor interface
-getCompressor(compdict::Dict) = getCompressor(compressortypes[compdict["id"]],compdict)
+getCompressor(compdict::Dict) = haskey(compdict, "id") ?
+    getCompressor(compressortypes[compdict["id"]], compdict) :
+    getCompressor(compressortypes[compdict["name"]], compdict["configuration"])
 
 getCompressor(::Nothing) = NoCompressor()
 
 # Compression when no filter is given
@@ -104,4 +107,4 @@ end
 
 JSON.lower(::NoCompressor) = nothing
 
-compressortypes[nothing] = NoCompressor
\ No newline at end of file
+compressortypes[nothing] = NoCompressor
diff --git a/src/Compressors/v3.jl b/src/Compressors/v3.jl
new file mode 100644
index 0000000..fa8c1ef
--- /dev/null
+++ b/src/Compressors/v3.jl
@@ -0,0 +1,58 @@
+"""
+    Compressor_v3{C <: Compressor} <: Compressor
+
+Wrapper indicating the Zarr v3 (codec) representation of a compressor.
+"""
+struct Compressor_v3{C} <: Compressor
+    parent::C
+end
+Base.parent(c::Compressor_v3) = c.parent
+
+function zuncompress(a, z::Compressor_v3, T)
+    zuncompress(a, parent(z), T)
+end
+
+function zuncompress!(data::DenseArray, compressed, z::Compressor_v3)
+    zuncompress!(data, compressed, parent(z))
+end
+
+function zcompress(a, z::Compressor_v3)
+    zcompress(a, parent(z))
+end
+
+
+function JSON.lower(c::Compressor_v3{BloscCompressor})
+    p = parent(c)
+    return Dict(
+        "name" => "blosc",
+        "configuration" => Dict(
+            "cname" => p.cname,
+            "clevel" => p.clevel,
+            "shuffle" => p.shuffle,
+# TODO: Evaluate if we can encode typesize
+#            "typesize" => p.typesize,
+            "blocksize" => p.blocksize
+        )
+    )
+end
+
+function JSON.lower(c::Compressor_v3{ZlibCompressor})
+    p = parent(c)
+    return Dict(
+        "name" => "gzip",
+        "configuration" => Dict(
+            "level" => p.config.level
+        )
+    )
+end
+
+function JSON.lower(c::Compressor_v3{ZstdCompressor})
+    p = parent(c)
+    return Dict(
+        "name" => "zstd",
+        "configuration" => Dict(
+            "level" => p.config.compressionlevel,
+            "checksum" => p.config.checksum
+        )
+    )
+end
diff --git a/src/Compressors/zstd.jl b/src/Compressors/zstd.jl
index c0e0254..6cd80a0 100644
--- a/src/Compressors/zstd.jl
+++ b/src/Compressors/zstd.jl
@@ -4,6 +4,7 @@
 This file implements a Zstd compressor via ChunkCodecLibZstd.jl.
 =#
 
+
 using ChunkCodecLibZstd: ZstdEncodeOptions
 using ChunkCodecCore: encode, decode, decode!
@@ -51,4 +52,4 @@ function JSON.lower(z::ZstdCompressor)
     end
 end
 
-Zarr.compressortypes["zstd"] = ZstdCompressor
\ No newline at end of file
+Zarr.compressortypes["zstd"] = ZstdCompressor
diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl
index c113b05..b5816b2 100644
--- a/src/Storage/Storage.jl
+++ b/src/Storage/Storage.jl
@@ -123,15 +123,19 @@ function writeattrs(s::AbstractStore, p, att::Dict; indent_json::Bool= false)
     att
 end
 
-is_zgroup(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup"))
-is_zarray(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray"))
+is_zarr3(s::AbstractStore, p) = isinitialized(s,_concatpath(p,"zarr.json"))
+is_zarr2(s::AbstractStore, p) = is_z2array(s, p) || is_z2group(s,p)
+is_zgroup(s::AbstractStore, p) = is_z2group(s,p)
+is_zarray(s::AbstractStore, p) = is_z2array(s,p)
+is_z2group(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zgroup"))
+is_z2array(s::AbstractStore, p) = isinitialized(s,_concatpath(p,".zarray"))
 
 isinitialized(s::AbstractStore, p, i::CartesianIndex)=isinitialized(s,p,citostring(i))
 isinitialized(s::AbstractStore, p, i) = isinitialized(s,_concatpath(p,i))
 isinitialized(s::AbstractStore, i) = s[i] !== nothing
 
 getmetadata(s::AbstractStore, p,fill_as_missing) = Metadata(String(maybecopy(s[p,".zarray"])),fill_as_missing)
-function writemetadata(s::AbstractStore, p, m::Metadata; indent_json::Bool= false)
+function writemetadata(s::AbstractStore, p, m::AbstractMetadata; indent_json::Bool= false)
     met = IOBuffer()
 
     if indent_json
@@ -213,6 +217,7 @@ isemptysub(s::AbstractStore, p) = isempty(subkeys(s,p)) && isempty(subdirs(s,p))
 storageregexlist = Pair[]
 
 push!(storageregexlist, r"^s3://" => S3Store)
+include("formattedstore.jl")
 include("directorystore.jl")
 include("dictstore.jl")
 include("gcstore.jl")
diff --git a/src/Storage/formattedstore.jl b/src/Storage/formattedstore.jl
new file mode 100644
index 0000000..782c180
--- /dev/null
+++ b/src/Storage/formattedstore.jl
@@ -0,0 +1,230 @@
+# Default Zarr version
+const DV = 2
+
+# Default Zarr v2 separator
+const DS2 = '.'
+# Default Zarr v3 separator
+const DS3 = '/'
+
+default_sep(version) = version == 2 ? DS2 :
+                       version == 3 ? DS3 :
+                       error("Unknown version: $version")
+const DS = default_sep(DV)
+
+# Chunk Key Encodings for Zarr v3
+# A Char is the separator for the default chunk key encoding
+abstract type ChunkKeyEncoding end
+struct V2ChunkKeyEncoding{SEP} <: ChunkKeyEncoding end
+separator(c::Char) = c
+separator(v2cke::V2ChunkKeyEncoding{SEP}) where SEP = SEP
+
+"""
+    FormattedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore
+
+FormattedStore wraps an AbstractStore to indicate a specific Zarr format.
+The path of a chunk depends on the version and the chunk key encoding.
+
+# Type Parameters
+
+- V: Zarr format version
+- SEP: Chunk key encoding or dimension separator.
+  SEP may be a `Char` or a `ChunkKeyEncoding` such as `V2ChunkKeyEncoding`.
+- STORE: Type of the wrapped AbstractStore
+
+# Chunk Path Formats
+
+## Zarr version 2
+
+### '.' dimension separator (default)
+
+Chunks are encoded as "1.2.3"
+
+### '/' dimension separator
+
+Chunks are encoded as "1/2/3"
+
+## Zarr version 3
+
+### '/' dimension separator (default)
+
+Chunks are encoded as "c/1/2/3"
+
+### '.' dimension separator
+
+Chunks are encoded as "c.1.2.3"
+
+### V2ChunkKeyEncoding{SEP}
+
+See Zarr version 2
+"""
+struct FormattedStore{V,SEP,STORE <: AbstractStore} <: AbstractStore
+    parent::STORE
+end
+FormattedStore(args...) = FormattedStore{DV,DS}(args...)
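+# A sketch of the constructor cascade below (assuming a `DictStore` backend):
+#   FormattedStore(DictStore())         isa FormattedStore{2, '.', DictStore}
+#   FormattedStore{3}(DictStore())      isa FormattedStore{3, '/', DictStore}
+#   FormattedStore{3, '.'}(DictStore()) isa FormattedStore{3, '.', DictStore}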
+FormattedStore(s::FormattedStore) = s
+FormattedStore{V}(args...) where V = FormattedStore{V, default_sep(V)}(args...)
+FormattedStore{V}(s::FormattedStore{<:Any,S}) where {V,S} = FormattedStore{V, S}(s)
+FormattedStore{<: Any, S}(args...) where S = FormattedStore{DV, S}(args...)
+FormattedStore{<: Any, S}(s::FormattedStore{V}) where {V,S} = FormattedStore{V, S}(s)
+function FormattedStore{V,S}(store::AbstractStore) where {V,S}
+    return FormattedStore{V,S,typeof(store)}(store)
+end
+function FormattedStore{V,S}(store::FormattedStore) where {V,S}
+    p = parent(store)
+    return FormattedStore{V,S,typeof(p)}(p)
+end
+
+Base.parent(store::FormattedStore) = store.parent
+
+@inline citostring(i::CartesianIndex, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$sep" : "" ) * join(reverse((i - oneunit(i)).I), sep)
+@inline citostring(::CartesianIndex{0}, version::Int, sep::Char=default_sep(version)) = (version == 3 ? "c$(sep)0" : "0" )
+@inline citostring(i::CartesianIndex, ::Int, ::Type{V2ChunkKeyEncoding{S}}) where S = citostring(i, 2, S)
+@inline citostring(i::CartesianIndex, v::Int, cke::V2ChunkKeyEncoding) = citostring(i, v, typeof(cke))
+citostring(i::CartesianIndex, s::FormattedStore{V, S}) where {V,S} = citostring(i, V, S)
+
+Base.getindex(s::FormattedStore, p, i::CartesianIndex) = s[p, citostring(i,s)]
+Base.delete!(s::FormattedStore, p, i::CartesianIndex) = delete!(s, p, citostring(i,s))
+Base.setindex!(s::FormattedStore, v, p, i::CartesianIndex) = s[p, citostring(i,s)]=v
+
+isinitialized(s::FormattedStore, p, i::CartesianIndex) = isinitialized(s,p,citostring(i, s))
+
+# Forward the AbstractStore interface to the wrapped parent store:
+# - [`storagesize(d::AbstractStore, p::AbstractString)`](@ref storagesize)
+# - [`subdirs(d::AbstractStore, p::AbstractString)`](@ref subdirs)
+# - [`subkeys(d::AbstractStore, p::AbstractString)`](@ref subkeys)
+# - [`isinitialized(d::AbstractStore, p::AbstractString)`](@ref isinitialized)
+# - [`storefromstring(::Type{<: AbstractStore}, s, _)`](@ref storefromstring)
+# - `Base.getindex(d::AbstractStore, i::AbstractString)`: return the data stored in key `i` as a Vector{UInt8}
+# - `Base.setindex!(d::AbstractStore, v, i::AbstractString)`: write the values in `v` to the key `i` of the given store `d`
+
+storagesize(d::FormattedStore, p::AbstractString) = storagesize(parent(d), p)
+subdirs(d::FormattedStore, p::AbstractString) = subdirs(parent(d), p)
+subkeys(d::FormattedStore, p::AbstractString) = subkeys(parent(d), p)
+isinitialized(d::FormattedStore, p::AbstractString) = isinitialized(parent(d), p)
+function storefromstring(::Type{FormattedStore{<: Any, <: Any, STORE}}, s, c) where STORE
+    store, path = storefromstring(STORE, s, c)
+    return FormattedStore{DV,DS}(store), path
+end
+function storefromstring(::Type{FormattedStore{V,S}}, s, c) where {V,S}
+    store, path = storefromstring(s, c)
+    return FormattedStore{V,S}(store), path
+end
+function storefromstring(::Type{FormattedStore{V,S,STORE}}, s, c) where {V,S,STORE}
+    store, path = storefromstring(STORE, s, c)
+    return FormattedStore{V,S}(store), path
+end
+Base.getindex(d::FormattedStore, i::AbstractString) = getindex(parent(d), i)
+Base.setindex!(d::FormattedStore, v, i::AbstractString) = setindex!(parent(d), v, i)
+Base.delete!(d::FormattedStore, i::AbstractString) = delete!(parent(d), i)
+
+
+function Base.getproperty(store::FormattedStore{V,S}, sym::Symbol) where {V,S}
+    if sym == :dimension_separator
+        return S
+    elseif sym == :zarr_format
+        return V
+    elseif sym ∈ propertynames(getfield(store, :parent))
+        # Support forwarding of properties to parent
+        return getproperty(store.parent, sym)
+    else
+        getfield(store, sym)
+    end
+end
+function Base.propertynames(store::FormattedStore)
+    return (:dimension_separator, :zarr_format, fieldnames(typeof(store))..., propertynames(store.parent)...)
+end
+
+
+"""
+    Zarr.set_dimension_separator(store::FormattedStore{V}, sep::Char)::FormattedStore{V,sep}
+
+Returns a FormattedStore of the same type with the same `zarr_format` parameter, `V`,
+but with a dimension separator of `sep`. Note that this does not mutate the original store.
+
+# Examples
+
+```
+julia> Zarr.set_dimension_separator(Zarr.FormattedStore{2, '.'}(Zarr.DictStore()), '/') |> typeof
+Zarr.FormattedStore{2, '/', Zarr.DictStore}
+```
+
+"""
+function set_dimension_separator(store::FormattedStore{V}, sep::Char) where V
+    return FormattedStore{V,sep}(store)
+end
+function set_dimension_separator(store::AbstractStore, sep::Char)
+    return FormattedStore{<: Any,sep}(store)
+end
+
+"""
+    set_zarr_format(::FormattedStore{<: Any, S}, zarr_format::Int)::FormattedStore{zarr_format,S}
+
+Returns a FormattedStore of the same type with the same `dimension_separator` parameter, `S`,
+but with the specified `zarr_format` parameter. Note that this does not mutate the original store.
+
+# Examples
+
+```
+julia> Zarr.set_zarr_format(Zarr.FormattedStore{2, '.'}(Zarr.DictStore()), 3) |> typeof
+Zarr.FormattedStore{3, '.', Zarr.DictStore}
+```
+
+"""
+function set_zarr_format(store::FormattedStore{<: Any, S}, zarr_format::Int) where S
+    return FormattedStore{zarr_format,S}(store)
+end
+function set_zarr_format(store::AbstractStore, zarr_format::Int)
+    return FormattedStore{zarr_format}(store)
+end
+
+dimension_separator(::AbstractStore) = DS
+dimension_separator(::FormattedStore{<: Any,S}) where S = S
+zarr_format(::AbstractStore) = DV
+zarr_format(::FormattedStore{V}) where V = V
+
+# Check for zarr.json first so a missing file returns false rather than erroring
+is_zgroup(s::FormattedStore{3}, p) =
+    isinitialized(s,_concatpath(p,"zarr.json")) && is_zgroup(s, p, getmetadata(s, p, false))
+is_zgroup(s::FormattedStore{3}, p, metadata) = metadata.node_type == "group"
+is_zarray(s::FormattedStore{3}, p) =
+    isinitialized(s,_concatpath(p,"zarr.json")) && is_zarray(s, p, getmetadata(s, p, false))
+is_zarray(s::FormattedStore{3}, p, metadata) = metadata.node_type == "array"
+
+getmetadata(s::FormattedStore{3}, p,fill_as_missing) = Metadata(String(maybecopy(s[p,"zarr.json"])),fill_as_missing)
+function writemetadata(s::FormattedStore{3}, p, m::AbstractMetadata; indent_json::Bool= false)
+    met = IOBuffer()
+
+    if indent_json
+        JSON.print(met,m,4)
+    else
+        JSON.print(met,m)
+    end
+
+    s[p,"zarr.json"] = take!(met)
+    m
+end
+
+function getattrs(s::FormattedStore{3}, p)
+    md = s[p,"zarr.json"]
+    if md === nothing
+        error("zarr.json not found")
+    else
+        md = JSON.parse(replace(String(maybecopy(md)),": NaN,"=>": \"NaN\","))
+        return get(md, "attributes", Dict{String, Any}())
+    end
+end
+
+function writeattrs(s::FormattedStore{3}, p, att::Dict; indent_json::Bool= false)
+    # This is messy: we need to open zarr.json and replace the attributes section
+    md = s[p,"zarr.json"]
+    if md === nothing
+        error("zarr.json not found")
+    else
+        md = JSON.parse(replace(String(maybecopy(md)),": NaN,"=>": \"NaN\","))
+    end
+    md = Dict(md)
+    md["attributes"] = att
+
+    b = IOBuffer()
+
+    if indent_json
+        JSON.print(b,md,4)
+    else
+        JSON.print(b,md)
+    end
+
+    s[p,"zarr.json"] = take!(b)
+    att
+end
diff --git a/src/Storage/http.jl b/src/Storage/http.jl
index 9b68cb1..980284f 100644
--- a/src/Storage/http.jl
+++ b/src/Storage/http.jl
@@ -13,8 +13,8 @@ python package.
In case you experience performance issues, one can try to use struct HTTPStore <: AbstractStore url::String allowed_codes::Set{Int} + HTTPStore(url, allowed_codes = Set((404,))) = new(url, allowed_codes) end -HTTPStore(url) = HTTPStore(url,Set((404,))) function Base.getindex(s::HTTPStore, k::String) r = HTTP.request("GET",string(s.url,"/",k),status_exception = false,socket_type_tls=OpenSSL.SSLStream) @@ -39,7 +39,21 @@ end push!(storageregexlist,r"^https://"=>HTTPStore) push!(storageregexlist,r"^http://"=>HTTPStore) -storefromstring(::Type{<:HTTPStore}, s,_) = ConsolidatedStore(HTTPStore(s),""),"" +function storefromstring(::Type{<:HTTPStore}, s,_) + http_store = HTTPStore(s) + try + if http_store["", ".zmetadata"] !== nothing + http_store = ConsolidatedStore(http_store,"") + end + if is_zarray(http_store, "") + meta = getmetadata(http_store, "", false) + http_store = FormattedStore{meta.zarr_format, meta.dimension_separator}(http_store) + end + catch err + @warn exception=err "Additional metadata was not available for HTTPStore." + end + return http_store,"" +end """ missing_chunk_return_code!(s::HTTPStore, code::Union{Int,AbstractVector{Int}}) diff --git a/src/ZArray.jl b/src/ZArray.jl index 95035d6..aca7ac3 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -33,7 +33,7 @@ Base.IndexStyle(::Type{<:SenMissArray})=Base.IndexLinear() # Currently this is not an AbstractArray, because indexing single elements is # would be really slow, although most AbstractArray interface functions are implemented struct ZArray{T, N, C<:Compressor, S<:AbstractStore} <: AbstractDiskArray{T,N} - metadata::Metadata{T, N, C} + metadata::AbstractMetadata{T, N, C} storage::S path::String attrs::Dict @@ -42,20 +42,20 @@ end Base.eltype(::ZArray{T}) where {T} = T Base.ndims(::ZArray{<:Any,N}) where {N} = N -Base.size(z::ZArray) = z.metadata.shape[] -function Base.size(z::ZArray,i) +Base.size(z::ZArray{<:Any,N}) where {N} = z.metadata.shape[]::NTuple{N, Int} +function Base.size(z::ZArray{<:Any,N}, i::Integer) where {N} len = length(z.metadata.shape[]) if 0 < i <= len - z.metadata.shape[][i] + z.metadata.shape[][i]::Int elseif i > len 1 else error("arraysize: dimension out of range") end end -Base.length(z::ZArray) = prod(z.metadata.shape[]) -Base.lastindex(z::ZArray,n) = size(z,n) -Base.lastindex(z::ZArray{<:Any,1}) = size(z,1) +Base.length(z::ZArray) = prod(z.metadata.shape[])::Int +Base.lastindex(z::ZArray{<:Any,N}, n::Integer) where {N} = size(z, n)::Int +Base.lastindex(z::ZArray{<:Any,1}) = size(z, 1)::Int function Base.show(io::IO,z::ZArray) print(io, "ZArray{", eltype(z) ,"} of size ",join(string.(size(z)), " x ")) @@ -312,6 +312,7 @@ Creates a new empty zarr array with element type `T` and array dimensions `dims` * `path=""` directory name to store a persistent array. 
If left empty, an in-memory array will be created * `name=""` name of the zarr array, defaults to the directory name +* `zarr_format`=$(DV) Zarr format version (2 or 3) * `storagetype` determines the storage to use, current options are `DirectoryStore` or `DictStore` * `chunks=dims` size of the individual array chunks, must be a tuple of length `length(dims)` * `fill_value=nothing` value to represent missing values @@ -321,23 +322,33 @@ Creates a new empty zarr array with element type `T` and array dimensions `dims` * `attrs=Dict()` a dict containing key-value pairs with metadata attributes associated to the array * `writeable=true` determines if the array is opened in read-only or write mode * `indent_json=false` determines if indents are added to format the json files `.zarray` and `.zattrs`. This makes them more readable, but increases file size. +* `dimension_separator='.'` sets how chunks are encoded. The Zarr v2 default is '.' such that the first 3D chunk would be `0.0.0`. The Zarr v3 default is `/`. """ function zcreate(::Type{T}, dims::Integer...; name="", path=nothing, + zarr_format=DV, + dimension_separator=default_sep(zarr_format), kwargs... ) where T + + if dimension_separator isa AbstractString + # Convert AbstractString to Char + dimension_separator = only(dimension_separator) + end + if path===nothing - store = DictStore() + store = FormattedStore{zarr_format, dimension_separator}(DictStore()) else - store = DirectoryStore(joinpath(path,name)) + store = FormattedStore{zarr_format, dimension_separator}(DirectoryStore(joinpath(path,name))) end - zcreate(T, store, dims...; kwargs...) + zcreate(T, store, dims...; zarr_format, kwargs...) end function zcreate(::Type{T},storage::AbstractStore, dims...; path = "", + zarr_format = DV, chunks=dims, fill_value=nothing, fill_as_missing=false, @@ -345,24 +356,36 @@ function zcreate(::Type{T},storage::AbstractStore, filters = filterfromtype(T), attrs=Dict(), writeable=true, - indent_json=false - ) where T + indent_json=false, + dimension_separator=nothing + ) where {T} + + if isnothing(dimension_separator) + dimension_separator = Zarr.dimension_separator(storage) + elseif dimension_separator != Zarr.dimension_separator(storage) + error("The dimension separator keyword value, $dimension_separator, + must agree with the dimension separator type parameter, $(Zarr.dimension_separator(storage))") + end length(dims) == length(chunks) || throw(DimensionMismatch("Dims must have the same length as chunks")) N = length(dims) C = typeof(compressor) - T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing} - metadata = Metadata{T2, N, C, typeof(filters)}( - 2, - dims, - chunks, - typestr(T), - compressor, - fill_value, - 'C', - filters, + + # Create a dummy array to use with Metadata constructor + # This allows us to leverage the multiple dispatch in Metadata constructors + dummy_array = Array{T,N}(undef, dims...) 
+ metadata = Metadata(dummy_array, chunks; + zarr_format=zarr_format, + compressor=compressor, + fill_value=fill_value, + filters=filters, + fill_as_missing=fill_as_missing, + dimension_separator=dimension_separator ) + # Extract the element type from the metadata (handles T2 calculation) + T2 = eltype(metadata) + isemptysub(storage,path) || error("$storage $path is not empty") writemetadata(storage, path, metadata, indent_json=indent_json) diff --git a/src/ZGroup.jl b/src/ZGroup.jl index be2b0d1..031d33f 100644 --- a/src/ZGroup.jl +++ b/src/ZGroup.jl @@ -20,10 +20,21 @@ function ZGroup(s::T,mode="r",path="";fill_as_missing=false) where T <: Abstract for d in subdirs(s,path) dshort = split(d,'/')[end] - m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) - if isa(m, ZArray) + subpath = _concatpath(path,dshort) + if is_zarr2(s, subpath) + # check for zarr2 first + elseif is_zarr3(s, subpath) + s = set_zarr_format(s, 3) + end + if is_zarray(s, subpath) + meta = getmetadata(s, subpath, false) + if dimension_separator(s) != meta.dimension_separator + s = set_dimension_separator(s, meta.dimension_separator) + end + m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) arrays[dshort] = m - elseif isa(m, ZGroup) + elseif is_zgroup(s, subpath) + m = zopen_noerr(s,mode,path=_concatpath(path,dshort),fill_as_missing=fill_as_missing) groups[dshort] = m end end @@ -37,9 +48,9 @@ end Works like `zopen` with the single difference that no error is thrown when the path or store does not point to a valid zarr array or group, but nothing -is returned instead. +is returned instead. """ -function zopen_noerr(s::AbstractStore, mode="r"; +function zopen_noerr(s::AbstractStore, mode="r"; consolidated = false, path="", lru = 0, @@ -116,8 +127,21 @@ function storefromstring(s, create=true) return storefromstring(t,s,create) end end - if create || isdir(s) - return DirectoryStore(s), "" + if create + return FormattedStore(DirectoryStore(s)), "" + elseif isdir(s) + # parse metadata to determine store kind + temp_store = DirectoryStore(s) + if is_zarr3(temp_store, "") + temp_store = set_zarr_format(temp_store, 3) + end + if is_zarray(temp_store, "") + meta = getmetadata(temp_store, "", false) + store = FormattedStore{meta.zarr_format, meta.dimension_separator}(temp_store) + else + store = FormattedStore(temp_store) + end + return store, "" else throw(ArgumentError("Path $s is not a directory.")) end @@ -129,7 +153,7 @@ end Create a new zgroup in the store `s` """ function zgroup(s::AbstractStore, path::String=""; attrs=Dict(), indent_json::Bool= false) - d = Dict("zarr_format"=>2) + d = Dict("zarr_format"=>DV) isemptysub(s, path) || error("Store is not empty") b = IOBuffer() diff --git a/src/Zarr.jl b/src/Zarr.jl index dbdeb9a..1783bdf 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -4,7 +4,9 @@ import JSON import Blosc include("metadata.jl") +include("metadata3.jl") include("Compressors/Compressors.jl") +include("Codecs/Codecs.jl") include("Storage/Storage.jl") include("Filters/Filters.jl") include("ZArray.jl") diff --git a/src/metadata.jl b/src/metadata.jl index 17e48c4..ae01ae5 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -90,12 +90,25 @@ end Each array requires essential configuration metadata to be stored, enabling correct interpretation of the stored data. This metadata is encoded using JSON and stored as the -value of the “.zarray” key within an array store. +value of the ".zarray" key within an array store. 
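+
+For illustration, a representative `.zarray` document for a 3×2 `Int64` array
+stored without compression might look like (sketch):
+
+```
+{"zarr_format": 2, "shape": [3, 2], "chunks": [3, 2], "dtype": "<i8",
+ "compressor": null, "fill_value": null, "order": "C", "filters": null}
+```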
+ +# Type Parameters +* T - element type of the array +* N - dimensionality of the array +* C - compressor +* F - filters +* S - dimension separator + +# See Also https://zarr.readthedocs.io/en/stable/spec/v2.html#metadata """ -struct Metadata{T, N, C, F} +abstract type AbstractMetadata{T, N, C, F, S} end + +"""Metadata for Zarr version 2 arrays""" +struct MetadataV2{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} zarr_format::Int + node_type::String shape::Base.RefValue{NTuple{N, Int}} chunks::NTuple{N, Int} dtype::String # structured data types not yet supported @@ -103,43 +116,126 @@ struct Metadata{T, N, C, F} fill_value::Union{T, Nothing} order::Char filters::F # not yet supported - function Metadata{T2, N, C, F}(zarr_format, shape, chunks, dtype, compressor,fill_value, order, filters) where {T2,N,C,F} - #We currently only support version - zarr_format == 2 || throw(ArgumentError("Zarr.jl currently only support v2 of the protocol")) + function MetadataV2{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} + zarr_format == 2 || throw(ArgumentError("MetadataV2 only functions if zarr_format == 2")) #Do some sanity checks to make sure we have a sane array any(<(0), shape) && throw(ArgumentError("Size must be positive")) any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) - new{T2, N, C, F}(zarr_format, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + new{T2, N, C, F, S}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + end + function MetadataV2{T2, N, C, F}( + zarr_format, + node_type, + shape, + chunks, + dtype, + compressor, + fill_value, + order, + filters, + dimension_separator::Char = '.' 
+ ) where {T2,N,C,F} + return MetadataV2{T2, N, C, F, dimension_separator}( + zarr_format, + node_type, + shape, + chunks, + dtype, + compressor, + fill_value, + order, + filters + ) end end +"""Metadata for Zarr version 3 arrays""" +struct MetadataV3{T, N, C, F, S} <: AbstractMetadata{T, N, C, F, S} + zarr_format::Int + node_type::String + shape::Base.RefValue{NTuple{N, Int}} + chunks::NTuple{N, Int} + dtype::String # data_type in v3 + compressor::C + fill_value::Union{T, Nothing} + order::Char + filters::F # not yet supported + function MetadataV3{T2, N, C, F, S}(zarr_format, node_type, shape, chunks, dtype, compressor, fill_value, order, filters) where {T2,N,C,F,S} + zarr_format == 3 || throw(ArgumentError("MetadataV3 only functions if zarr_format == 3")) + #Do some sanity checks to make sure we have a sane array + any(<(0), shape) && throw(ArgumentError("Size must be positive")) + any(<(1), chunks) && throw(ArgumentError("Chunk size must be >= 1 along each dimension")) + order === 'C' || throw(ArgumentError("Currently only 'C' storage order is supported")) + new{T2, N, C, F, S}(zarr_format, node_type, Base.RefValue{NTuple{N,Int}}(shape), chunks, dtype, compressor,fill_value, order, filters) + end +end + +# Type alias for backward compatibility +const Metadata = AbstractMetadata + +const DimensionSeparatedMetadata{S} = AbstractMetadata{<: Any, <: Any, <: Any, <: Any, S} + +function Base.getproperty(m::DimensionSeparatedMetadata{S}, name::Symbol) where S + if name == :dimension_separator + return S + end + return getfield(m, name) +end +Base.propertynames(m::AbstractMetadata) = (fieldnames(typeof(m))..., :dimension_separator) + #To make unit tests pass with ref shape import Base.== -function ==(m1::Metadata, m2::Metadata) +function ==(m1::AbstractMetadata, m2::AbstractMetadata) m1.zarr_format == m2.zarr_format && + m1.node_type == m2.node_type && m1.shape[] == m2.shape[] && m1.chunks == m2.chunks && m1.dtype == m2.dtype && m1.compressor == m2.compressor && m1.fill_value == m2.fill_value && m1.order == m2.order && - m1.filters == m2.filters + m1.filters == m2.filters && + m1.dimension_separator == m2.dimension_separator end "Construct Metadata based on your data" function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}; zarr_format::Integer=2, + node_type::String="array", compressor::C=BloscCompressor(), fill_value::Union{T, Nothing}=nothing, order::Char='C', - filters::Nothing=nothing, + filters=nothing, fill_as_missing = false, + dimension_separator::Char = '.' ) where {T, N, C} + return Metadata(A, chunks, Val(zarr_format); + node_type=node_type, + compressor=compressor, + fill_value=fill_value, + order=order, + filters=filters, + fill_as_missing=fill_as_missing, + dimension_separator=dimension_separator + ) +end + +# V2 constructor +function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{2}; + node_type::String="array", + compressor::C=BloscCompressor(), + fill_value::Union{T, Nothing}=nothing, + order::Char='C', + filters::F=nothing, + fill_as_missing = false, + dimension_separator::Char = '.' + ) where {T, N, C, F} T2 = (fill_value === nothing || !fill_as_missing) ? 
T : Union{T,Missing}
-    Metadata{T2, N, C, typeof(filters)}(
-        zarr_format,
+    MetadataV2{T2, N, C, typeof(filters), dimension_separator}(
+        2,
+        node_type,
         size(A),
         chunks,
         typestr(eltype(A)),
@@ -150,11 +246,44 @@ function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int};
     )
 end
 
+function Metadata(A::AbstractArray{T, N}, chunks::NTuple{N, Int}, ::Val{3};
+    node_type::String="array",
+    compressor::C=BloscCompressor(),
+    fill_value::Union{T, Nothing}=nothing,
+    order::Char='C',
+    filters::F=nothing,
+    fill_as_missing = false,
+    dimension_separator::Char = '.'
+    ) where {T, N, C, F}
+    return Metadata3(A, chunks;
+        node_type=node_type,
+        compressor=compressor,
+        fill_value=fill_value,
+        order=order,
+        filters=filters,
+        fill_as_missing=fill_as_missing,
+        dimension_separator=dimension_separator
+    )
+end
+
 Metadata(s::Union{AbstractString, IO}, fill_as_missing) = Metadata(JSON.parse(s; dicttype=Dict), fill_as_missing)
 
 "Construct Metadata from Dict"
 function Metadata(d::AbstractDict, fill_as_missing)
-    # create a Metadata struct from it
+    zarr_format = d["zarr_format"]::Int
+    if zarr_format == 2
+        return Metadata(d, fill_as_missing, Val(2))
+    elseif zarr_format == 3
+        return Metadata(d, fill_as_missing, Val(3))
+    else
+        throw(ArgumentError("Zarr.jl currently only supports v2 or v3 of the specification"))
+    end
+end
+
+# V2 constructor from Dict
+function Metadata(d::AbstractDict, fill_as_missing, ::Val{2})
+    # Zarr v2 metadata is only for arrays
+    node_type = "array"
     compdict = d["compressor"]
 
     if isnothing(compdict)
@@ -176,8 +305,11 @@
 
     TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing}
 
-    Metadata{TU, N, C, F}(
+    S = only(get(d, "dimension_separator", "."))
+
+    MetadataV2{TU, N, C, F, S}(
         d["zarr_format"],
+        node_type,
         NTuple{N, Int}(d["shape"]) |> reverse,
         NTuple{N, Int}(d["chunks"]) |> reverse,
         d["dtype"],
@@ -188,20 +320,31 @@
     )
 end
 
+# V3 constructor from Dict - delegate to metadata3.jl
+function Metadata(d::AbstractDict, fill_as_missing, ::Val{3})
+    return Metadata3(d, fill_as_missing)
+end
+
 "Describes how to lower Metadata to JSON, used in json(::Metadata)"
-function JSON.lower(md::Metadata)
+function JSON.lower(md::MetadataV2)
     Dict{String, Any}(
         "zarr_format" => md.zarr_format,
+        "node_type" => md.node_type,
         "shape" => md.shape[] |> reverse,
         "chunks" => md.chunks |> reverse,
         "dtype" => md.dtype,
         "compressor" => md.compressor,
         "fill_value" => fill_value_encoding(md.fill_value),
         "order" => md.order,
-        "filters" => md.filters
+        "filters" => md.filters,
+        "dimension_separator" => md.dimension_separator
     )
 end
 
+function JSON.lower(md::MetadataV3)
+    return lower3(md)
+end
+
 # Fill value encoding and decoding as described in
 # https://zarr.readthedocs.io/en/stable/spec/v2.html#fill-value-encoding
@@ -217,7 +360,7 @@ function fill_value_encoding(v::AbstractFloat)
     end
 end
 
-Base.eltype(::Metadata{T}) where T = T
+Base.eltype(::AbstractMetadata{T}) where T = T
 
 # this correctly parses "NaN" and "Infinity"
 fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v)
diff --git a/src/metadata3.jl b/src/metadata3.jl
new file mode 100644
index 0000000..5a45c69
--- /dev/null
+++ b/src/metadata3.jl
@@ -0,0 +1,337 @@
+# Prototype Zarr version 3 support
+
+const typemap3 = Dict{String, DataType}()
+foreach([Bool, Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64]) do t
+    typemap3[lowercase(string(t))] = t
+end
+typemap3["complex64"] = ComplexF32
+typemap3["complex128"] = ComplexF64
+
+function typestr3(t::Type)
+    return lowercase(string(t))
+end
+# TODO: Check raw types
+function typestr3(::Type{NTuple{N,UInt8}}) where {N}
+    return "r$(N*8)"
+end
+
+function typestr3(s::AbstractString, codecs=nothing)
+    if !haskey(typemap3, s)
+        if startswith(s, "r")
+            num_bits = tryparse(Int, s[2:end])
+            if isnothing(num_bits)
+                throw(ArgumentError("$s is not a known type"))
+            end
+            if mod(num_bits, 8) == 0
+                return NTuple{num_bits÷8,UInt8}
+            else
+                throw(ArgumentError("$s must describe a raw type with a bit size that is a multiple of 8 bits"))
+            end
+        end
+        throw(ArgumentError("$s is not a known type"))
+    end
+    return typemap3[s]
+end
+
+function check_keys(d::AbstractDict, keys)
+    for key in keys
+        if !haskey(d, key)
+            throw(ArgumentError("Zarr v3 metadata must have a key called $key"))
+        end
+    end
+end
+
+function Metadata3(d::AbstractDict, fill_as_missing)
+    check_keys(d, ("zarr_format", "node_type"))
+
+    zarr_format = d["zarr_format"]::Int
+
+    node_type = d["node_type"]::String
+    if node_type ∉ ("group", "array")
+        throw(ArgumentError("Unknown node_type of $node_type"))
+    end
+
+    zarr_format == 3 || throw(ArgumentError("Metadata3 only functions if zarr_format == 3"))
+
+    # Groups
+    if node_type == "group"
+        # Groups only need zarr_format and node_type
+        # Optionally they can have attributes
+        for key in keys(d)
+            if key ∉ ("zarr_format", "node_type", "attributes")
+                throw(ArgumentError("Zarr v3 group metadata cannot have a key called $key"))
+            end
+        end
+
+        return MetadataV3{Int,0,Nothing,Nothing,'/'}(zarr_format, node_type, (), (), "", nothing, 0, 'C', nothing)
+    end
+
+    # Array keys
+    mandatory_keys = [
+        "zarr_format",
+        "node_type",
+        "shape",
+        "data_type",
+        "chunk_grid",
+        "chunk_key_encoding",
+        "fill_value",
+        "codecs",
+    ]
+    optional_keys = [
+        "attributes",
+        "storage_transformers",
+        "dimension_names",
+    ]
+
+    check_keys(d, mandatory_keys)
+    for key in keys(d)
+        if key ∉ mandatory_keys && key ∉ optional_keys
+            throw(ArgumentError("Zarr v3 metadata cannot have a key called $key"))
+        end
+    end
+
+    # Shape
+    shape = Int.(d["shape"])
+
+    # Datatype
+    data_type = d["data_type"]::String
+
+    # Chunk Grid
+    chunk_grid = d["chunk_grid"]
+    if chunk_grid["name"] == "regular"
+        chunks = Int.(chunk_grid["configuration"]["chunk_shape"])
+        if length(shape) != length(chunks)
+            throw(ArgumentError("Shape has rank $(length(shape)) which does not match the chunk_shape rank of $(length(chunks))"))
+        end
+    else
+        throw(ArgumentError("Unknown chunk_grid of name, $(chunk_grid["name"])"))
+    end
+
+    # Chunk Key Encoding
+    chunk_key_encoding = d["chunk_key_encoding"]
+    if chunk_key_encoding["name"] == "default"
+    elseif chunk_key_encoding["name"] == "v2"
+    else
+        throw(ArgumentError("Unknown chunk_key_encoding of name, $(chunk_key_encoding["name"])"))
+    end
+
+
+    # Codecs
+    compdict = nothing
+
+    # For transpose codec permutation tracking
+    default_dim_perm = Tuple(1:length(shape))
+    dim_perm = default_dim_perm
+
+    codec_data_type = Ref(:array)
+
+    function check_codec_data_type(codec_name, from, to)
+        codec_data_type[] == from ||
+            throw(ArgumentError("Codec $codec_name expects $from input, but the preceding codecs produce $(codec_data_type[])"))
+        codec_data_type[] = to
+        return nothing
+    end
+
+    for codec in d["codecs"]
+        codec_name = codec["name"]
+        if codec_name == "bytes"
+            # array -> bytes
+            check_codec_data_type(codec_name, :array, :bytes)
+            if haskey(codec, "configuration")
+                codec["configuration"]["endian"] == "little" ||
+                    throw(ArgumentError("Zarr.jl currently only supports little endian for the bytes codec"))
+            end
+        elseif codec_name == "zstd"
+            # bytes -> bytes
+            check_codec_data_type(codec_name, :bytes, :bytes)
+            compdict = codec
+        elseif codec_name == "blosc"
+            # bytes -> bytes
+            check_codec_data_type(codec_name, :bytes, :bytes)
+            compdict = codec
+        elseif codec_name == "gzip"
+            # bytes -> bytes
+            check_codec_data_type(codec_name, :bytes, :bytes)
+            compdict = codec
+        elseif codec_name == "transpose"
+            # array -> array
+            check_codec_data_type(codec_name, :array, :array)
+            _dim_order = codec["configuration"]["order"]
+            if _dim_order == "C"
+                @warn "Transpose codec dimension order of $_dim_order is deprecated"
+                _dim_order = 1:length(shape)
+            elseif _dim_order == "F"
+                @warn "Transpose codec dimension order of $_dim_order is deprecated"
+                _dim_order = reverse(1:length(shape))
+            else
+                _dim_order = Int.(codec["configuration"]["order"]) .+ 1
+            end
+            dim_perm = dim_perm[_dim_order]
+        elseif codec_name == "sharding_indexed"
+            # array -> bytes
+            check_codec_data_type(codec_name, :array, :bytes)
+            # TODO: Implement sharding codec support
+            # See implementation suggestions in src/Codecs/V3/V3.jl for ShardingCodec
+            throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec. See src/Codecs/V3/V3.jl for implementation suggestions."))
+        elseif codec_name == "crc32c"
+            # bytes -> bytes
+            check_codec_data_type(codec_name, :bytes, :bytes)
+            throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec"))
+        else
+            throw(ArgumentError("Zarr.jl currently does not support the $(codec["name"]) codec"))
+        end
+    end
+
+    if dim_perm == default_dim_perm
+        order = 'C'
+    elseif dim_perm == reverse(default_dim_perm)
+        order = 'F'
+    else
+        throw(ArgumentError("Dimension permutation of $dim_perm is not implemented"))
+    end
+
+    compressor = getCompressor(compdict)
+
+    # Filters (NOT IMPLEMENTED)
+    # For v3, filters are not yet implemented, so we return nothing
+    filters = nothing
+
+    # Type Parameters
+    T = typestr3(data_type)
+    N = length(shape)
+    C = typeof(compressor)
+    F = typeof(filters)
+
+    fv = fill_value_decoding(d["fill_value"], T)::T
+
+    TU = (fv === nothing || !fill_as_missing) ? T : Union{T,Missing}
+
+    cke_configuration = get(chunk_key_encoding, "configuration") do
+        Dict{String,Any}()
+    end
+    # The "v2" encoding defaults to a '.' separator; the "default" encoding defaults to '/'
+    if chunk_key_encoding["name"] == "v2"
+        separator = only(get(cke_configuration, "separator", "."))
+        S = V2ChunkKeyEncoding{separator}()
+    elseif chunk_key_encoding["name"] == "default"
+        S = only(get(cke_configuration, "separator", "/"))
+    end
+
+    MetadataV3{TU, N, C, F, S}(
+        zarr_format,
+        node_type,
+        NTuple{N, Int}(shape) |> reverse,
+        NTuple{N, Int}(chunks) |> reverse,
+        data_type,
+        compressor,
+        fv,
+        order,
+        filters,
+    )
+end
+
+"Construct MetadataV3 based on your data"
+function Metadata3(A::AbstractArray{T, N}, chunks::NTuple{N, Int};
+    node_type::String="array",
+    compressor::C=BloscCompressor(),
+    fill_value::Union{T, Nothing}=nothing,
+    order::Char='C',
+    filters::F=nothing,
+    fill_as_missing = false,
+    dimension_separator::Char = '/'
+    ) where {T, N, C, F}
+    @warn("Zarr v3 support is experimental")
+    T2 = (fill_value === nothing || !fill_as_missing) ? T : Union{T,Missing}
+    if fill_value === nothing
+        fill_value = zero(T)
+    end
+    MetadataV3{T2, N, C, typeof(filters), dimension_separator}(
+        3,
+        node_type,
+        size(A),
+        chunks,
+        typestr3(eltype(A)),
+        compressor,
+        fill_value,
+        order,
+        filters
+    )
+end
+
+function lower3(md::MetadataV3{T}) where T
+
+    chunk_grid = Dict{String,Any}(
+        "name" => "regular",
+        "configuration" => Dict{String,Any}(
+            "chunk_shape" => md.chunks |> reverse
+        )
+    )
+
+    chunk_key_encoding = Dict{String,Any}(
+        "name" => isa(md.dimension_separator, Char) ? "default" :
+                  isa(md.dimension_separator, V2ChunkKeyEncoding) ? "v2" :
+                  error("Unknown encoding for $(md.dimension_separator)"),
+        "configuration" => Dict{String,Any}(
+            "separator" => separator(md.dimension_separator)
+        )
+    )
+
+    # TODO: Incorporate filters
+    codecs = Dict{String,Any}[]
+
+    # Zarr v3 transpose orders are zero-based
+    default_dim_perm = Tuple(0:length(md.shape[])-1)
+
+    # Encode the order as a single transpose codec (array to array)
+    push!(codecs,
+        Dict{String,Any}(
+            "name" => "transpose",
+            "configuration" => Dict(
+                "order" => md.order == 'C' ? default_dim_perm :
+                           md.order == 'F' ? reverse(default_dim_perm) :
+                           error("Unable to encode order $(md.order)")
+            )
+        )
+    )
+
+    # Convert from array to bytes
+    push!(codecs,
+        Dict{String,Any}(
+            "name" => "bytes",
+            "configuration" => Dict{String, Any}(
+                "endian" => "little"
+            )
+        )
+    )
+    # Compress bytes to bytes (only if not NoCompressor)
+    if !(md.compressor isa NoCompressor)
+        push!(codecs, JSON.lower(Compressor_v3(md.compressor)))
+    end
+
+    Dict{String, Any}(
+        "zarr_format" => md.zarr_format,
+        "node_type" => md.node_type,
+        "shape" => md.shape[] |> reverse,
+        "data_type" => typestr3(T),
+        "chunk_grid" => chunk_grid,
+        "chunk_key_encoding" => chunk_key_encoding,
+        "fill_value" => fill_value_encoding(md.fill_value)::T,
+        "codecs" => codecs
+    )
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index f730495..b705c8c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -13,11 +13,14 @@ using Dates
 @testset "fields" begin
     z = zzeros(Int64, 2, 3)
     @test z isa ZArray{Int64, 2, Zarr.BloscCompressor,
-        Zarr.DictStore}
+        Zarr.FormattedStore{2, '.', Zarr.DictStore}}
+    @test :a ∈ propertynames(z.storage)
     @test length(z.storage.a) === 3
     @test length(z.storage.a["0.0"]) === 64
     @test eltype(z.storage.a["0.0"]) === UInt8
+    @test z.metadata.zarr_format === 2
+    @test z.metadata.node_type === "array"
     @test z.metadata.shape[] === (2, 3)
     @test z.metadata.order === 'C'
     @test z.metadata.chunks === (2, 3)
@@ -29,16 +32,17 @@ using Dates
     @test z.metadata.compressor.shuffle === 1
     @test z.attrs == Dict{Any, Any}()
     @test z.writeable === true
+    @test z.metadata.dimension_separator === Zarr.DS
+    @test :dimension_separator ∈ propertynames(z.metadata)
     @test_throws ArgumentError zzeros(Int64,2,3, chunks = (0,1))
     @test_throws ArgumentError zzeros(Int64,0,-1)
-    @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), zarr_format = 3)
     @test_throws ArgumentError Zarr.Metadata(zeros(2,2), (2,2), order = 'F')
 end

 @testset "methods" begin
     z = zzeros(Int64, 2, 3)
     @test z isa ZArray{Int64, 2, Zarr.BloscCompressor,
-        Zarr.DictStore}
+        Zarr.FormattedStore{2, '.', Zarr.DictStore}}

     @test eltype(z) === Int64
     @test ndims(z) === 2
@@ -60,7 +64,7 @@ using Dates
         compressor=Zarr.NoCompressor())
     @test z.metadata.compressor === Zarr.NoCompressor()
-    @test z.storage === Zarr.DirectoryStore("$dir/$name")
+    @test z.storage === Zarr.FormattedStore{2, '.'}(Zarr.DirectoryStore("$dir/$name"))
     @test isdir("$dir/$name")
     @test ispath("$dir/$name/.zarray")
     @test ispath("$dir/$name/.zattrs")
@@ -69,12 +73,15 @@ using Dates
     @test JSON.parsefile("$dir/$name/.zarray") == Dict{String, Any}(
         "dtype" => "<i8",
         "filters" => nothing,
-        "shape" => [3, 2],
+        "shape" => Any[3, 2],
         "order" => "C",
         "zarr_format" => 2,
-        "chunks" => [3, 2],
+        "node_type" => "array",
+        "chunks" => Any[3, 2],
         "fill_value" => nothing,
-        "compressor" => nothing)
+        "compressor" => nothing,
+        "dimension_separator" => "."
+        )
     # call gc to avoid unlink: operation not permitted (EPERM) on Windows
     # might be because files are left open
     # from https://github.com/JuliaLang/julia/blob/f6344d32d3ebb307e2b54a77e042559f42d2ebf6/stdlib/SharedArrays/test/runtests.jl#L146
diff --git a/test/storage.jl b/test/storage.jl
index d233301..1a06d40 100644
--- a/test/storage.jl
+++ b/test/storage.jl
@@ -14,10 +14,39 @@ using AWSS3
     @test Zarr.normalize_path("/path/to/a") == "/path/to/a"
 end

+@testset "Version and Dimension Separator" begin
+    v2cke_period = Zarr.V2ChunkKeyEncoding{'.'}
+    v2cke_slash = Zarr.V2ChunkKeyEncoding{'/'}
+    let ci = CartesianIndex()
+        @test Zarr.citostring(ci, 2, '.') == "0"
+        @test Zarr.citostring(ci, 2, '/') == "0"
+        @test Zarr.citostring(ci, 3, v2cke_period) == "0"
+        @test Zarr.citostring(ci, 3, v2cke_slash) == "0"
+        @test Zarr.citostring(ci, 3, '.') == "c.0"
+        @test Zarr.citostring(ci, 3, '/') == "c/0"
+    end
+    let ci = CartesianIndex(1,1,1)
+        @test Zarr.citostring(ci, 2, '.') == "0.0.0"
+        @test Zarr.citostring(ci, 2, '/') == "0/0/0"
+        @test Zarr.citostring(ci, 3, v2cke_period) == "0.0.0"
+        @test Zarr.citostring(ci, 3, v2cke_slash) == "0/0/0"
+        @test Zarr.citostring(ci, 3, '.') == "c.0.0.0"
+        @test Zarr.citostring(ci, 3, '/') == "c/0/0/0"
+    end
+    let ci = CartesianIndex(1,3,5)
+        @test Zarr.citostring(ci, 2, '.') == "4.2.0"
+        @test Zarr.citostring(ci, 2, '/') == "4/2/0"
+        @test Zarr.citostring(ci, 3, v2cke_period) == "4.2.0"
+        @test Zarr.citostring(ci, 3, v2cke_slash) == "4/2/0"
+        @test Zarr.citostring(ci, 3, '.') == "c.4.2.0"
+        @test Zarr.citostring(ci, 3, '/') == "c/4/2/0"
+    end
+end
+
 """
 Function to test the interface of AbstractStore. Every complete implementation
 should pass this test.
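For example, `test_store_common(Zarr.FormattedStore{2, '.'}(Zarr.DictStore()))`
exercises the whole interface on a fresh in-memory store (illustrative usage;
the store wrapper mirrors the construction used in the tests above).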
""" -function test_store_common(ds) +function test_store_common(ds::Zarr.AbstractStore) @test !Zarr.is_zgroup(ds,"") ds[".zgroup"]=rand(UInt8,50) @test haskey(ds,".zgroup") @@ -37,17 +66,23 @@ function test_store_common(ds) @test Zarr.subdirs(ds,"bar") == String[] #Test getindex and setindex data = rand(UInt8,50) - ds["bar/0.0.0"] = data + V = Zarr.zarr_format(ds) + S = Zarr.dimension_separator(ds) + first_ci_str = Zarr.citostring(CartesianIndex(1,1,1), V, S) + second_ci_str = Zarr.citostring(CartesianIndex(2,1,1), V, S) + ds["bar/" * first_ci_str] = data @test ds["bar/0.0.0"]==data @test Zarr.storagesize(ds,"bar")==50 - @test Zarr.isinitialized(ds,"bar/0.0.0") - @test !Zarr.isinitialized(ds,"bar/0.0.1") + @test Zarr.isinitialized(ds,"bar/" * first_ci_str) + @test !Zarr.isinitialized(ds,"bar/" * second_ci_str) Zarr.writeattrs(ds,"bar",Dict("a"=>"b")) @test Zarr.getattrs(ds,"bar")==Dict("a"=>"b") - delete!(ds,"bar/0.0.0") - @test !Zarr.isinitialized(ds,"bar",CartesianIndex((0,0,0))) - @test !Zarr.isinitialized(ds,"bar/0.0.0") - ds["bar/0.0.0"] = data + delete!(ds,"bar/" * first_ci_str) + @test !Zarr.isinitialized(ds,"bar",CartesianIndex((1,1,1))) + @test !Zarr.isinitialized(ds,"bar/" * first_ci_str) + ds["bar/" * first_ci_str] = data + @test !Zarr.isinitialized(ds, "bar", CartesianIndex(0,0,0)) + @test Zarr.isinitialized(ds, "bar", CartesianIndex(1,1,1)) #Add tests for empty storage @test Zarr.isemptysub(ds,"ba") @test Zarr.isemptysub(ds,"ba/") @@ -157,6 +192,7 @@ end @testset "Minio S3 storage" begin + @info "Testing Minio S3 storage" A = fill(1.0, 30, 20) chunks = (5,10) metadata = Zarr.Metadata(A, chunks; fill_value=-1.5) @@ -177,6 +213,7 @@ end end @testset "AWS S3 Storage" begin + @info "Testing AWS S3 storage" AWSS3.AWS.global_aws_config(AWSS3.AWS.AWSConfig(creds=nothing, region="us-west-2")) S3, p = Zarr.storefromstring("s3://mur-sst/zarr-v1") @test Zarr.is_zgroup(S3, p) @@ -189,6 +226,7 @@ end end @testset "GCS Storage" begin + @info "Testing GCS storage" for s in ( "gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/highresSST-present/r1i1p1f1/6hrPlev/psl/gn/v20170706", "https://storage.googleapis.com/cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/highresSST-present/r1i1p1f1/6hrPlev/psl/gn/v20170706", @@ -210,6 +248,7 @@ end end @testset "HTTP Storage" begin + @info "Testing HTTP Storage" s = Zarr.DictStore() g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5)) @@ -245,6 +284,7 @@ end end @testset "Zip Storage" begin + @info "Testing Zip Storage" s = Zarr.DictStore() g = zgroup(s, attrs = Dict("groupatt"=>5)) a = zcreate(Int,g,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5)) @@ -265,4 +305,5 @@ end Zarr.writezip(io, ds) Zarr.ZipStore(take!(io)) end + @info "Finished testing ZipStore" end diff --git a/test/v3_julia.jl b/test/v3_julia.jl new file mode 100644 index 0000000..ec73502 --- /dev/null +++ b/test/v3_julia.jl @@ -0,0 +1,308 @@ +# Julia script to generate Zarr v3 fixtures using pure Julia +# Mirrors the examples from v3_python.jl + +using Zarr +using JSON + +# Paths +path_v3 = joinpath(@__DIR__, "v3_julia", "data.zarr") + +# Remove existing +if isdir(path_v3) + rm(path_v3, recursive=true) +end + +# Create store and root group for v3 +store = Zarr.FormattedStore{3, '/'}(Zarr.DirectoryStore(path_v3)) +# Manually create v3 group metadata (zgroup defaults to v2) # TODO: we need to fix this! 
+group_meta = Dict("zarr_format" => 3, "node_type" => "group") +b = IOBuffer() +JSON.print(b, group_meta) +store["", "zarr.json"] = take!(b) + +# Helper: create array and set data +function create_and_fill(store, name, data; + dtype=nothing, + shape=nothing, + chunks=nothing, + compressor=Zarr.BloscCompressor(), + fill_value=nothing, + zarr_format=3, + dimension_separator='/') + + # Create the array + z = zcreate(eltype(data), store, shape...; + path=name, + chunks=chunks, + compressor=compressor, + fill_value=fill_value, + zarr_format=zarr_format, + dimension_separator=dimension_separator) + # Fill the array with the data + z[:] = data + return z +end + +# 1d.contiguous.gzip.i2 +create_and_fill(store, "1d.contiguous.gzip.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.blosc.i2 +create_and_fill(store, "1d.contiguous.blosc.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.raw.i2 +create_and_fill(store, "1d.contiguous.raw.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.NoCompressor(), +) + +# 1d.contiguous.i4 +create_and_fill(store, "1d.contiguous.i4", Int32[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.u1 +create_and_fill(store, "1d.contiguous.u1", UInt8[255,0,255,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f2.le +create_and_fill(store, "1d.contiguous.f2.le", Float16[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f4.le +create_and_fill(store, "1d.contiguous.f4.le", Float32[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f4.be +# Note: Big endian is not directly supported in Julia, but we can create the array +# The actual endianness is handled by the bytes codec in v3 +create_and_fill(store, "1d.contiguous.f4.be", Float32[-1000.5, 0.0, 1000.5, 0.0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.f8 +create_and_fill(store, "1d.contiguous.f8", Float64[1.5,2.5,3.5,4.5]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.contiguous.b1 +create_and_fill(store, "1d.contiguous.b1", Bool[true,false,true,false]; + shape=(4,), + chunks=(4,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 1d.chunked.i2 +z = create_and_fill(store, "1d.chunked.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(2,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# Adjust zarr.json to set dimension_names = null +meta_path = joinpath(path_v3, "1d.chunked.i2", "zarr.json") +meta = JSON.parsefile(meta_path; dicttype = Dict{String,Any}) +meta["dimension_names"] = nothing +open(meta_path, "w") do io + JSON.print(io, meta) +end + +# 1d.chunked.ragged.i2 +create_and_fill(store, "1d.chunked.ragged.i2", Int16[1,2,3,4,5]; + shape=(5,), + chunks=(2,), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.contiguous.i2 +create_and_fill(store, "2d.contiguous.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(2,2), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.chunked.i2 +create_and_fill(store, "2d.chunked.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(1,1), + 
compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 2d.chunked.ragged.i2 +create_and_fill(store, "2d.chunked.ragged.i2", Int16[1 2 3; 4 5 6; 7 8 9]; + shape=(3,3), + chunks=(2,2), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.contiguous.i2 +create_and_fill(store, "3d.contiguous.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,3), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.i2 +create_and_fill(store, "3d.chunked.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(1,1,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.mixed.i2.C +create_and_fill(store, "3d.chunked.mixed.i2.C", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +# 3d.chunked.mixed.i2.F +# Note: Column-major order (F) is simulated with transpose filter in Python +# In Julia, we create with C order as that's what's currently supported +create_and_fill(store, "3d.chunked.mixed.i2.F", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.BloscCompressor(shuffle=0), # noshuffle +) + +##### Sharded/compressed examples +# Note: Sharding is not yet fully implemented in Zarr.jl, so these examples +# may not produce the exact same structure as the Python version. +# They are included for completeness but may need adjustment once sharding is supported. + +# 1d.contiguous.compressed.sharded.i2 +create_and_fill(store, "1d.contiguous.compressed.sharded.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.i4 +create_and_fill(store, "1d.contiguous.compressed.sharded.i4", Int32[1,2,3,4]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.u1 +create_and_fill(store, "1d.contiguous.compressed.sharded.u1", UInt8[255,0,255,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.f4 +create_and_fill(store, "1d.contiguous.compressed.sharded.f4", Float32[-1000.5,0,1000.5,0]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.f8 +create_and_fill(store, "1d.contiguous.compressed.sharded.f8", Float64[1.5,2.5,3.5,4.5]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.contiguous.compressed.sharded.b1 +create_and_fill(store, "1d.contiguous.compressed.sharded.b1", Bool[true,false,true,false]; + shape=(4,), + chunks=(4,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.chunked.compressed.sharded.i2 +create_and_fill(store, "1d.chunked.compressed.sharded.i2", Int16[1,2,3,4]; + shape=(4,), + chunks=(1,), + compressor=Zarr.ZlibCompressor(), +) + +# 1d.chunked.filled.compressed.sharded.i2 +create_and_fill(store, "1d.chunked.filled.compressed.sharded.i2", Int16[1,2,0,0]; + shape=(4,), + chunks=(1,), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.contiguous.compressed.sharded.i2 +create_and_fill(store, "2d.contiguous.compressed.sharded.i2", Int16[1 2; 3 4]; + shape=(2,2), + chunks=(2,2), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.compressed.sharded.filled.i2 +create_and_fill(store, "2d.chunked.compressed.sharded.filled.i2", reshape(Int16.(0:15), 4, 4); + shape=(4,4), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.compressed.sharded.i2 +create_and_fill(store, "2d.chunked.compressed.sharded.i2", reshape(Int16.(1:16), 4, 4); + shape=(4,4), 
+ chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 2d.chunked.ragged.compressed.sharded.i2 +create_and_fill(store, "2d.chunked.ragged.compressed.sharded.i2", reshape(Int16.(1:9), 3, 3); + shape=(3,3), + chunks=(1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.contiguous.compressed.sharded.i2 +create_and_fill(store, "3d.contiguous.compressed.sharded.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,3), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.chunked.compressed.sharded.i2 +create_and_fill(store, "3d.chunked.compressed.sharded.i2", reshape(Int16.(0:63), 4, 4, 4); + shape=(4,4,4), + chunks=(1,1,1), + compressor=Zarr.ZlibCompressor(), +) + +# 3d.chunked.mixed.compressed.sharded.i2 +create_and_fill(store, "3d.chunked.mixed.compressed.sharded.i2", reshape(Int16.(0:26), 3, 3, 3); + shape=(3,3,3), + chunks=(3,3,1), + compressor=Zarr.ZlibCompressor(), +) + +# Group with spaces in the name +group_path = "my group with spaces" +group_meta2 = Dict("zarr_format" => 3, "node_type" => "group", "attributes" => Dict("description" => "A group with spaces in the name")) +b2 = IOBuffer() +JSON.print(b2, group_meta2) +store[group_path, "zarr.json"] = take!(b2) + +@info "Zarr v3 fixtures generated at: $path_v3" \ No newline at end of file diff --git a/test/v3_python.jl b/test/v3_python.jl new file mode 100644 index 0000000..247d181 --- /dev/null +++ b/test/v3_python.jl @@ -0,0 +1,479 @@ +# Julia script to generate Zarr v3 fixtures using PythonCall + CondaPkg +# Adapted from: https://github.com/manzt/zarrita.js/blob/23abb3bee9094aabbe60985626caef2802360963/scripts/generate-v3.py + +using CondaPkg +using JSON + +# Install Python deps into Conda env used by PythonCall (zarr v3 and numpy) +CondaPkg.add("numpy") +CondaPkg.add("zarr"; version="3.*") +CondaPkg.add("numcodecs") + +using PythonCall +# Import Python modules +np = pyimport("numpy") +zarr = pyimport("zarr") +codecs = pyimport("zarr.codecs") +storage = pyimport("zarr.storage") +json = pyimport("json") +shutil = pyimport("shutil") +pathlib = pyimport("pathlib") +builtins = pyimport("builtins") + +# Paths +path_v3 = joinpath(@__DIR__, "v3_python", "data.zarr") + +# deterministic RNG for numpy +np.random.seed(42) + +# remove existing +try + shutil.rmtree(path_v3) +catch + # ignore +end + +# create store and path_v3 group +store = storage.LocalStore(path_v3) +zarr.create_group(store) + +# helper: create array and set data (value should be a numpy array or convertible) +function create_and_fill(store; name, dtype=nothing, shape=nothing, chunks=nothing, + serializer=nothing, compressors=nothing, filters=nothing, shards=nothing, data) + # Build NamedTuple of only non-nothing keyword arguments + kwargs = (; name=name) + if dtype !== nothing + kwargs = merge(kwargs, (; dtype=dtype)) + end + if shape !== nothing + kwargs = merge(kwargs, (; shape=shape)) + end + if chunks !== nothing + kwargs = merge(kwargs, (; chunks=chunks)) + end + if serializer !== nothing + kwargs = merge(kwargs, (; serializer=serializer)) + end + if compressors !== nothing + kwargs = merge(kwargs, (; compressors=compressors)) + end + if filters !== nothing + kwargs = merge(kwargs, (; filters=filters)) + end + if shards !== nothing + kwargs = merge(kwargs, (; shards=shards)) + end + + # create the array + a = zarr.create_array(store; kwargs...) + + # ensure numpy array + arr = data isa Py ? 
data : np.array(data) + + # assign content + a.__setitem__(builtins.Ellipsis, arr) + + return a +end + +# 1d.contiguous.gzip.i2 +create_and_fill(store; + name="1d.contiguous.gzip.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=[1,2,3,4], +) + +# 1d.contiguous.blosc.i2 +create_and_fill(store; + name="1d.contiguous.blosc.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=[1,2,3,4], +) + +# 1d.contiguous.raw.i2 +create_and_fill(store; + name="1d.contiguous.raw.i2", + dtype="int16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=nothing, + data=[1,2,3,4], +) + +# 1d.contiguous.i4 +create_and_fill(store; + name="1d.contiguous.i4", + dtype="int32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=[1,2,3,4], +) + +# 1d.contiguous.u1 +create_and_fill(store; + name="1d.contiguous.u1", + dtype="uint8", + shape=(4,), + chunks=(4,), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([255,0,255,0], dtype="u1") +) + +# 1d.contiguous.f2.le +create_and_fill(store; + name="1d.contiguous.f2.le", + dtype="float16", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f2"), +) + +# 1d.contiguous.f4.le +create_and_fill(store; + name="1d.contiguous.f4.le", + dtype="float32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f4"), +) + +# 1d.contiguous.f4.be +create_and_fill(store; + name="1d.contiguous.f4.be", + dtype="float32", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="big"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([-1000.5, 0.0, 1000.5, 0.0], dtype="f4"), +) + +# 1d.contiguous.f8 +create_and_fill(store; + name="1d.contiguous.f8", + dtype="float64", + shape=(4,), + chunks=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1.5,2.5,3.5,4.5], dtype="f8"), +) + +# 1d.contiguous.b1 +create_and_fill(store; + name="1d.contiguous.b1", + dtype="bool", + shape=(4,), + chunks=(4,), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([true,false,true,false], dtype="bool"), +) + +# 1d.chunked.i2 +create_and_fill(store; + name="1d.chunked.i2", + dtype="int16", + shape=(4,), + chunks=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([1,2,3,4], dtype="i2"), +) + +# adjust zarr.json to set dimension_names = null +meta_path = joinpath(path_v3, "1d.chunked.i2", "zarr.json") +meta = JSON.parsefile(meta_path; dicttype = Dict{String,Any}) +meta["dimension_names"] = nothing +open(meta_path, "w") do io + JSON.print(io, meta) +end + +# 1d.chunked.ragged.i2 +create_and_fill(store; + name="1d.chunked.ragged.i2", + dtype="int16", + shape=(5,), + chunks=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + 
data=np.array([1,2,3,4,5], dtype="i2"), +) + +# 2d.contiguous.i2 +create_and_fill(store; + name="2d.contiguous.i2", + dtype="int16", + shape=(2,2), + chunks=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data= np.array([ [1,2], [3,4] ] |> pylist, dtype="i2"), +) + +# 2d.chunked.i2 +create_and_fill(store; + name="2d.chunked.i2", + dtype="int16", + shape=(2,2), + chunks=(1,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([[1,2],[3,4]] |> pylist, dtype="i2"), +) + +# 2d.chunked.ragged.i2 +create_and_fill(store; + name="2d.chunked.ragged.i2", + dtype="int16", + shape=(3,3), + chunks=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.array([[1,2,3],[4,5,6],[7,8,9]] |> pylist, dtype="i2"), +) + +# 3d.contiguous.i2 +create_and_fill(store; + name="3d.contiguous.i2", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.i2 +create_and_fill(store; + name="3d.chunked.i2", + dtype="int16", + shape=(3,3,3), + chunks=(1,1,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.mixed.i2.C +create_and_fill(store; + name="3d.chunked.mixed.i2.C", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,1), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +# 3d.chunked.mixed.i2.F (with transpose filter to simulate column-major) +transpose_filter = codecs.TransposeCodec(order=[2,1,0]) +create_and_fill(store; + name="3d.chunked.mixed.i2.F", + dtype="int16", + shape=(3,3,3), + chunks=(3,3,1), + filters=[transpose_filter], + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.BloscCodec(typesize=4, shuffle="noshuffle")], + data=np.arange(27).reshape(3,3,3), +) + +##### Sharded/compressed examples +# 1d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,3,4], dtype="i2").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i2"), +) + +# 1d.contiguous.compressed.sharded.i4 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.i4", + shape=(4,), + dtype=np.array([1,2,3,4], dtype="i4").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i4"), +) + +# 1d.contiguous.compressed.sharded.u1 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.u1", + shape=(4,), + dtype=np.array([255,0,255,0], dtype="u1").dtype, + chunks=(4,), + shards=(4,), + compressors=[codecs.GzipCodec()], + data=np.array([255,0,255,0], dtype="u1"), +) + +# 1d.contiguous.compressed.sharded.f4 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.f4", + shape=(4,), + dtype=np.array([-1000.5,0,1000.5,0], dtype="f4").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([-1000.5,0,1000.5,0], 
dtype="f4"), +) + +# 1d.contiguous.compressed.sharded.f8 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.f8", + shape=(4,), + dtype=np.array([1.5,2.5,3.5,4.5], dtype="f8").dtype, + chunks=(4,), + shards=(4,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1.5,2.5,3.5,4.5], dtype="f8"), +) + +# 1d.contiguous.compressed.sharded.b1 +create_and_fill(store; + name="1d.contiguous.compressed.sharded.b1", + shape=(4,), + dtype="bool", + chunks=(4,), + shards=(4,), + compressors=[codecs.GzipCodec()], + data=np.array([true,false,true,false], dtype="bool"), +) + +# 1d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="1d.chunked.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,3,4], dtype="i2").dtype, + chunks=(1,), + shards=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,3,4], dtype="i2"), +) + +# 1d.chunked.filled.compressed.sharded.i2 +create_and_fill(store; + name="1d.chunked.filled.compressed.sharded.i2", + shape=(4,), + dtype=np.array([1,2,0,0], dtype="i2").dtype, + chunks=(1,), + shards=(2,), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.array([1,2,0,0], dtype="i2"), +) + +# 2d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="2d.contiguous.compressed.sharded.i2", + shape=(2,2), + dtype=np.arange(1,5, dtype="i2").dtype, + chunks=(2,2), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(1,5, dtype="i2").reshape(2,2), +) + +# 2d.chunked.compressed.sharded.filled.i2 +create_and_fill(store; + name="2d.chunked.compressed.sharded.filled.i2", + shape=(4,4), + dtype=np.arange(16, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(16, dtype="i2").reshape(4,4), +) + +# 2d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="2d.chunked.compressed.sharded.i2", + shape=(4,4), + dtype=np.arange(16, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=(np.arange(16, dtype="i2").reshape(4,4) + 1), +) + +# 2d.chunked.ragged.compressed.sharded.i2 +create_and_fill(store; + name="2d.chunked.ragged.compressed.sharded.i2", + shape=(3,3), + dtype=np.arange(1,10, dtype="i2").dtype, + chunks=(1,1), + shards=(2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(1,10, dtype="i2").reshape(3,3), +) + +# 3d.contiguous.compressed.sharded.i2 +create_and_fill(store; + name="3d.contiguous.compressed.sharded.i2", + shape=(3,3,3), + dtype=np.arange(27, dtype="i2").dtype, + chunks=(3,3,3), + shards=(3,3,3), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(27, dtype="i2").reshape(3,3,3), +) + +# 3d.chunked.compressed.sharded.i2 +create_and_fill(store; + name="3d.chunked.compressed.sharded.i2", + shape=(4,4,4), + dtype=np.arange(64, dtype="i2").dtype, + chunks=(1,1,1), + shards=(2,2,2), + serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(64, dtype="i2").reshape(4,4,4), +) + +# 3d.chunked.mixed.compressed.sharded.i2 +create_and_fill(store; + name="3d.chunked.mixed.compressed.sharded.i2", + shape=(3,3,3), + dtype=np.arange(27, dtype="i2").dtype, + chunks=(3,3,1), + shards=(3,3,3), + 
serializer=codecs.BytesCodec(endian="little"), + compressors=[codecs.GzipCodec()], + data=np.arange(27, dtype="i2").reshape(3,3,3), +) + +# Group with spaces in the name +g = zarr.create_group(store, path="my group with spaces") +g.attrs["description"] = "A group with spaces in the name" + +@info "Zarr v3 fixtures generated at: $path_v3"
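+
+# Optional round-trip check (a sketch; assumes Zarr.jl can open these v3
+# fixtures once reading support lands):
+#   z = zopen(joinpath(path_v3, "1d.contiguous.gzip.i2"))
+#   @assert z[:] == Int16[1, 2, 3, 4]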