Skip to content

Commit b4253b7

Browse files
authored
Exclude metadata from ingestion (#106)
* Excluding original metadata from_bioimg path * Install openslide-tools failing tests * Test CI failing
1 parent 500ebab commit b4253b7

File tree

6 files changed

+104
-10
lines changed

6 files changed

+104
-10
lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ jobs:
3232
pytest-mock
3333
3434
- name: Install openslide for non-Win
35-
run: micromamba install openslide
35+
run: |
36+
sudo apt install openslide-tools
37+
micromamba install openslide
3638
if: matrix.os != 'windows-latest'
3739

3840
- name: Install openslide for Win
@@ -61,9 +63,10 @@ jobs:
6163
# default is C:, thus we create a test_temp folder for pytest's tmp_dir to run on D: as well
6264
if [ "$RUNNER_OS" == "Linux" ]; then
6365
pytest -v --cov=tiledb --cov-report=term-missing --durations=0 tests/ > coverage.txt
64-
cat coverage.txt
66+
exit_code=$?
6567
TEST_COVERAGE="$(grep '^TOTAL' coverage.txt | awk -v N=4 '{print $N}')"
6668
echo "COVERAGE=$TEST_COVERAGE" >> $GITHUB_OUTPUT
69+
exit $exit_code
6770
else
6871
mkdir test_temp
6972
pytest --basetemp=test_temp -v --cov=tiledb --cov-report=term-missing --durations=0 tests/

tests/__init__.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ def get_schema(x_size, y_size, c_size=3, compressor=tiledb.ZstdFilter(level=0)):
1414
dims = []
1515
x_tile = min(x_size, 1024)
1616
y_tile = min(y_size, 1024)
17+
# The WebP filter does not accept certain dtypes, so dimensions fall back to the default Zstd filter
18+
dim_compressor = tiledb.ZstdFilter(level=0)
19+
if not isinstance(compressor, tiledb.WebpFilter):
20+
dim_compressor = compressor
1721
if isinstance(compressor, tiledb.WebpFilter):
1822
x_size *= c_size
1923
x_tile *= c_size
@@ -30,10 +34,34 @@ def get_schema(x_size, y_size, c_size=3, compressor=tiledb.ZstdFilter(level=0)):
3034
lossless=compressor.lossless,
3135
)
3236
else:
33-
dims.append(tiledb.Dim("C", (0, c_size - 1), tile=c_size, dtype=np.uint32))
37+
dims.append(
38+
tiledb.Dim(
39+
"C",
40+
(0, c_size - 1),
41+
tile=c_size,
42+
dtype=np.uint32,
43+
filters=tiledb.FilterList([compressor]),
44+
)
45+
)
3446

35-
dims.append(tiledb.Dim("Y", (0, y_size - 1), tile=y_tile, dtype=np.uint32))
36-
dims.append(tiledb.Dim("X", (0, x_size - 1), tile=x_tile, dtype=np.uint32))
47+
dims.append(
48+
tiledb.Dim(
49+
"Y",
50+
(0, y_size - 1),
51+
tile=y_tile,
52+
dtype=np.uint32,
53+
filters=tiledb.FilterList([dim_compressor]),
54+
)
55+
)
56+
dims.append(
57+
tiledb.Dim(
58+
"X",
59+
(0, x_size - 1),
60+
tile=x_tile,
61+
dtype=np.uint32,
62+
filters=tiledb.FilterList([dim_compressor]),
63+
)
64+
)
3765

3866
return tiledb.ArraySchema(
3967
domain=tiledb.Domain(*dims),

tests/integration/converters/test_ome_tiff.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,43 @@ def test_ome_tiff_converter_group_metadata(tmp_path, filename):
9696
assert shape[level_axes.index("Y")] == level_height
9797

9898

99+
@pytest.mark.parametrize(
100+
"filename,num_series", [("CMU-1-Small-Region.ome.tiff", 3), ("UTM2GTIF.tiff", 1)]
101+
)
102+
@pytest.mark.parametrize("preserve_axes", [False, True])
103+
@pytest.mark.parametrize("chunked,max_workers", [(False, 0), (True, 0), (True, 4)])
104+
@pytest.mark.parametrize(
105+
"compressor",
106+
[
107+
tiledb.ZstdFilter(level=0),
108+
tiledb.WebpFilter(WebpInputFormat.WEBP_RGB, lossless=False),
109+
tiledb.WebpFilter(WebpInputFormat.WEBP_RGB, lossless=True),
110+
tiledb.WebpFilter(WebpInputFormat.WEBP_NONE, lossless=True),
111+
],
112+
)
113+
def test_ome_tiff_converter_exclude_original_metadata(
114+
tmp_path, filename, num_series, preserve_axes, chunked, max_workers, compressor
115+
):
116+
if isinstance(compressor, tiledb.WebpFilter) and filename == "UTM2GTIF.tiff":
117+
pytest.skip(f"WebPFilter cannot be applied to {filename}")
118+
119+
input_path = get_path(filename)
120+
tiledb_path = tmp_path / "to_tiledb"
121+
OMETiffConverter.to_tiledb(
122+
input_path,
123+
str(tiledb_path),
124+
preserve_axes=preserve_axes,
125+
chunked=chunked,
126+
max_workers=max_workers,
127+
compressor=compressor,
128+
log=False,
129+
exclude_metadata=True,
130+
)
131+
132+
with TileDBOpenSlide(str(tiledb_path)) as t:
133+
assert t.properties["original_metadata"] == "{}"
134+
135+
99136
@pytest.mark.parametrize(
100137
"filename,num_series", [("CMU-1-Small-Region.ome.tiff", 3), ("UTM2GTIF.tiff", 1)]
101138
)

tiledb/bioimg/converters/base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ def to_tiledb(
296296
preserve_axes: bool = False,
297297
chunked: bool = False,
298298
max_workers: int = 0,
299+
exclude_metadata: bool = False,
299300
compressor: Optional[Union[Mapping[int, Any], Any]] = None,
300301
log: Optional[Union[bool, logging.Logger]] = None,
301302
reader_kwargs: Optional[Mapping[str, Any]] = None,
@@ -317,6 +318,7 @@ def to_tiledb(
317318
original ones.
318319
:param max_workers: Maximum number of threads that can be used for conversion.
319320
Applicable only if chunked=True.
321+
:param exclude_metadata: If True, drop the original metadata of the images and exclude it from ingestion.
320322
:param compressor: TileDB compression filter mapping for each level
321323
:param log: verbose logging, defaults to None. Allows passing custom logging.Logger or boolean.
322324
If None or bool=False it initiates an INFO level logging. If bool=True then a logger is instantiated in
@@ -465,7 +467,8 @@ def to_tiledb(
465467
metadata["channels"] = {f"{ATTR_NAME}": metadata["channels"]}
466468
logger.debug(f'Metadata channels: {metadata["channels"]}')
467469

468-
original_metadata = reader.original_metadata
470+
if not exclude_metadata:
471+
original_metadata = reader.original_metadata
469472

470473
with rw_group:
471474
rw_group.w_group.meta.update(

tiledb/bioimg/helpers.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,19 @@ def get_schema(
9797

9898
dims = []
9999
assert len(dim_names) == len(dim_shape), (dim_names, dim_shape)
100+
# The WebP filter does not accept certain dtypes, so dimensions fall back to the default Zstd filter
101+
dim_compressor = tiledb.ZstdFilter(level=0)
102+
if not isinstance(compressor, tiledb.WebpFilter):
103+
dim_compressor = compressor
100104
for dim_name, dim_size in zip(dim_names, dim_shape):
101105
dim_tile = min(dim_size, max_tiles[dim_name])
102-
dim = tiledb.Dim(dim_name, (0, dim_size - 1), dim_tile, dtype=dim_dtype)
106+
dim = tiledb.Dim(
107+
dim_name,
108+
(0, dim_size - 1),
109+
dim_tile,
110+
dtype=dim_dtype,
111+
filters=[dim_compressor],
112+
)
103113
dims.append(dim)
104114
attr = tiledb.Attr(name=ATTR_NAME, dtype=attr_dtype, filters=[compressor])
105115
return tiledb.ArraySchema(domain=tiledb.Domain(*dims), attrs=[attr])

tiledb/bioimg/wrappers.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def from_bioimg(
1616
converter: Converters = Converters.OMETIFF,
1717
*,
1818
verbose: bool = False,
19+
exclude_metadata: bool = False,
1920
**kwargs: Any,
2021
) -> Type[ImageConverter]:
2122
"""
@@ -32,17 +33,29 @@ def from_bioimg(
3233
if converter is Converters.OMETIFF:
3334
logger.info("Converting OME-TIFF file")
3435
return OMETiffConverter.to_tiledb(
35-
source=src, output_path=dest, log=logger, **kwargs
36+
source=src,
37+
output_path=dest,
38+
log=logger,
39+
exclude_metadata=exclude_metadata,
40+
**kwargs,
3641
)
3742
elif converter is Converters.OMEZARR:
3843
logger.info("Converting OME-Zarr file")
3944
return OMEZarrConverter.to_tiledb(
40-
source=src, output_path=dest, log=logger, **kwargs
45+
source=src,
46+
output_path=dest,
47+
log=logger,
48+
exclude_metadata=exclude_metadata,
49+
**kwargs,
4150
)
4251
else:
4352
logger.info("Converting Openslide")
4453
return OpenSlideConverter.to_tiledb(
45-
source=src, output_path=dest, log=logger, **kwargs
54+
source=src,
55+
output_path=dest,
56+
log=logger,
57+
exclude_metadata=exclude_metadata,
58+
**kwargs,
4659
)
4760

4861

0 commit comments

Comments
 (0)