From 500d31542e6df984aec604cd59bb625a3210235e Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Tue, 14 Apr 2026 16:07:49 +0200 Subject: [PATCH 1/7] use parquet with MDF ouput data --- cdm_reader_mapper/cdm_mapper/reader.py | 16 +- .../cdm_mapper/utils/conversions.py | 2 +- cdm_reader_mapper/cdm_mapper/writer.py | 8 +- cdm_reader_mapper/common/getting_files.py | 2 +- cdm_reader_mapper/common/io_files.py | 8 +- cdm_reader_mapper/data/__init__.py | 5 +- .../mdf_reader/utils/utilities.py | 2 +- cdm_reader_mapper/mdf_reader/writer.py | 16 +- tests/test_cdm_mapper.py | 40 ++-- tests/test_common_utils.py | 10 +- tests/test_mdf_reader.py | 178 +++++++++--------- tests/test_mdf_writer.py | 34 +--- tests/test_reader_utilities.py | 2 +- tests/test_writers.py | 3 +- 14 files changed, 163 insertions(+), 163 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/reader.py b/cdm_reader_mapper/cdm_mapper/reader.py index e2dd5f97..b56189ac 100755 --- a/cdm_reader_mapper/cdm_mapper/reader.py +++ b/cdm_reader_mapper/cdm_mapper/reader.py @@ -122,6 +122,7 @@ def _read_multiple_files( prefix: str | None = None, suffix: str | None = None, extension: str | None = None, + separator: str | None = "-", cdm_subset: str | list | None = None, col_subset: str | list | None = None, null_label: str = "null", @@ -136,7 +137,9 @@ def _read_multiple_files( suffix_pattern = f"*{suffix}" # See if there's anything at all: - pattern = get_filename([prefix, suffix_pattern], path=inp_dir, extension=extension) + pattern = get_filename( + [prefix, suffix_pattern], path=inp_dir, extension=extension, separator=separator + ) files = glob.glob(pattern) if len(files) == 0: @@ -157,7 +160,9 @@ def _read_multiple_files( _pattern = [prefix] + _pattern if suffix: _pattern = _pattern + [suffix_pattern] - pattern_ = get_filename(_pattern, path=inp_dir, extension=extension) + pattern_ = get_filename( + _pattern, path=inp_dir, extension=extension, separator=separator + ) paths_ = glob.glob(pattern_) if len(paths_) != 1: logger.warning( @@ -193,6 +198,7 @@ def read_tables( prefix: str | None = None, suffix: str | None = None, extension: str | None = None, + separator: str | None = "-", cdm_subset: str | list | None = None, col_subset: str | list | dict | None = None, delimiter: str = "|", @@ -221,7 +227,10 @@ def read_tables( extension: str, optional Extension of file name structure: ``--*.``. Could de used if `source` is a valid directory path. - Default: psv + Default: "psv" + separator : str, optional + Separator to join the file name pattern components. + Default: "-" cdm_subset: str or list, optional Specifies a subset of tables or a single table. @@ -307,6 +316,7 @@ def read_tables( prefix=prefix, suffix=suffix, extension=extension, + separator=separator, cdm_subset=cdm_subset, col_subset=col_subset, null_label=null_label, diff --git a/cdm_reader_mapper/cdm_mapper/utils/conversions.py b/cdm_reader_mapper/cdm_mapper/utils/conversions.py index 1388cdc7..0221e0af 100755 --- a/cdm_reader_mapper/cdm_mapper/utils/conversions.py +++ b/cdm_reader_mapper/cdm_mapper/utils/conversions.py @@ -285,7 +285,7 @@ def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series: pd.Series Series with data type representtions of elements. """ - return data.astype(object).replace(null_label, pd.NA) + return data.astype(str).astype(object).replace(null_label, pd.NA) def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series: diff --git a/cdm_reader_mapper/cdm_mapper/writer.py b/cdm_reader_mapper/cdm_mapper/writer.py index d2fbf2ba..7f3215fb 100755 --- a/cdm_reader_mapper/cdm_mapper/writer.py +++ b/cdm_reader_mapper/cdm_mapper/writer.py @@ -77,6 +77,7 @@ def write_tables( suffix: str | None = None, extension: str | None = None, filename: str | dict | None = None, + separator: str | None = "-", cdm_subset: str | list | None = None, col_subset: str | list | dict | None = None, delimiter: str = "|", @@ -104,6 +105,8 @@ def write_tables( Suffix of file name structure: ``-
-*.``. extension: str, optional Extension of file name structure: ``-
-*.``. + separator : str, optional + Separator to join the file name pattern components (default "-"). filename: str or dict, optional Name of the output file name(s). List one filename for each table name in ``data`` ({
:}). @@ -190,7 +193,10 @@ def write_tables( filename_ = filename.get(table) if not filename_: filename_ = get_filename( - [prefix, table, suffix], path=out_dir, extension=extension + [prefix, table, suffix], + path=out_dir, + extension=extension, + separator=separator, ) filename_ = adjust_filename(filename_, table=table, extension=extension) if len(Path(filename_).parts) == 1: diff --git a/cdm_reader_mapper/common/getting_files.py b/cdm_reader_mapper/common/getting_files.py index d1659c69..0273dca9 100755 --- a/cdm_reader_mapper/common/getting_files.py +++ b/cdm_reader_mapper/common/getting_files.py @@ -137,7 +137,7 @@ def _get_file( def load_file( name: str | os.PathLike, github_url: str = "https://github.com/glamod/cdm-testdata", - branch: str = "main", + branch: str = "data_pq", # "main", cache: bool = True, cache_dir: str | Path = _default_cache_dir_, clear_cache: bool = False, diff --git a/cdm_reader_mapper/common/io_files.py b/cdm_reader_mapper/common/io_files.py index 7fbf1198..5f677e4b 100755 --- a/cdm_reader_mapper/common/io_files.py +++ b/cdm_reader_mapper/common/io_files.py @@ -9,8 +9,8 @@ def get_filename( pattern: Sequence[str], path: str = ".", - extension: str = "psv", - separator: str = "-", + extension: str = "pq", + separator: str = "_", ) -> str: """ Construct a filename from a sequence of string components. @@ -26,9 +26,9 @@ def get_filename( Default is current directory `"."`. extension : str, optional File extension, with or without a leading dot - (e.g., `"psv"` or `".psv"`). Default is `"psv"`. + (e.g., `"pq"` or `".pq"`). Default is `"pq"`. separator : str, optional - Separator to join the pattern components (default "-"). + Separator to join the pattern components (default "_"). Returns ------- diff --git a/cdm_reader_mapper/data/__init__.py b/cdm_reader_mapper/data/__init__.py index d8576df5..bf8a8280 100755 --- a/cdm_reader_mapper/data/__init__.py +++ b/cdm_reader_mapper/data/__init__.py @@ -243,9 +243,8 @@ def _get_data_dict(self, data_file, data_model, source_ext="csv"): drs = "/".join(data_model.split("_")) data_dict = { "source": f"{drs}/input/{data_model}_{data_file}.{source_ext}", - "mdf_data": f"{drs}/output/data_{data_model}_{data_file}.csv", - "mdf_mask": f"{drs}/output/mask_{data_model}_{data_file}.csv", - "mdf_info": f"{drs}/output/info_{data_model}_{data_file}.json", + "mdf_data": f"{drs}/output/data_{data_model}_{data_file}.pq", + "mdf_mask": f"{drs}/output/mask_{data_model}_{data_file}.pq", "vadt": f"{drs}/validation/vadt_{data_model}_{data_file}.csv", "vaid": f"{drs}/validation/vaid_{data_model}_{data_file}.csv", } diff --git a/cdm_reader_mapper/mdf_reader/utils/utilities.py b/cdm_reader_mapper/mdf_reader/utils/utilities.py index 0041cbff..2dfd82ad 100755 --- a/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -357,7 +357,7 @@ def convert_dtypes(dtypes) -> tuple[str]: for key, value in dtypes.items(): if value == "datetime": parse_dates.append(key) - dtypes[key] = "object" + dtypes[key] = "datetime64[ns]" return dtypes, parse_dates diff --git a/cdm_reader_mapper/mdf_reader/writer.py b/cdm_reader_mapper/mdf_reader/writer.py index aadc7358..a225d6c4 100755 --- a/cdm_reader_mapper/mdf_reader/writer.py +++ b/cdm_reader_mapper/mdf_reader/writer.py @@ -56,6 +56,7 @@ def write_data( suffix: str | None = None, extension: str = None, filename: str | dict | None = None, + separator: str | None = "_", col_subset: str | list[str] | tuple[str] | None = None, delimiter: str = ",", **kwargs, @@ -88,6 +89,8 @@ def write_data( extension: str, optional Extension of file name structure: ``-data-*.``. By default, extension depends on `data_format`. + separator : str, optional + Separator to join the file name pattern components (default "_"). filename: str or dict, optional Name of the output file name(s). List one filename for both ``data`` and ``mask`` ({"data":, "mask":}). @@ -148,15 +151,17 @@ def write_data( out_dir.mkdir(parents=True, exist_ok=True) filename_data = get_filename( - [prefix, "data", suffix], path=out_dir, extension=extension + [prefix, "data", suffix], path=out_dir, extension=extension, separator=separator ) filename_mask = get_filename( - [prefix, "mask", suffix], path=out_dir, extension=extension + [prefix, "mask", suffix], path=out_dir, extension=extension, separator=separator ) filename_info = get_filename( - [prefix, "info", suffix], path=out_dir, extension="json" + [prefix, "info", suffix], path=out_dir, extension="json", separator=separator ) + print(filename_data) + for i, (data_df, mask_df) in enumerate(zip(data_list, mask_list)): if col_subset is not None: data_df = data_df[col_subset] @@ -199,5 +204,6 @@ def write_data( if not mask_df.empty: getattr(mask_df, writer)(filename_mask, **write_kwargs) - with open(filename_info, "w") as fileObj: - json.dump(info, fileObj, indent=4) + if data_format == "csv": + with open(filename_info, "w") as fileObj: + json.dump(info, fileObj, indent=4) diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py index 41de5882..dfe8be83 100755 --- a/tests/test_cdm_mapper.py +++ b/tests/test_cdm_mapper.py @@ -20,7 +20,6 @@ ) from cdm_reader_mapper.common import logging_hdlr -from cdm_reader_mapper.common.json_dict import open_json_file from cdm_reader_mapper.cdm_mapper.properties import cdm_tables from cdm_reader_mapper.cdm_mapper.reader import read_tables @@ -83,23 +82,26 @@ def _map_model_test_data( ): source = test_data[f"test_{data_model}"]["mdf_data"] - mdf_info = test_data[f"test_{data_model}"]["mdf_info"] - - if mdf_info is None: - dtypes = object - else: - info = open_json_file(mdf_info) - dtypes = info["dtypes"] - - df = pd.read_csv( + df = pd.read_parquet( source, - dtype=dtypes, - chunksize=chunksize, - encoding=encoding, + # dtype=dtypes, + # chunksize=chunksize, + # encoding=encoding, ) - if chunksize is None and ":" in df.columns[0]: - df.columns = pd.MultiIndex.from_tuples(col.split(":") for col in df.columns) + import ast + + if chunksize is None and "(" in df.columns[0]: + + def to_tuple(x): + try: + val = ast.literal_eval(x) + if isinstance(val, tuple): + return val + except Exception: + return (x, "") + + df.columns = pd.MultiIndex.from_tuples([to_tuple(col) for col in df.columns]) result = map_model(df, data_model, **kwargs) @@ -617,18 +619,18 @@ def test_map_model_pub47(): "icoads_r300_d707", "icoads_r302_d794", "icoads_r300_d704", - "icoads_r300_d721", + "icoads_r300_d721", # f "icoads_r300_d730", "icoads_r300_d781", - "icoads_r300_d703", + "icoads_r300_d703", # f "icoads_r300_d201", "icoads_r300_d892", "icoads_r300_d700", "icoads_r302_d792", "icoads_r302_d992", - "craid", + "craid", # f "gdac", - "marob", + "marob", # f "cmems", ], ) diff --git a/tests/test_common_utils.py b/tests/test_common_utils.py index b7d13f3e..2d2735e9 100755 --- a/tests/test_common_utils.py +++ b/tests/test_common_utils.py @@ -250,8 +250,8 @@ def test_combine_dicts(tmp_path): @pytest.mark.parametrize( "pattern, extension, expected_filename", [ - (["a", "b"], "txt", "a-b.txt"), - (["a", "", "c"], "psv", "a-c.psv"), + (["a", "b"], "txt", "a_b.txt"), + (["a", "", "c"], "psv", "a_c.psv"), (["x"], ".csv", "x.csv"), ([], "log", ""), ], @@ -293,9 +293,9 @@ def test_get_filename_extension_normalization(tmp_path, extension, normalized): @pytest.mark.parametrize( "pattern, expected_name", [ - (["data", "2024"], "data-2024.psv"), - (["", "A", "B"], "A-B.psv"), - (["only"], "only.psv"), + (["data", "2024"], "data_2024.pq"), + (["", "A", "B"], "A_B.pq"), + (["only"], "only.pq"), ], ) def test_get_filename_name_part(pattern, expected_name): diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index 38d3fbac..0af980d9 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -15,7 +15,8 @@ validate_read_mdf_args, ) from cdm_reader_mapper.mdf_reader.utils.filereader import _apply_multiindex -from cdm_reader_mapper.common.iterators import ParquetStreamReader + +# from cdm_reader_mapper.common.iterators import ParquetStreamReader from cdm_reader_mapper.mdf_reader.utils.utilities import ( read_csv, read_parquet, @@ -44,11 +45,8 @@ def _read_mdf_test_data(data_model, select=None, drop=None, drop_idx=None, **kwa data = test_data[f"test_{data_model}"]["mdf_data"] mask = test_data[f"test_{data_model}"]["mdf_mask"] - info = test_data[f"test_{data_model}"]["mdf_info"] - expected = read_data( - data_file=data, mask_file=mask, info_file=info, data_format="csv" - ) + expected = read_data(data_file=data, mask_file=mask) if not isinstance(result.data, pd.DataFrame): result.data = result.data.read() @@ -181,8 +179,8 @@ def test_read_mdf_test_data_select(data_model, kwargs, select): ("icoads_r300_d714", {"excludes": ["c98"]}, ["c98"]), ("icoads_r300_d714", {"excludes": "c98"}, ["c98"]), ("icoads_r300_d714", {"excludes": ["c5", "c98"]}, ["c5", "c98"]), - ("icoads_r300_mixed", {"excludes": ["c99"], "encoding": "cp1252"}, ["c99"]), - ("icoads_r300_mixed", {"excludes": "c99", "encoding": "cp1252"}, ["c99"]), + # ("icoads_r300_mixed", {"excludes": ["c99"], "encoding": "cp1252"}, ["c99"]), + # ("icoads_r300_mixed", {"excludes": "c99", "encoding": "cp1252"}, ["c99"]), ( "craid", {"excludes": ["drifter_measurements", "drifter_history"]}, @@ -217,8 +215,7 @@ def test_read_data_basic(): data_model = "icoads_r300_d721" data = test_data[f"test_{data_model}"]["mdf_data"] mask = test_data[f"test_{data_model}"]["mdf_mask"] - info = test_data[f"test_{data_model}"]["mdf_info"] - db = read_data(data, mask, info, data_format="csv") + db = read_data(data, mask) assert isinstance(db, DataBundle) @@ -238,9 +235,8 @@ def test_read_data_basic(): assert isinstance(db.mask, pd.DataFrame) assert isinstance(db.columns, pd.MultiIndex) assert isinstance(db.dtypes, pd.Series) - assert isinstance(db.parse_dates, list) - assert isinstance(db.encoding, str) - assert db.encoding == "cp1252" + assert db.parse_dates is False + assert db.encoding is None assert db.imodel is None assert isinstance(db.mode, str) assert db.mode == "data" @@ -252,8 +248,7 @@ def test_read_data_basic(): def test_read_data_no_mask(): data_model = "icoads_r300_d721" data = test_data[f"test_{data_model}"]["mdf_data"] - info = test_data[f"test_{data_model}"]["mdf_info"] - db = read_data(data_file=data, info_file=info, data_format="csv") + db = read_data(data_file=data) assert isinstance(db, DataBundle) @@ -273,9 +268,8 @@ def test_read_data_no_mask(): assert isinstance(db.mask, pd.DataFrame) assert isinstance(db.columns, pd.MultiIndex) assert isinstance(db.dtypes, pd.Series) - assert isinstance(db.parse_dates, list) - assert isinstance(db.encoding, str) - assert db.encoding == "cp1252" + assert db.parse_dates is False + assert db.encoding is None assert db.imodel is None assert isinstance(db.mode, str) assert db.mode == "data" @@ -288,7 +282,7 @@ def test_read_data_no_info(): data_model = "icoads_r300_d721" data = test_data[f"test_{data_model}"]["mdf_data"] - db = read_data(data_file=data, data_format="csv") + db = read_data(data_file=data) assert isinstance(db, DataBundle) @@ -321,8 +315,7 @@ def test_read_data_no_info(): def test_read_data_col_subset(): data_model = "icoads_r300_d721" data = test_data[f"test_{data_model}"]["mdf_data"] - info = test_data[f"test_{data_model}"]["mdf_info"] - db = read_data(data_file=data, info_file=info, data_format="csv", col_subset="core") + db = read_data(data_file=data, col_subset="core") assert isinstance(db, DataBundle) @@ -342,9 +335,8 @@ def test_read_data_col_subset(): assert isinstance(db.mask, pd.DataFrame) assert isinstance(db.columns, pd.Index) assert isinstance(db.dtypes, pd.Series) - assert isinstance(db.parse_dates, list) - assert isinstance(db.encoding, str) - assert db.encoding == "cp1252" + assert db.parse_dates is False + assert db.encoding is None assert db.imodel is None assert isinstance(db.mode, str) assert db.mode == "data" @@ -353,76 +345,76 @@ def test_read_data_col_subset(): assert db.size == 240 -def test_read_data_encoding(): - data_model = "icoads_r300_d721" - data = test_data[f"test_{data_model}"]["mdf_data"] - db = read_data(data_file=data, data_format="csv", encoding="cp1252") - - assert isinstance(db, DataBundle) - - for attr in [ - "data", - "mask", - "columns", - "dtypes", - "parse_dates", - "encoding", - "imodel", - "mode", - ]: - assert hasattr(db, attr) - - assert isinstance(db.data, pd.DataFrame) - assert isinstance(db.mask, pd.DataFrame) - assert isinstance(db.columns, pd.Index) - assert isinstance(db.dtypes, pd.Series) - assert db.parse_dates is False - assert isinstance(db.encoding, str) - assert db.encoding == "cp1252" - assert db.imodel is None - assert isinstance(db.mode, str) - assert db.mode == "data" - assert len(db) == 5 - assert db.shape == (5, 341) - assert db.size == 1705 - - -def test_read_data_chunksize(): - data_model = "icoads_r300_d721" - data = test_data[f"test_{data_model}"]["mdf_data"] - mask = test_data[f"test_{data_model}"]["mdf_mask"] - info = test_data[f"test_{data_model}"]["mdf_info"] - db = read_data( - data_file=data, mask_file=mask, info_file=info, data_format="csv", chunksize=3 - ) - - assert isinstance(db, DataBundle) - - for attr in [ - "data", - "mask", - "columns", - "dtypes", - "parse_dates", - "encoding", - "imodel", - "mode", - ]: - assert hasattr(db, attr) - - assert isinstance(db.data, ParquetStreamReader) - assert isinstance(db.mask, ParquetStreamReader) - assert isinstance(db.columns, pd.MultiIndex) - assert isinstance(db.dtypes, pd.Series) - assert db.parse_dates == [] - assert isinstance(db.encoding, str) - assert db.encoding == "cp1252" - assert db.imodel is None - assert isinstance(db.mode, str) - assert db.mode == "data" - assert len(db) == 5 - assert db.shape == (5, 341) - assert db.size == 1705 +# def test_read_data_encoding(): +# data_model = "icoads_r300_d721" +# data = test_data[f"test_{data_model}"]["mdf_data"] +# db = read_data(data_file=data, data_format="csv", encoding="cp1252") + +# assert isinstance(db, DataBundle) + +# for attr in [ +# "data", +# "mask", +# "columns", +# "dtypes", +# "parse_dates", +# "encoding", +# "imodel", +# "mode", +# ]: +# assert hasattr(db, attr) + +# assert isinstance(db.data, pd.DataFrame) +# assert isinstance(db.mask, pd.DataFrame) +# assert isinstance(db.columns, pd.Index) +# assert isinstance(db.dtypes, pd.Series) +# assert db.parse_dates is False +# assert isinstance(db.encoding, str) +# assert db.encoding == "cp1252" +# assert db.imodel is None +# assert isinstance(db.mode, str) +# assert db.mode == "data" +# assert len(db) == 5 +# assert db.shape == (5, 341) +# assert db.size == 1705 + + +# def test_read_data_chunksize(): +# data_model = "icoads_r300_d721" +# data = test_data[f"test_{data_model}"]["mdf_data"] +# mask = test_data[f"test_{data_model}"]["mdf_mask"] +# info = test_data[f"test_{data_model}"]["mdf_info"] +# db = read_data( +# data_file=data, mask_file=mask, info_file=info, data_format="csv", chunksize=3 +# ) + +# assert isinstance(db, DataBundle) + +# for attr in [ +# "data", +# "mask", +# "columns", +# "dtypes", +# "parse_dates", +# "encoding", +# "imodel", +# "mode", +# ]: +# assert hasattr(db, attr) + +# assert isinstance(db.data, ParquetStreamReader) +# assert isinstance(db.mask, ParquetStreamReader) +# assert isinstance(db.columns, pd.MultiIndex) +# assert isinstance(db.dtypes, pd.Series) +# assert db.parse_dates == [] +# assert isinstance(db.encoding, str) +# assert db.encoding == "cp1252" +# assert db.imodel is None +# assert isinstance(db.mode, str) +# assert db.mode == "data" +# assert len(db) == 5 +# assert db.shape == (5, 341) +# assert db.size == 1705 def test_validate_read_mdf_args_pass(tmp_path): diff --git a/tests/test_mdf_writer.py b/tests/test_mdf_writer.py index 79793499..0ef3056f 100755 --- a/tests/test_mdf_writer.py +++ b/tests/test_mdf_writer.py @@ -110,9 +110,9 @@ def test_write_data_csv(tmp_path, example_data, example_mask): **info, ) - data_file = tmp_path / "test_write-data-basic.csv" - mask_file = tmp_path / "test_write-mask-basic.csv" - info_file = tmp_path / "test_write-info-basic.json" + data_file = tmp_path / "test_write_data_basic.csv" + mask_file = tmp_path / "test_write_mask_basic.csv" + info_file = tmp_path / "test_write_info_basic.json" assert data_file.is_file() assert mask_file.is_file() @@ -149,9 +149,9 @@ def test_write_data_col_subset(tmp_path, example_data, example_mask): **info, ) - data_file = tmp_path / "test_write-data-subset.csv" - mask_file = tmp_path / "test_write-mask-subset.csv" - info_file = tmp_path / "test_write-info-subset.json" + data_file = tmp_path / "test_write_data_subset.csv" + mask_file = tmp_path / "test_write_mask_subset.csv" + info_file = tmp_path / "test_write_info_subset.json" assert data_file.is_file() assert mask_file.is_file() @@ -186,18 +186,11 @@ def test_write_data_parquet(tmp_path, example_data, example_mask): **info, ) - data_file = tmp_path / "test_write-data-basic.parquet" - mask_file = tmp_path / "test_write-mask-basic.parquet" - info_file = tmp_path / "test_write-info-basic.json" + data_file = tmp_path / "test_write_data_basic.parquet" + mask_file = tmp_path / "test_write_mask_basic.parquet" assert data_file.is_file() assert mask_file.is_file() - assert info_file.is_file() - - with open(info_file) as read_file: - info_res = json.load(read_file) - - assert info_res == info data_res = pd.read_parquet(data_file) assert_frame_equal(example_data, data_res) @@ -223,18 +216,11 @@ def test_write_data_feather(tmp_path, example_data, example_mask): **info, ) - data_file = tmp_path / "test_write-data-basic.feather" - mask_file = tmp_path / "test_write-mask-basic.feather" - info_file = tmp_path / "test_write-info-basic.json" + data_file = tmp_path / "test_write_data_basic.feather" + mask_file = tmp_path / "test_write_mask_basic.feather" assert data_file.is_file() assert mask_file.is_file() - assert info_file.is_file() - - with open(info_file) as read_file: - info_res = json.load(read_file) - - assert info_res == info data_res = pd.read_feather(data_file) assert_frame_equal(example_data, data_res) diff --git a/tests/test_reader_utilities.py b/tests/test_reader_utilities.py index 0cd060fd..9473df70 100755 --- a/tests/test_reader_utilities.py +++ b/tests/test_reader_utilities.py @@ -193,7 +193,7 @@ def test_read_csv_with_col_subset(tmp_csv_file): def test_convert_dtypes_basic(): dtypes = {"A": "int", "B": "datetime", "C": "float"} updated, dates = convert_dtypes(dtypes) - assert updated["B"] == "object" + assert updated["B"] == "datetime64[ns]" assert dates == ["B"] diff --git a/tests/test_writers.py b/tests/test_writers.py index d3908d09..4b54a280 100755 --- a/tests/test_writers.py +++ b/tests/test_writers.py @@ -32,9 +32,8 @@ def db_data(): pattern = f"test_{imodel}" data_file = test_data[pattern]["mdf_data"] - info_file = test_data[pattern]["mdf_info"] - db = read(data_file, info_file=info_file, mode="data", data_format="csv") + db = read(data_file, mode="data") db.imodel = imodel return db From 751b94ca923531ede82bd53cdea1d6adc6c9650f Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 16 Apr 2026 09:25:14 +0200 Subject: [PATCH 2/7] attempt to encode cp1252 --- .../mdf_reader/utils/filereader.py | 15 ++++++++ tests/test_mdf_reader.py | 38 +++++++++---------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py index bd9788ba..706a4326 100755 --- a/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -365,6 +365,21 @@ def read( data, mask, config = result + if config.encoding not in [None, "utf-8"]: + object_columns = data.select_dtypes(include=["object", "string"]).columns + for object_column in object_columns: + # print(data[object_column]) + # print(config.encoding) + data[object_column] = ( + data[object_column].str.encode(config.encoding).str.decode("utf-8") + ) + # print(data[object_column]) + # print('---------------------------------------------') + # if object_column == ("c99_header","ship_name"): + # exit() + + # exit() + return DataBundle( data=data, columns=config.columns, diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index 0af980d9..e03ff94a 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -78,26 +78,26 @@ def _read_mdf_test_data(data_model, select=None, drop=None, drop_idx=None, **kwa @pytest.mark.parametrize( "data_model", [ - "icoads_r300_d714", - "icoads_r300_d701", - "icoads_r300_d706", - "icoads_r300_d705", - "icoads_r300_d702", - "icoads_r300_d707", - "icoads_r302_d794", - "icoads_r300_d704", + # "icoads_r300_d714", + # "icoads_r300_d701", + # "icoads_r300_d706", + # "icoads_r300_d705", + # "icoads_r300_d702", + # "icoads_r300_d707", + # "icoads_r302_d794", + # "icoads_r300_d704", "icoads_r300_d721", - "icoads_r300_d730", - "icoads_r300_d781", - "icoads_r300_d703", - "icoads_r300_d201", - "icoads_r300_d892", - "icoads_r300_d700", - "icoads_r302_d792", - "icoads_r302_d992", - "craid", - "gdac", - "cmems", + # "icoads_r300_d730", + # "icoads_r300_d781", + # "icoads_r300_d703", + # "icoads_r300_d201", + # "icoads_r300_d892", + # "icoads_r300_d700", + # "icoads_r302_d792", + # "icoads_r302_d992", + # "craid", + # "gdac", + # "cmems", ], ) def test_read_mdf_test_data_basic(data_model): From 84ee3db128a6a77cb5b8285b38b75fa19c5cdb31 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 16 Apr 2026 10:15:56 +0200 Subject: [PATCH 3/7] completed to encode cp1252 --- .../mdf_reader/utils/filereader.py | 24 +-- tests/test_mdf_reader.py | 144 +++++++----------- 2 files changed, 60 insertions(+), 108 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py index 706a4326..92d92dab 100755 --- a/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -216,6 +216,13 @@ def _process_data( data = remove_boolean_values(data, config.dtypes) config = replace(config, columns=data.columns) + if config.encoding not in [None, "utf-8"]: + object_columns = data.select_dtypes(include=["object", "string"]).columns + for object_column in object_columns: + data[object_column] = ( + data[object_column].str.encode(config.encoding).str.decode("utf-8") + ) + return data, mask, config @process_function() @@ -365,27 +372,12 @@ def read( data, mask, config = result - if config.encoding not in [None, "utf-8"]: - object_columns = data.select_dtypes(include=["object", "string"]).columns - for object_column in object_columns: - # print(data[object_column]) - # print(config.encoding) - data[object_column] = ( - data[object_column].str.encode(config.encoding).str.decode("utf-8") - ) - # print(data[object_column]) - # print('---------------------------------------------') - # if object_column == ("c99_header","ship_name"): - # exit() - - # exit() - return DataBundle( data=data, columns=config.columns, dtypes=config.dtypes, parse_dates=config.parse_dates, - encoding=config.encoding, + encoding="utf-8", mask=mask, imodel=self.imodel, ) diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index e03ff94a..62386732 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -16,7 +16,6 @@ ) from cdm_reader_mapper.mdf_reader.utils.filereader import _apply_multiindex -# from cdm_reader_mapper.common.iterators import ParquetStreamReader from cdm_reader_mapper.mdf_reader.utils.utilities import ( read_csv, read_parquet, @@ -78,26 +77,26 @@ def _read_mdf_test_data(data_model, select=None, drop=None, drop_idx=None, **kwa @pytest.mark.parametrize( "data_model", [ - # "icoads_r300_d714", - # "icoads_r300_d701", - # "icoads_r300_d706", - # "icoads_r300_d705", - # "icoads_r300_d702", - # "icoads_r300_d707", - # "icoads_r302_d794", - # "icoads_r300_d704", + "icoads_r300_d714", + "icoads_r300_d701", + "icoads_r300_d706", + "icoads_r300_d705", + "icoads_r300_d702", + "icoads_r300_d707", + "icoads_r302_d794", + "icoads_r300_d704", "icoads_r300_d721", - # "icoads_r300_d730", - # "icoads_r300_d781", - # "icoads_r300_d703", - # "icoads_r300_d201", - # "icoads_r300_d892", - # "icoads_r300_d700", - # "icoads_r302_d792", - # "icoads_r302_d992", - # "craid", - # "gdac", - # "cmems", + "icoads_r300_d730", + "icoads_r300_d781", + "icoads_r300_d703", + "icoads_r300_d201", + "icoads_r300_d892", + "icoads_r300_d700", + "icoads_r302_d792", + "icoads_r302_d992", + "craid", + "gdac", + "cmems", ], ) def test_read_mdf_test_data_basic(data_model): @@ -179,8 +178,8 @@ def test_read_mdf_test_data_select(data_model, kwargs, select): ("icoads_r300_d714", {"excludes": ["c98"]}, ["c98"]), ("icoads_r300_d714", {"excludes": "c98"}, ["c98"]), ("icoads_r300_d714", {"excludes": ["c5", "c98"]}, ["c5", "c98"]), - # ("icoads_r300_mixed", {"excludes": ["c99"], "encoding": "cp1252"}, ["c99"]), - # ("icoads_r300_mixed", {"excludes": "c99", "encoding": "cp1252"}, ["c99"]), + ("icoads_r300_mixed", {"excludes": ["c99"], "encoding": "cp1252"}, ["c99"]), + ("icoads_r300_mixed", {"excludes": "c99", "encoding": "cp1252"}, ["c99"]), ( "craid", {"excludes": ["drifter_measurements", "drifter_history"]}, @@ -345,76 +344,37 @@ def test_read_data_col_subset(): assert db.size == 240 -# def test_read_data_encoding(): -# data_model = "icoads_r300_d721" -# data = test_data[f"test_{data_model}"]["mdf_data"] -# db = read_data(data_file=data, data_format="csv", encoding="cp1252") - -# assert isinstance(db, DataBundle) - -# for attr in [ -# "data", -# "mask", -# "columns", -# "dtypes", -# "parse_dates", -# "encoding", -# "imodel", -# "mode", -# ]: -# assert hasattr(db, attr) - -# assert isinstance(db.data, pd.DataFrame) -# assert isinstance(db.mask, pd.DataFrame) -# assert isinstance(db.columns, pd.Index) -# assert isinstance(db.dtypes, pd.Series) -# assert db.parse_dates is False -# assert isinstance(db.encoding, str) -# assert db.encoding == "cp1252" -# assert db.imodel is None -# assert isinstance(db.mode, str) -# assert db.mode == "data" -# assert len(db) == 5 -# assert db.shape == (5, 341) -# assert db.size == 1705 - - -# def test_read_data_chunksize(): -# data_model = "icoads_r300_d721" -# data = test_data[f"test_{data_model}"]["mdf_data"] -# mask = test_data[f"test_{data_model}"]["mdf_mask"] -# info = test_data[f"test_{data_model}"]["mdf_info"] -# db = read_data( -# data_file=data, mask_file=mask, info_file=info, data_format="csv", chunksize=3 -# ) - -# assert isinstance(db, DataBundle) - -# for attr in [ -# "data", -# "mask", -# "columns", -# "dtypes", -# "parse_dates", -# "encoding", -# "imodel", -# "mode", -# ]: -# assert hasattr(db, attr) - -# assert isinstance(db.data, ParquetStreamReader) -# assert isinstance(db.mask, ParquetStreamReader) -# assert isinstance(db.columns, pd.MultiIndex) -# assert isinstance(db.dtypes, pd.Series) -# assert db.parse_dates == [] -# assert isinstance(db.encoding, str) -# assert db.encoding == "cp1252" -# assert db.imodel is None -# assert isinstance(db.mode, str) -# assert db.mode == "data" -# assert len(db) == 5 -# assert db.shape == (5, 341) -# assert db.size == 1705 +def test_read_data_encoded(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + db = read_data(data_file=data) + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.Index) + assert isinstance(db.dtypes, pd.Series) + assert db.parse_dates is False + assert db.encoding is None + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 def test_validate_read_mdf_args_pass(tmp_path): From e36d81c026c7ac11f2f15a5cc547e0a86c309702 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 16 Apr 2026 11:57:59 +0200 Subject: [PATCH 4/7] keep pd.NA values --- .../cdm_mapper/utils/conversions.py | 10 ++++- tests/test_cdm_mapper.py | 37 +++++-------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/cdm_reader_mapper/cdm_mapper/utils/conversions.py b/cdm_reader_mapper/cdm_mapper/utils/conversions.py index 0221e0af..46b1cbe4 100755 --- a/cdm_reader_mapper/cdm_mapper/utils/conversions.py +++ b/cdm_reader_mapper/cdm_mapper/utils/conversions.py @@ -285,7 +285,15 @@ def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series: pd.Series Series with data type representtions of elements. """ - return data.astype(str).astype(object).replace(null_label, pd.NA) + + def _return_str(x, null_label): + if pd.isna(x): + return pd.NA + if x == null_label: + return pd.NA + return str(x) + + return data.apply(lambda x: _return_str(x, null_label)) def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series: diff --git a/tests/test_cdm_mapper.py b/tests/test_cdm_mapper.py index dfe8be83..18f1bdd5 100755 --- a/tests/test_cdm_mapper.py +++ b/tests/test_cdm_mapper.py @@ -1,5 +1,7 @@ from __future__ import annotations +import ast + import numpy as np import pandas as pd import pytest @@ -77,21 +79,12 @@ def data_header_expected(): ) -def _map_model_test_data( - data_model, encoding="utf-8", select=None, chunksize=None, **kwargs -): +def _map_model_test_data(data_model, encoding="utf-8", select=None, **kwargs): source = test_data[f"test_{data_model}"]["mdf_data"] - df = pd.read_parquet( - source, - # dtype=dtypes, - # chunksize=chunksize, - # encoding=encoding, - ) + df = pd.read_parquet(source) - import ast - - if chunksize is None and "(" in df.columns[0]: + if "(" in df.columns[0]: def to_tuple(x): try: @@ -105,9 +98,6 @@ def to_tuple(x): result = map_model(df, data_model, **kwargs) - if chunksize: - result = result.read() - if not select: select = cdm_tables @@ -619,18 +609,18 @@ def test_map_model_pub47(): "icoads_r300_d707", "icoads_r302_d794", "icoads_r300_d704", - "icoads_r300_d721", # f + "icoads_r300_d721", "icoads_r300_d730", "icoads_r300_d781", - "icoads_r300_d703", # f + "icoads_r300_d703", "icoads_r300_d201", "icoads_r300_d892", "icoads_r300_d700", "icoads_r302_d792", "icoads_r302_d992", - "craid", # f + "craid", "gdac", - "marob", # f + "marob", "cmems", ], ) @@ -639,7 +629,7 @@ def test_map_model_test_data_basic(data_model): def test_map_model_test_data_mixed(): - _map_model_test_data("icoads_r300_mixed", encoding="cp1252") + _map_model_test_data("icoads_r300_mixed") def test_map_model_test_data_select(): @@ -648,10 +638,3 @@ def test_map_model_test_data_select(): select=["header", "observations-sst"], cdm_subset=["header", "observations-sst"], ) - - -def test_map_model_test_data_chunksize(): - _map_model_test_data( - "icoads_r300_d714", - chunksize=2, - ) From 957e14fc6c17ce8a2378feac5e1f0f73c0045a1c Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 16 Apr 2026 12:04:07 +0200 Subject: [PATCH 5/7] remove print statement --- cdm_reader_mapper/mdf_reader/writer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cdm_reader_mapper/mdf_reader/writer.py b/cdm_reader_mapper/mdf_reader/writer.py index a225d6c4..880955ec 100755 --- a/cdm_reader_mapper/mdf_reader/writer.py +++ b/cdm_reader_mapper/mdf_reader/writer.py @@ -160,8 +160,6 @@ def write_data( [prefix, "info", suffix], path=out_dir, extension="json", separator=separator ) - print(filename_data) - for i, (data_df, mask_df) in enumerate(zip(data_list, mask_list)): if col_subset is not None: data_df = data_df[col_subset] From b02c8c7e4f6e1a675996c8968ee2ecedf3cd1e53 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 16 Apr 2026 12:13:44 +0200 Subject: [PATCH 6/7] resuse main test data branch --- cdm_reader_mapper/common/getting_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdm_reader_mapper/common/getting_files.py b/cdm_reader_mapper/common/getting_files.py index 0273dca9..d1659c69 100755 --- a/cdm_reader_mapper/common/getting_files.py +++ b/cdm_reader_mapper/common/getting_files.py @@ -137,7 +137,7 @@ def _get_file( def load_file( name: str | os.PathLike, github_url: str = "https://github.com/glamod/cdm-testdata", - branch: str = "data_pq", # "main", + branch: str = "main", cache: bool = True, cache_dir: str | Path = _default_cache_dir_, clear_cache: bool = False, From ee43548768fd17f974aa9bf88e28481f4a0ef781 Mon Sep 17 00:00:00 2001 From: Ludwig Lierhammer Date: Thu, 16 Apr 2026 12:14:00 +0200 Subject: [PATCH 7/7] update CHANGELOG --- CHANGES.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 2cf09e66..17374d3d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,7 +3,7 @@ Changelog ========= -2.4.1 (2026-04-07) +2.4.1 (unpublished) ------------------ Contributor to this version: Ludwig Lierhammer (:user:`ludwiglierhammer`) @@ -11,18 +11,23 @@ New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * `mdf_mapper` / `cdm_mapper`: add new project CMEMS for drifting iridium buoy data (:pull:`405`) +* `mdf_mapper` / `cdm_mapper`: new parameter "separator" to define filename separator while reading and writing files (:pull:`414`) Breaking changes ^^^^^^^^^^^^^^^^ * `cdm_mapper`: update element names in MAROB CDM mapping tables (:pull:`393`) * `cdm_mapper.util.mapping_functions`: change default MAROB datetime string format to "%Y-%m-%dT%H:%M:%S" (:pull:`393`) +* `cdm_mapper`: keep pd.NA value and do not convert them to strings (:pull:`414`) +* `test_data`: load parquet files instead of csv files (:issue:`410`, :pull:`414`) +* `mdf_mapper` / `cdm_mapper`: default file name extension is "pq" while reading and writing files (:pull:`414`) Bug fixes ^^^^^^^^^ * `duplicates`: do not change data types when updating quality flags and history description (:pull:`408`) +* `mdf_reader`: decode data to "utf-8" to avoid misleading file encoding (:pull:`414`) Internal changes ^^^^^^^^^^^^^^^^