Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,31 @@
Changelog
=========

2.4.1 (2026-04-07)
2.4.1 (unpublished)
------------------
Contributor to this version: Ludwig Lierhammer (:user:`ludwiglierhammer`)

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

* `mdf_mapper` / `cdm_mapper`: add new project CMEMS for drifting iridium buoy data (:pull:`405`)
* `mdf_mapper` / `cdm_mapper`: new parameter "separator" to define filename separator while reading and writing files (:pull:`414`)

Breaking changes
^^^^^^^^^^^^^^^^

* `cdm_mapper`: update element names in MAROB CDM mapping tables (:pull:`393`)
* `cdm_mapper.util.mapping_functions`: change default MAROB datetime string format to "%Y-%m-%dT%H:%M:%S" (:pull:`393`)
* `cdm_mapper`: keep pd.NA value and do not convert them to strings (:pull:`414`)
* `test_data`: load parquet files instead of csv files (:issue:`410`, :pull:`414`)
* `mdf_mapper` / `cdm_mapper`: default file name extension is "pq" while reading and writing files (:pull:`414`)


Bug fixes
^^^^^^^^^

* `duplicates`: do not change data types when updating quality flags and history description (:pull:`408`)
* `mdf_reader`: decode data to "utf-8" to avoid misleading file encoding (:pull:`414`)

Internal changes
^^^^^^^^^^^^^^^^
Expand Down
16 changes: 13 additions & 3 deletions cdm_reader_mapper/cdm_mapper/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def _read_multiple_files(
prefix: str | None = None,
suffix: str | None = None,
extension: str | None = None,
separator: str | None = "-",
cdm_subset: str | list | None = None,
col_subset: str | list | None = None,
null_label: str = "null",
Expand All @@ -136,7 +137,9 @@ def _read_multiple_files(
suffix_pattern = f"*{suffix}"

# See if there's anything at all:
pattern = get_filename([prefix, suffix_pattern], path=inp_dir, extension=extension)
pattern = get_filename(
[prefix, suffix_pattern], path=inp_dir, extension=extension, separator=separator
)
files = glob.glob(pattern)

if len(files) == 0:
Expand All @@ -157,7 +160,9 @@ def _read_multiple_files(
_pattern = [prefix] + _pattern
if suffix:
_pattern = _pattern + [suffix_pattern]
pattern_ = get_filename(_pattern, path=inp_dir, extension=extension)
pattern_ = get_filename(
_pattern, path=inp_dir, extension=extension, separator=separator
)
paths_ = glob.glob(pattern_)
if len(paths_) != 1:
logger.warning(
Expand Down Expand Up @@ -193,6 +198,7 @@ def read_tables(
prefix: str | None = None,
suffix: str | None = None,
extension: str | None = None,
separator: str | None = "-",
cdm_subset: str | list | None = None,
col_subset: str | list | dict | None = None,
delimiter: str = "|",
Expand Down Expand Up @@ -221,7 +227,10 @@ def read_tables(
extension: str, optional
Extension of file name structure: ``<prefix>-<table>-*<suffix>.<extension>``.
Could de used if `source` is a valid directory path.
Default: psv
Default: "psv"
separator : str, optional
Separator to join the file name pattern components.
Default: "-"
cdm_subset: str or list, optional
Specifies a subset of tables or a single table.

Expand Down Expand Up @@ -307,6 +316,7 @@ def read_tables(
prefix=prefix,
suffix=suffix,
extension=extension,
separator=separator,
cdm_subset=cdm_subset,
col_subset=col_subset,
null_label=null_label,
Expand Down
10 changes: 9 additions & 1 deletion cdm_reader_mapper/cdm_mapper/utils/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,15 @@ def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series:
pd.Series
Series with data type representtions of elements.
"""
return data.astype(object).replace(null_label, pd.NA)

def _return_str(x, null_label):
if pd.isna(x):
return pd.NA
if x == null_label:
return pd.NA
return str(x)

return data.apply(lambda x: _return_str(x, null_label))


def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series:
Expand Down
8 changes: 7 additions & 1 deletion cdm_reader_mapper/cdm_mapper/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def write_tables(
suffix: str | None = None,
extension: str | None = None,
filename: str | dict | None = None,
separator: str | None = "-",
cdm_subset: str | list | None = None,
col_subset: str | list | dict | None = None,
delimiter: str = "|",
Expand Down Expand Up @@ -104,6 +105,8 @@ def write_tables(
Suffix of file name structure: ``<prefix>-<table>-*<suffix>.<extension>``.
extension: str, optional
Extension of file name structure: ``<prefix>-<table>-*<suffix>.<extension>``.
separator : str, optional
Separator to join the file name pattern components (default "-").
filename: str or dict, optional
Name of the output file name(s).
List one filename for each table name in ``data`` ({<table>:<filename>}).
Expand Down Expand Up @@ -190,7 +193,10 @@ def write_tables(
filename_ = filename.get(table)
if not filename_:
filename_ = get_filename(
[prefix, table, suffix], path=out_dir, extension=extension
[prefix, table, suffix],
path=out_dir,
extension=extension,
separator=separator,
)
filename_ = adjust_filename(filename_, table=table, extension=extension)
if len(Path(filename_).parts) == 1:
Expand Down
8 changes: 4 additions & 4 deletions cdm_reader_mapper/common/io_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
def get_filename(
pattern: Sequence[str],
path: str = ".",
extension: str = "psv",
separator: str = "-",
extension: str = "pq",
separator: str = "_",
) -> str:
"""
Construct a filename from a sequence of string components.
Expand All @@ -26,9 +26,9 @@ def get_filename(
Default is current directory `"."`.
extension : str, optional
File extension, with or without a leading dot
(e.g., `"psv"` or `".psv"`). Default is `"psv"`.
(e.g., `"pq"` or `".pq"`). Default is `"pq"`.
separator : str, optional
Separator to join the pattern components (default "-").
Separator to join the pattern components (default "_").

Returns
-------
Expand Down
5 changes: 2 additions & 3 deletions cdm_reader_mapper/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,8 @@ def _get_data_dict(self, data_file, data_model, source_ext="csv"):
drs = "/".join(data_model.split("_"))
data_dict = {
"source": f"{drs}/input/{data_model}_{data_file}.{source_ext}",
"mdf_data": f"{drs}/output/data_{data_model}_{data_file}.csv",
"mdf_mask": f"{drs}/output/mask_{data_model}_{data_file}.csv",
"mdf_info": f"{drs}/output/info_{data_model}_{data_file}.json",
"mdf_data": f"{drs}/output/data_{data_model}_{data_file}.pq",
"mdf_mask": f"{drs}/output/mask_{data_model}_{data_file}.pq",
"vadt": f"{drs}/validation/vadt_{data_model}_{data_file}.csv",
"vaid": f"{drs}/validation/vaid_{data_model}_{data_file}.csv",
}
Expand Down
9 changes: 8 additions & 1 deletion cdm_reader_mapper/mdf_reader/utils/filereader.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,13 @@ def _process_data(
data = remove_boolean_values(data, config.dtypes)
config = replace(config, columns=data.columns)

if config.encoding not in [None, "utf-8"]:
object_columns = data.select_dtypes(include=["object", "string"]).columns
for object_column in object_columns:
data[object_column] = (
data[object_column].str.encode(config.encoding).str.decode("utf-8")
)

return data, mask, config

@process_function()
Expand Down Expand Up @@ -370,7 +377,7 @@ def read(
columns=config.columns,
dtypes=config.dtypes,
parse_dates=config.parse_dates,
encoding=config.encoding,
encoding="utf-8",
mask=mask,
imodel=self.imodel,
)
2 changes: 1 addition & 1 deletion cdm_reader_mapper/mdf_reader/utils/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def convert_dtypes(dtypes) -> tuple[str]:
for key, value in dtypes.items():
if value == "datetime":
parse_dates.append(key)
dtypes[key] = "object"
dtypes[key] = "datetime64[ns]"
return dtypes, parse_dates


Expand Down
14 changes: 9 additions & 5 deletions cdm_reader_mapper/mdf_reader/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def write_data(
suffix: str | None = None,
extension: str = None,
filename: str | dict | None = None,
separator: str | None = "_",
col_subset: str | list[str] | tuple[str] | None = None,
delimiter: str = ",",
**kwargs,
Expand Down Expand Up @@ -88,6 +89,8 @@ def write_data(
extension: str, optional
Extension of file name structure: ``<prefix>-data-*<suffix>.<extension>``.
By default, extension depends on `data_format`.
separator : str, optional
Separator to join the file name pattern components (default "_").
filename: str or dict, optional
Name of the output file name(s).
List one filename for both ``data`` and ``mask`` ({"data":<filenameD>, "mask":<filenameM>}).
Expand Down Expand Up @@ -148,13 +151,13 @@ def write_data(
out_dir.mkdir(parents=True, exist_ok=True)

filename_data = get_filename(
[prefix, "data", suffix], path=out_dir, extension=extension
[prefix, "data", suffix], path=out_dir, extension=extension, separator=separator
)
filename_mask = get_filename(
[prefix, "mask", suffix], path=out_dir, extension=extension
[prefix, "mask", suffix], path=out_dir, extension=extension, separator=separator
)
filename_info = get_filename(
[prefix, "info", suffix], path=out_dir, extension="json"
[prefix, "info", suffix], path=out_dir, extension="json", separator=separator
)

for i, (data_df, mask_df) in enumerate(zip(data_list, mask_list)):
Expand Down Expand Up @@ -199,5 +202,6 @@ def write_data(
if not mask_df.empty:
getattr(mask_df, writer)(filename_mask, **write_kwargs)

with open(filename_info, "w") as fileObj:
json.dump(info, fileObj, indent=4)
if data_format == "csv":
with open(filename_info, "w") as fileObj:
json.dump(info, fileObj, indent=4)
43 changes: 14 additions & 29 deletions tests/test_cdm_mapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import ast

import numpy as np
import pandas as pd
import pytest
Expand All @@ -20,7 +22,6 @@
)

from cdm_reader_mapper.common import logging_hdlr
from cdm_reader_mapper.common.json_dict import open_json_file

from cdm_reader_mapper.cdm_mapper.properties import cdm_tables
from cdm_reader_mapper.cdm_mapper.reader import read_tables
Expand Down Expand Up @@ -78,34 +79,25 @@ def data_header_expected():
)


def _map_model_test_data(
data_model, encoding="utf-8", select=None, chunksize=None, **kwargs
):
def _map_model_test_data(data_model, encoding="utf-8", select=None, **kwargs):
source = test_data[f"test_{data_model}"]["mdf_data"]

mdf_info = test_data[f"test_{data_model}"]["mdf_info"]
df = pd.read_parquet(source)

if mdf_info is None:
dtypes = object
else:
info = open_json_file(mdf_info)
dtypes = info["dtypes"]
if "(" in df.columns[0]:

df = pd.read_csv(
source,
dtype=dtypes,
chunksize=chunksize,
encoding=encoding,
)
def to_tuple(x):
try:
val = ast.literal_eval(x)
if isinstance(val, tuple):
return val
except Exception:
return (x, "")

if chunksize is None and ":" in df.columns[0]:
df.columns = pd.MultiIndex.from_tuples(col.split(":") for col in df.columns)
df.columns = pd.MultiIndex.from_tuples([to_tuple(col) for col in df.columns])

result = map_model(df, data_model, **kwargs)

if chunksize:
result = result.read()

if not select:
select = cdm_tables

Expand Down Expand Up @@ -637,7 +629,7 @@ def test_map_model_test_data_basic(data_model):


def test_map_model_test_data_mixed():
_map_model_test_data("icoads_r300_mixed", encoding="cp1252")
_map_model_test_data("icoads_r300_mixed")


def test_map_model_test_data_select():
Expand All @@ -646,10 +638,3 @@ def test_map_model_test_data_select():
select=["header", "observations-sst"],
cdm_subset=["header", "observations-sst"],
)


def test_map_model_test_data_chunksize():
_map_model_test_data(
"icoads_r300_d714",
chunksize=2,
)
10 changes: 5 additions & 5 deletions tests/test_common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ def test_combine_dicts(tmp_path):
@pytest.mark.parametrize(
"pattern, extension, expected_filename",
[
(["a", "b"], "txt", "a-b.txt"),
(["a", "", "c"], "psv", "a-c.psv"),
(["a", "b"], "txt", "a_b.txt"),
(["a", "", "c"], "psv", "a_c.psv"),
(["x"], ".csv", "x.csv"),
([], "log", ""),
],
Expand Down Expand Up @@ -293,9 +293,9 @@ def test_get_filename_extension_normalization(tmp_path, extension, normalized):
@pytest.mark.parametrize(
"pattern, expected_name",
[
(["data", "2024"], "data-2024.psv"),
(["", "A", "B"], "A-B.psv"),
(["only"], "only.psv"),
(["data", "2024"], "data_2024.pq"),
(["", "A", "B"], "A_B.pq"),
(["only"], "only.pq"),
],
)
def test_get_filename_name_part(pattern, expected_name):
Expand Down
Loading
Loading