diff --git a/src/xhydro/pygmet/make_toml_config_pygmet.py b/src/xhydro/pygmet/make_toml_config_pygmet.py
new file mode 100644
index 00000000..0db06f83
--- /dev/null
+++ b/src/xhydro/pygmet/make_toml_config_pygmet.py
@@ -0,0 +1,256 @@
+"""
+Hard-coded TOML template writer.
+
+This module writes a new TOML config file based on a fixed template, replacing only
+the fields marked with "# MAKE_DYNAMIC" in the original template.
+"""
+
+from __future__ import annotations
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any
+
+
+@dataclass(frozen=True)
+class RawToml:
+    """Inject TOML verbatim (advanced use)."""
+
+    text: str
+
+
+def _toml_escape_single_quoted(s: str) -> str:
+    """
+    Quote a string for TOML, handling embedded single quotes.
+
+    Parameters
+    ----------
+    s : str
+        String that may contain single quotes and needs to be quoted.
+
+    Returns
+    -------
+    str :
+        The quoted string, safe to write as a TOML value.
+    """
+    # TOML literal (single-quoted) strings cannot contain single quotes at all,
+    # so fall back to a basic (double-quoted) string when one is present.
+    if "'" in s:
+        return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
+    return "'" + s + "'"
+
+
+def to_toml(value: Any) -> str:
+    """
+    Convert common Python types to TOML literals.
+
+    Parameters
+    ----------
+    value : Any
+        Value to convert to a TOML literal.
+
+    Returns
+    -------
+    str :
+        The string literal writable to a TOML file.
+    """
+    if isinstance(value, RawToml):
+        return value.text
+    if isinstance(value, bool):
+        return "true" if value else "false"
+    if isinstance(value, int):
+        return str(value)
+    if isinstance(value, float):
+        return repr(value)
+    if isinstance(value, (datetime, date)):
+        return _toml_escape_single_quoted(value.isoformat())
+    if isinstance(value, str):
+        return _toml_escape_single_quoted(value)
+    if isinstance(value, (list, tuple)):
+        return "[" + ", ".join(to_toml(v) for v in value) + "]"
+    if value is None:
+        # TOML has no null; choose what your application expects.
+        return '""'
+    raise TypeError(f"Unsupported type for TOML conversion: {type(value).__name__}")
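Reviewer note: a few spot checks of the conversions `to_toml` performs (a sketch; each assertion follows directly from the branches above):

from datetime import date

from xhydro.pygmet.make_toml_config_pygmet import to_toml

assert to_toml(True) == "true"
assert to_toml(3.5) == "3.5"
assert to_toml("My_case_01") == "'My_case_01'"
assert to_toml(["precip", "tmin"]) == "['precip', 'tmin']"
assert to_toml(date(2024, 12, 31)) == "'2024-12-31'"
assert to_toml(None) == '""'  # TOML has no null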
+
+
+# Keys whose values should be provided at runtime:
+DYNAMIC_KEYS: Sequence[str] = (
+    "case_name",
+    "num_processes",
+    "modelsettings_file",
+    "input_stn_all",
+    "infile_grid_domain",
+    "outpath_parent",
+    "date_start",
+    "date_end",
+    "input_vars",
+    "target_vars",
+    "target_vars_WithProbability",
+    "minRange_vars",
+    "maxRange_vars",
+    "transform_vars",
+    "predictor_name_static_stn",
+    "predictor_name_static_grid",
+    "ensemble_end",
+    "clen",  # codespell:ignore clen
+    "auto_corr_method",
+    "target_vars_max_constrain",
+)
+
+# The hard-coded template (generated from testcase.config.static_withTemperature.toml):
+TOML_TEMPLATE = """\
+########################################################################################################################
+# TEMPLATE FILE TO RUN PyGMET for downscaling / ensemble generation
+########################################################################################################################
+
+case_name = {case_name}
+
+num_processes = {num_processes}
+
+# model settings file
+modelsettings_file = {modelsettings_file}
+
+########################################################################################################################
+# Input and output information
+########################################################################################################################
+
+input_stn_all = {input_stn_all}
+
+infile_grid_domain = {infile_grid_domain}
+
+outpath_parent = {outpath_parent}
+
+# file list containing dynamic predictor inputs (optional). Giving it a nonsense string (e.g., "NA") turns off dynamic
+# predictors
+dynamic_predictor_filelist = "NA"
+
+########################################################################################################################
+# Period to process
+########################################################################################################################
+
+date_start = {date_start}
+date_end = {date_end}
+
+########################################################################################################################
+# Variables to process
+########################################################################################################################
+
+input_vars = {input_vars}
+
+# Target variables to output; should always contain precip as the first variable
+target_vars = {target_vars}
+
+# Input variables may need some conversion to get target variables. If an input_var and a target_var have the same
+# variable name, there is no need to add a mapping relation
+mapping_InOut_var = []
+
+# Target variables that will get an additional output probability of exceedance (P(exc) and P(non exc))
+target_vars_WithProbability = {target_vars_WithProbability}
+
+# Set the minimum and maximum range for each variable, must be same order as input_vars
+minRange_vars = {minRange_vars}
+maxRange_vars = {maxRange_vars}
+
+############################## dynamic predictors for regression
+# only useful when dynamic_predictor_filelist is valid
+
+# dynamic predictors for each target_vars. Empty list means no dynamic predictors
+dynamic_predictor_name = []
+
+# Dynamic predictors may need some processing; two keywords: "interp" and "transform"
+# Example "cube_root_prec_rate:interp=linear:transform=boxcox"
+dynamic_predictor_operation = []
+
+
+# Transformation to use for each variable, can be any of:
+# 'log', 'logit', 'direct', 'sqrt', 'cbrt', 'boxcox', 'yeojohnson', 'standardize'
+transform_vars = {transform_vars}
+
+########################################################################################################################
+# Static predictors
+########################################################################################################################
+
+# Names of variables in the input_stn_all file (static predictors)
+predictor_name_static_stn = {predictor_name_static_stn}
+
+# Names of variables in the infile_grid_domain file (static predictors)
+predictor_name_static_grid = {predictor_name_static_grid}
+
+########################################################################################################################
+# Stochastic and correlation settings
+########################################################################################################################
+
+# run ensemble or not: true or false
+ensemble_flag = true
+
+# ensemble settings; ensemble_end is the number of members to generate (inclusive end index)
+ensemble_start = 1
+ensemble_end = {ensemble_end}
+
+# link variables for random number generation dependence
+linkvar = []
+
+# Correlation length for each variable (same order as input_vars)
+clen = {clen}  # codespell:ignore clen
+lag1_auto_cc = [-9999, -9999, -9999]  # corresponding to target_vars
+cross_cc = []  # corresponding to linkvar
+
+# Method for auto-correlation for each variable (same order as input_vars)
+auto_corr_method = {auto_corr_method}
+
+########################################################################################################################
+# Constraints
+########################################################################################################################
+rolling_window = 31  # 31-day rolling mean to remove the monthly cycle
+# Max constrain list (subset of target_vars)
+target_vars_max_constrain = {target_vars_max_constrain}
+"""
+
+
+def render_config_toml(params: Mapping[str, Any], strict: bool = True) -> str:
+    """
+    Render the template into a TOML config string.
+
+    Parameters
+    ----------
+    params : Mapping[str, Any]
+        The mapping of parameters to write to the file, replacing the template tags.
+    strict : bool
+        If True, all DYNAMIC_KEYS must be present in `params`.
+
+    Returns
+    -------
+    str :
+        The full template with tags replaced.
+    """
+    missing = [k for k in DYNAMIC_KEYS if k not in params]
+    if strict and missing:
+        raise KeyError(f"Missing replacements for keys: {missing}")
+
+    mapping = {k: to_toml(params[k]) for k in DYNAMIC_KEYS if k in params}
+
+    # format_map will raise KeyError if any placeholder isn't present; the strict check above prevents that.
+    return TOML_TEMPLATE.format_map(mapping)
+
+
+def write_config_toml(output_path: str | Path, params: Mapping[str, Any], strict: bool = True) -> bool:
+    """
+    Write a fresh model config TOML file from the hard-coded template.
+
+    Parameters
+    ----------
+    output_path : str or Path
+        Path to the filled-in template file we want to write to disk.
+    params : Mapping[str, Any]
+        The mapping of parameters to write to the file, replacing the template tags.
+    strict : bool
+        If True, all DYNAMIC_KEYS must be present in `params`.
+
+    Returns
+    -------
+    bool :
+        True if the function completes without error.
+    """
+    output_path = Path(output_path)
+    output_path.write_text(render_config_toml(params, strict=strict), encoding="utf-8")
+    return True
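Reviewer note: a minimal usage sketch (the filename is illustrative; tests/test_pygmet_prep.py below exercises the full parameter set):

from xhydro.pygmet.make_toml_config_pygmet import write_config_toml

params = {
    "case_name": "My_case_01",
    "num_processes": 10,
    # ... every remaining key in DYNAMIC_KEYS must also be present when strict=True ...
}
try:
    write_config_toml("model.config_xhydro.toml", params, strict=True)
except KeyError as err:
    print(err)  # lists the DYNAMIC_KEYS still missing from `params`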
diff --git a/src/xhydro/pygmet/make_toml_settings_pygmet.py b/src/xhydro/pygmet/make_toml_settings_pygmet.py
new file mode 100644
index 00000000..3cc924d1
--- /dev/null
+++ b/src/xhydro/pygmet/make_toml_settings_pygmet.py
@@ -0,0 +1,234 @@
+"""
+Hard-coded template writer for the PyGMET settings file.
+
+This module writes a new TOML settings file from a fixed template, replacing templated values with desired values.
+"""
+
+from __future__ import annotations
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass(frozen=True)
+class RawToml:
+    """Inject TOML verbatim."""
+
+    text: str
+
+
+def _toml_escape_single_quoted(s: str) -> str:
+    """
+    Quote a string for TOML, handling embedded single quotes.
+
+    Parameters
+    ----------
+    s : str
+        String that may contain single quotes and needs to be quoted.
+
+    Returns
+    -------
+    str :
+        The quoted string, safe to write as a TOML value.
+    """
+    # TOML literal (single-quoted) strings cannot contain single quotes at all,
+    # so fall back to a basic (double-quoted) string when one is present.
+    if "'" in s:
+        return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'
+    return "'" + s + "'"
+
+
+def to_toml(value: Any) -> str:
+    """
+    Convert common Python types to TOML literals.
+
+    Parameters
+    ----------
+    value : Any
+        Value to convert to a TOML literal.
+
+    Returns
+    -------
+    str :
+        The string literal writable to a TOML file.
+    """
+    if isinstance(value, RawToml):
+        return value.text
+    if isinstance(value, bool):
+        return "true" if value else "false"
+    if isinstance(value, int):
+        return str(value)
+    if isinstance(value, float):
+        return repr(value)
+    if isinstance(value, str):
+        return _toml_escape_single_quoted(value)
+    if isinstance(value, (list, tuple)):
+        return "[" + ", ".join(to_toml(v) for v in value) + "]"
+    if value is None:
+        return "''"
+    raise TypeError(f"Unsupported type for TOML conversion: {type(value).__name__}")
+
+
+DYNAMIC_KEYS: Sequence[str] = (
+    "stn_lat_name",
+    "stn_lon_name",
+    "grid_lat_name",
+    "grid_lon_name",
+    "grid_mask_name",
+    "dynamic_grid_lat_name",
+    "dynamic_grid_lon_name",
+    "nearstn_min",
+    "nearstn_max",
+    "try_radius",
+    "initial_distance",
+)
+
+TOML_TEMPLATE = """\
+# Some default settings
+
+########################################################################################################################
+# general settings
+########################################################################################################################
+
+# master seed: controls all probabilistic processes (e.g., probabilistic estimation, machine learning methods)
+# a negative value means random generation without reproducibility
+master_seed = 20230104
+
+########################################################################################################################
+# settings for gridded estimation using regression or machine learning methods
+########################################################################################################################
+
+############################## station/grid file dimension names
+# the spatial dims of input stations
+stn_lat_name = {stn_lat_name}
+stn_lon_name = {stn_lon_name}
+
+# the 2D spatial dims of the target grid domain
+# note that the target grid domain netcdf must have x/y dims, while lat and lon are 2D arrays
+grid_lat_name = {grid_lat_name}
+grid_lon_name = {grid_lon_name}
+grid_mask_name = {grid_mask_name}
+
+# the 2D spatial dims of dynamic predictor inputs
+dynamic_grid_lat_name = {dynamic_grid_lat_name}
+dynamic_grid_lon_name = {dynamic_grid_lon_name}
+
+########################################################################################################################
+# default settings
+# these settings matter as much as the ones above, but using their default values does not affect the model run in any case
+########################################################################################################################
+
+# gridding methods: locally weighted regression and machine learning methods.
+# The sklearn module is used to support most functions: https://scikit-learn.org/stable/supervised_learning.html
+# Locally weighted regression:
+# Two original methods are LWR:Linear and LWR:Logistic.
+# Sklearn-based methods support simple usage with "model.fit()" and "model.predict" or "model.predict_prob",
+# in the format LWR:linear_model.METHOD
+# Examples of METHOD are LinearRegression, LogisticRegression, Ridge, BayesianRidge, ARDRegression, Lasso, ElasticNet, Lars, etc.
+# Global regression using machine learning methods:
+# Machine learning methods are supported by sklearn. Parameters of methods supported by sklearn can be defined at the bottom of this
+# configuration file (optional)
+# Examples: ensemble.RandomForestRegressor, ensemble.RandomForestClassifier, neural_network.MLPRegressor, neural_network.MLPClassifier,
+# ensemble.GradientBoostingClassifier, ensemble.GradientBoostingRegressor
+# The parameters of sklearn methods can be defined in the [sklearn] section
+gridcore_continuous = 'LWR:Linear'
+gridcore_classification = 'LWR:Logistic'  # for probability of event
+n_splits = 10  # only useful for machine learning methods; cross validation to generate uncertainty estimates.
+
+# output random fields
+output_randomfield = false
+
+# Number of stations to consider for each target point. nearstn_min <= nearstn_max.
+nearstn_min = {nearstn_min}  # nearby stations: minimum number
+nearstn_max = {nearstn_max}  # nearby stations: maximum number
+
+# First try this radius (km); if there are not enough stations, expand. Can reduce computation time for a large domain search.
+try_radius = {try_radius}
+
+# overwrite existing files
+overwrite_stninfo = true
+overwrite_station_cc = true
+overwrite_weight = true
+overwrite_cv_reg = true
+overwrite_grid_reg = true
+overwrite_ens = true
+overwrite_spcorr = true
+
+########################################################################################################################
+# distance-based weight calculation
+########################################################################################################################
+
+initial_distance = {initial_distance}  # initial search distance in km (expanded if need be)
+
+# Weight calculation formula. Only two variables/parameters are allowed in the formula:
+# dist: distance between points (km in the script)
+# maxdist (optional): max(initial_distance, max(dist) + 1), which is a parameter used in weight calculation
+# 3 is the exponential factor and is the default parameter
+weight_formula = '(1 - (dist / maxdist) ** 3) ** 3'
+
+
+########################################################################################################################
+# method-related settings
+# default values can be used directly
+########################################################################################################################
+
+[transform]
+# note: the name must be consistent with transform_vars
+[transform.boxcox]
+exp = 4
+
+[sklearn]
+# If no parameters are provided, or if the section does not even exist, default parameters will be used.
+# Just provide the method name; there is no need to include the submodule name.
+[sklearn.RandomForestRegressor]
+n_estimators = 500  # an example of a RandomForestRegressor parameter
+n_jobs = 5
+[sklearn.RandomForestClassifier]
+n_estimators = 500  # an example of a RandomForestClassifier parameter
+n_jobs = 5
+"""
+
+
+def render_model_settings_toml(params: Mapping[str, Any], strict: bool = True) -> str:
+    """
+    Render the template into a TOML settings string.
+
+    Parameters
+    ----------
+    params : Mapping[str, Any]
+        The mapping of parameters to write to the file, replacing the template tags.
+    strict : bool
+        If True, all DYNAMIC_KEYS must be present in `params`.
+
+    Returns
+    -------
+    str :
+        The full template with tags replaced.
+    """
+    missing = [k for k in DYNAMIC_KEYS if k not in params]
+    if strict and missing:
+        raise KeyError(f"Missing replacements for keys: {missing}")
+
+    mapping = {k: to_toml(params[k]) for k in DYNAMIC_KEYS if k in params}
+    return TOML_TEMPLATE.format_map(mapping)
+
+
+def write_settings_toml(output_path: str | Path, params: Mapping[str, Any], strict: bool = True) -> bool:
+    """
+    Write a fresh model settings TOML file from the hard-coded template.
+
+    Parameters
+    ----------
+    output_path : str or Path
+        Path to the filled-in template file we want to write to disk.
+    params : Mapping[str, Any]
+        The mapping of parameters to write to the file, replacing the template tags.
+    strict : bool
+        If True, all DYNAMIC_KEYS must be present in `params`.
+
+    Returns
+    -------
+    bool :
+        True if the function completes without error.
+    """
+    output_path = Path(output_path)
+    output_path.write_text(render_model_settings_toml(params, strict=strict), encoding="utf-8")
+    return True
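Reviewer note: the default `weight_formula` above is the tricube kernel commonly used for distance weighting. A minimal numeric sketch of what the formula computes, using the `maxdist` definition from the template comment (values are illustrative):

import numpy as np

dist = np.array([10.0, 50.0, 150.0])        # distances to nearby stations (km)
maxdist = max(200.0, dist.max() + 1)        # max(initial_distance, max(dist) + 1)
weights = (1 - (dist / maxdist) ** 3) ** 3  # tricube: 1 at dist=0, 0 at dist=maxdist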
diff --git a/src/xhydro/pygmet/subsample_vector_format_stations.py b/src/xhydro/pygmet/subsample_vector_format_stations.py
new file mode 100644
index 00000000..dd473326
--- /dev/null
+++ b/src/xhydro/pygmet/subsample_vector_format_stations.py
@@ -0,0 +1,102 @@
+"""Subsampling tools to select stations for PyGMET variability."""
+
+import numpy as np
+import xarray as xr
+
+
+def isel_every_about_n(
+    path_to_nc: str,
+    dim: str,
+    outpath: str,
+    rng: int | None,
+    threshold: float = 0.1,
+    n: int = 10,
+    jitter: int = 1,
+) -> bool:
+    """
+    Select along `dim` with step sizes drawn from {n-jitter, …, n+jitter}. Works for Dataset or DataArray.
+
+    Parameters
+    ----------
+    path_to_nc : str
+        Path to the file containing the dataset of the full data series (lat/long) that we want to subsample.
+    dim : str
+        Dimension to operate the subsampling on, e.g., stn for stations in a vector of stations.
+    outpath : str
+        Path to the output file to write to disk.
+    rng : int or None
+        Initial seed for the random number generator. Using None makes the draws non-reproducible.
+    threshold : float
+        Threshold below which precipitation is considered trace, so that machine-precision drizzle from the
+        reanalysis models does not influence the distribution.
+    n : int
+        Average interval length to jump from one sampling point to the next.
+    jitter : int
+        Noise added to the interval to randomize the search pattern and avoid systematically sampling every n points.
+
+    Returns
+    -------
+    bool :
+        True if the function exited properly.
+    """
+    # Clean precip (threshold and format)
+    ds = clean_precip(path_to_nc, threshold=threshold)
+
+    # Seed the random number generator.
+    rng = np.random.default_rng(rng)
+
+    # Maximum available value to sample.
+    k = ds.sizes[dim]
+
+    # Draw steps until we pass the end; then build cumulative indices.
+    steps = []
+    start = 0
+
+    while True:
+        step = n + rng.integers(-jitter, jitter + 1)
+        if step <= 0:
+            continue  # guard against pathological jitter
+        steps.append(step)
+        if start + sum(steps) >= k:
+            break
+    idx = start + np.cumsum([0] + steps[:-1])  # starting index + cumulative steps
+    idx = idx[idx < k]
+
+    # Build and save the dataset with the dimension subsampled according to the built index.
+    subset = ds.isel({dim: idx})
+    subset.to_netcdf(outpath)
+
+    return True
+
+
+def clean_precip(
+    ds_path: str,
+    threshold: float = 0.1,
+) -> xr.Dataset:
+    """
+    Convert to float32 and zero out precipitation below the trace threshold (including negative values).
+
+    Parameters
+    ----------
+    ds_path : str
+        Path to the netcdf file containing the precip to load and clean.
+    threshold : float
+        Threshold below which precipitation is considered trace, so that machine-precision drizzle from the
+        reanalysis models does not influence the distribution.
+
+    Returns
+    -------
+    xr.Dataset :
+        The cleaned precipitation dataset.
+    """
+    # Load the precipitation dataset.
+    ds = xr.open_dataset(ds_path)
+
+    # Convert to float32.
+    ds["precip"] = ds["precip"].astype(np.float32)
+
+    # Apply the threshold for the minimum trace value.
+    ds["precip"].values[ds["precip"].values < threshold] = 0.0
+
+    # Return the cleaned dataset
+    return ds
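Reviewer note: the jittered-step index construction in `isel_every_about_n` can be previewed standalone. This sketch repeats the same draw logic outside xarray (toy `k`; the values are illustrative, not the fixture expectations used in the tests below):

import numpy as np

rng = np.random.default_rng(42)
k, n, jitter = 100, 10, 3
steps, total = [], 0
while total < k:
    step = int(n + rng.integers(-jitter, jitter + 1))  # step drawn from {7, ..., 13}
    if step <= 0:
        continue  # same guard as in the function
    steps.append(step)
    total += step
idx = np.cumsum([0] + steps[:-1])  # selected indices, spaced ~n apart
idx = idx[idx < k]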
diff --git a/src/xhydro/pygmet/transform_to_station_order.py b/src/xhydro/pygmet/transform_to_station_order.py
new file mode 100644
index 00000000..6d5d87c0
--- /dev/null
+++ b/src/xhydro/pygmet/transform_to_station_order.py
@@ -0,0 +1,174 @@
+"""Functions for processing OI data into formats used by PyGMET."""
+
+import numpy as np
+import xarray as xr
+
+
+def convert_2d_nc_to_1d_stations(path_nc_oi_precip: str, path_nc_grid_temperature: str, outpath: str) -> bool:
+    """
+    Transform the 2D ERA5Land data into a 1D station-like format.
+
+    Parameters
+    ----------
+    path_nc_oi_precip : str
+        Path to the nc file containing the output of the optimal interpolation results.
+    path_nc_grid_temperature : str
+        Path to the nc file containing the gridded temperature matching the times and locations of the OI precip
+        product.
+    outpath : str
+        Path to the output file containing the NaN-less station-like vector netcdf of ERA5Land + OI grids (used as
+        stations in PyGMET).
+
+    Returns
+    -------
+    bool :
+        True if the function completes without error.
+    """
+    # NetCDF file with ERA5Land precipitation processed with OI (only precip)
+    ds = xr.open_dataset(path_nc_oi_precip)
+
+    # The OI code returns percentiles, so take the median in that particular case.
+    if "percentile" in ds:
+        ds = ds.sel(percentile=50)
+
+    # Transpose to work with data in the correct dimension order.
+    ds = ds.transpose("longitude", "latitude", "time")
+
+    # NetCDF file with ERA5Land tmin and tmax.
+    dstemp = xr.open_dataset(path_nc_grid_temperature)
+
+    # Get the same times.
+    dstemp = dstemp.sel(time=slice(ds["time"].values[0], ds["time"].values[-1]))
+
+    # We want to create a mask of NaN values. Precipitation processed with OI will not have NaNs, so we need to use the
+    # temperature to find them.
+    maskvar = np.empty((len(ds.longitude), len(ds.latitude)))
+    maskvar[:] = 1
+    temp_c = dstemp["tasmax"].transpose("longitude", "latitude", "time")
+
+    # Find the NaN values from a single day; day 10 is used rather than the first as an arbitrary but safer choice.
+    nanmask = temp_c[:, :, 10].isnull()
+    maskvar[nanmask] = 0
+
+    # Make the NaN mask 3D for application to the initial dataset of lat x lon x time.
+    mask_3d_broadcast = np.broadcast_to(nanmask, [ds["tp"].shape[2], nanmask.shape[0], nanmask.shape[1]])
+    mask_3d_broadcast = mask_3d_broadcast.transpose(1, 2, 0)
+
+    # Apply the mask to precipitation. Recall that temperature already has the NaNs.
+    ds["tp"].values = ds["tp"].where(~mask_3d_broadcast, np.nan)
+
+    # Stack into 1D stations by unrolling the 3D matrices into 2D (station x time) matrices. Do this for the precip and
+    # also the temperature.
+    da_stn = ds["tp"].stack(stn=("latitude", "longitude"))
+    da_stn_tmax = dstemp["tasmax"].stack(stn=("latitude", "longitude"))
+    da_stn_tmin = dstemp["tasmin"].stack(stn=("latitude", "longitude"))
+
+    # Get the latitude and longitude values associated with each station.
+    lat_vals = da_stn.indexes["stn"].get_level_values("latitude")
+    lon_vals = da_stn.indexes["stn"].get_level_values("longitude")
+
+    # Remove stations that are all NaNs. Because the NaN mask is constant in time, dropping stations with any NaN
+    # below is equivalent to dropping the all-NaN stations found here.
+    nan_columns = da_stn.isnull().all(dim="time").values
+    valid_mask = ~nan_columns
+    lat_vals = lat_vals[valid_mask]
+    lon_vals = lon_vals[valid_mask]
+
+    da_stn = da_stn.dropna(dim="stn", how="any")
+    da_stn_tmax = da_stn_tmax.dropna(dim="stn", how="any")
+    da_stn_tmin = da_stn_tmin.dropna(dim="stn", how="any")
+
+    # Rebuild the index of stations starting at 0.
+    da_stn = da_stn.assign_coords(stn=np.arange(da_stn.sizes["stn"]))
+
+    # Make the dataset and write it to disk.
+    ds = xr.Dataset(
+        {
+            "precip": (("stn", "time"), da_stn.values.transpose()),
+            "tmax": (("stn", "time"), da_stn_tmax.values.transpose()),
+            "tmin": (("stn", "time"), da_stn_tmin.values.transpose()),
+            "latitude": (("stn",), lat_vals),
+            "longitude": (("stn",), lon_vals),
+        },
+        coords={"time": ds.time.values, "stn": np.arange(0, da_stn.sizes["stn"])},
+    )
+
+    ds["stnid"] = ds["stn"]
+    ds.to_netcdf(outpath)
+
+    return True
+
+
+def make_target_pygmet_grid(
+    ds_in_path: str,
+    outpath: str,
+) -> bool:
+    """
+    Build the netcdf Dataset required by PyGMET to use as the output grid.
+
+    Parameters
+    ----------
+    ds_in_path : str
+        Path to the template dataset to use as the basis for the 2D grid.
+    outpath : str
+        Path to the 2D mask file.
+
+    Returns
+    -------
+    bool :
+        True if the function exits with no error.
+    """
+    # Open the dataset
+    ds_in = xr.open_dataset(ds_in_path)
+    lat_grid = ds_in["latitude"].values
+    lon_grid = ds_in["longitude"].values
+
+    # Define some fixed parameters
+    nparam = 1
+
+    # Number of latitudes and longitudes (i.e., size of grid)
+    ny = lat_grid.shape[0]
+    nx = lon_grid.shape[0]
+
+    # Indices of the coordinates, e.g., [0]
+    param = np.arange(nparam)
+
+    # Create the variables
+    # step for longitude and latitude
+    dx = np.full((nparam,), np.round(ds_in.longitude[1].values - ds_in.longitude[0].values, 5))
+    dy = np.full((nparam,), np.round(ds_in.latitude[1].values - ds_in.latitude[0].values, 5))
+
+    # Origins for latitude and longitude (from where the steps begin)
+    startx = np.array(np.round(ds_in.longitude.values[0], 5)).reshape(1)  # longitude origin
+    starty = np.array(np.round(ds_in.latitude.values[0], 5)).reshape(1)  # latitude origin
+
+    # Rebuild 2D arrays of lat/lon
+    lon2d, lat2d = np.meshgrid(lon_grid, lat_grid)
+
+    # Dimensions (lat, lon)
+    elev = np.ones((ny, nx))  # altitude (m); PyGMET can use it as an extra covariate, but we don't.
+
+    # Make a mask based on the position of NaNs in the original gridded dataset.
+    mask = ds_in.tasmax.isel(time=1).transpose("latitude", "longitude").notnull().astype(int).values
+
+    # Build the xarray Dataset
+    ds = xr.Dataset(
+        data_vars={
+            "dx": (("param",), dx),
+            "dy": (("param",), dy),
+            "startx": (("param",), startx),
+            "starty": (("param",), starty),
+            "elev": (("y", "x"), elev),
+            "latitude": (("y", "x"), lat2d),
+            "longitude": (("y", "x"), lon2d),
+            "mask": (("y", "x"), mask),
+        },
+        # Add coordinates
+        coords={
+            "param": param,
+            "y": lat_grid,
+            "x": lon_grid,
+        },
+    )
+
+    # Write to disk
+    ds.to_netcdf(outpath)
+
+    return True
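Reviewer note: the grid-to-station flattening in `convert_2d_nc_to_1d_stations` leans on `DataArray.stack`, which turns the lat/lon dims into a single MultiIndexed `stn` dim. A small self-contained sketch of the same pattern (toy values):

import numpy as np
import xarray as xr

da = xr.DataArray(
    np.arange(12.0).reshape(2, 3, 2),
    dims=("longitude", "latitude", "time"),
    coords={"longitude": [-78.0, -77.9], "latitude": [50.0, 49.9, 49.8]},
)
stacked = da.stack(stn=("latitude", "longitude"))  # dims become (time, stn); stn is a MultiIndex
lats = stacked.indexes["stn"].get_level_values("latitude")  # per-station latitude, as in the function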
diff --git a/src/xhydro/testing/helpers.py b/src/xhydro/testing/helpers.py
index ebd31e25..8244bd30 100644
--- a/src/xhydro/testing/helpers.py
+++ b/src/xhydro/testing/helpers.py
@@ -29,7 +29,7 @@
     "populate_testing_data",
 ]
 
-default_testdata_version = "v2025.8.14"
+default_testdata_version = "v2026.02.04"
 """Default version of the testing data to use when fetching datasets."""
 
 default_testdata_repo_url = "https://raw.githubusercontent.com/hydrologie/xhydro-testdata/"
diff --git a/src/xhydro/testing/registry.txt b/src/xhydro/testing/registry.txt
index 4152058c..2f69afb6 100644
--- a/src/xhydro/testing/registry.txt
+++ b/src/xhydro/testing/registry.txt
@@ -15,6 +15,11 @@ optimal_interpolation/OI_data.zip sha256:9cd881a19fc82bda560e636d3f6a2c40718b82f
 optimal_interpolation/OI_data_corrected.zip sha256:48ee08325bd35c6bce5c0e52e3ee25df27c830720929060f607fb0417c476941
 pmp/CMIP.CCCma.CanESM5.historical.r1i1p1f1.day.gn.zarr.zip sha256:d5775b2f09381f2f3a7cc06af76f62fb216b60252aab3a602280a513554d59ad
 pmp/CMIP.CCCma.CanESM5.historical.r1i1p1f1.fx.gn.zarr.zip sha256:dc7c92fc098ca5adf43e76b25e3f79b70815ed961e945952983bb68b3c380cf1
+precip_oi/ERA5Land_with_OI_flags_validation_subset.nc sha256:50fc8d562443efe82998735c50268019532493bc02b47e4a47f6672708c87b5f
+precip_oi/ERA5_land_3_variables_raw_subset.nc sha256:09ee75f73bb4e88bddb6c9184245766ebe8cc6b13b58ef58a4754e523138ff1f
+precip_oi/stations_flags_clean_subset.nc sha256:03513b859e6c99ab02728f49b0e3e7382375e7311c68e33c8d59d6f734eb08d1
+pygmet/subset_grid_temperature.nc sha256:66a3086d9aaacbedac6633050380b9b7685407c952a3f8629a16caea4d1ef8db
+pygmet/subset_oi_tp.nc sha256:1cddcb439168891e4ad025c7708c6d0ce1017bbb013ac2fe46589dc3fb5953da
 ravenpy/Debit_Riviere_Rouge.nc sha256:d0a27de5eb3cb466e60669d894296bcbc4e9f590edc1ae2490685babd10b2d22
 ravenpy/ERA5_Riviere_Rouge_global.nc sha256:341ac746130a0d3e3189d3a41dc8528d6bd22869a519b68e134959407ad200a3
 ravenpy/hru_subset.zip sha256:969b4a0d0c1f0c24dbb5baca98676f2573178cd58ef4162aa6b9fce51ceecf06
diff --git a/tests/test_pygmet_prep.py b/tests/test_pygmet_prep.py
new file mode 100644
index 00000000..7491900c
--- /dev/null
+++ b/tests/test_pygmet_prep.py
@@ -0,0 +1,224 @@
+import tempfile
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+from numpy.ma.testutils import assert_almost_equal
+
+from xhydro.pygmet.make_toml_config_pygmet import write_config_toml
+from xhydro.pygmet.make_toml_settings_pygmet import write_settings_toml
+from xhydro.pygmet.subsample_vector_format_stations import isel_every_about_n
+from xhydro.pygmet.transform_to_station_order import convert_2d_nc_to_1d_stations, make_target_pygmet_grid
+
+
+def test_make_pygmet_config():
+    params = {
+        "case_name": "My_case_01",
+        "num_processes": 10,
+        "modelsettings_file": "model.settings_xhydro.toml",
+        "input_stn_all": "./stations.nc",
+        "infile_grid_domain": "./grid_domain.nc",
+        "outpath_parent": "./output_dir",
+        "date_start": "1970-01-01",
+        "date_end": "2024-12-31",
+        "input_vars": ["precip", "tmin", "tmax"],
+        "target_vars": ["precip", "tmin", "tmax"],
+        "target_vars_WithProbability": ["precip"],
+        "minRange_vars": [0.0, -70.0, -70.0],
+        "maxRange_vars": [250.0, 70.0, 70.0],
+        "transform_vars": ["boxcox", "", ""],
+        "predictor_name_static_stn": ["latitude", "longitude"],
+        "predictor_name_static_grid": ["latitude", "longitude"],
+        "ensemble_end": 100,
+        "clen": [150, 800, 800],  # codespell:ignore clen
+        "auto_corr_method": ["direct", "anomaly", "anomaly"],
+        "target_vars_max_constrain": ["precip"],
+    }
+    tempout = tempfile.mkdtemp()
+    outpath = tempout + "/model.config_xhydro.toml"
+    write_config_toml(outpath, params, strict=True)
+
+
+def test_make_pygmet_settings():
+    params = {
+        "stn_lat_name": "latitude",
+        "stn_lon_name": "longitude",
+        "grid_lat_name": "latitude",
+        "grid_lon_name": "longitude",
+        "grid_mask_name": "mask",
+        "dynamic_grid_lat_name": "latitude",
+        "dynamic_grid_lon_name": "longitude",
+        "nearstn_min": 2,
+        "nearstn_max": 16,
+        "try_radius": 200,
+        "initial_distance": 200,
+    }
+    tempout = tempfile.mkdtemp()
+    outpath = tempout + "/model.settings_xhydro.toml"
+    write_settings_toml(outpath, params, strict=True)
+
+
+class TestPreparePygmetInputs:
+    # Prepare the data for the subset OI precip grid.
+    # Coordinates
+    percentile = np.array([50], dtype=np.int64)
+    time = pd.date_range("1970-01-01", periods=15, freq="D")
+    latitude = np.round(50.0 - 0.1 * np.arange(10), 1).astype(np.float64)
+    longitude = np.round(-78.0 + 0.1 * np.arange(10), 1).astype(np.float64)
+
+    # Repeatable
+    rng = np.random.default_rng(12345)  # seed -> repeatable
+    tp = rng.random((len(percentile), len(time), len(latitude), len(longitude))).astype(np.float64)
+
+    # Build dataset
+    ds = xr.Dataset(
+        data_vars={"tp": (("percentile", "time", "latitude", "longitude"), tp)},
+        coords={
+            "percentile": percentile,
+            "time": time,
+            "latitude": latitude,
+            "longitude": longitude,
+        },
+    )
+
+    # Write the file to a temp location
+    tmpfile = tempfile.mkdtemp()
+    path_nc_oi_precip = tmpfile + "/subset_oi_tp.nc"
+    ds.to_netcdf(path_nc_oi_precip)
+
+    # Now prepare the data for the subset temperature grid.
+    # Coordinates
+    time = pd.date_range("1970-01-01", periods=15, freq="D")  # datetime64[ns]
+    longitude = np.round(-78.0 + 0.1 * np.arange(10), 1).astype(np.float64)  # -78.0 ... -77.1
+    latitude = np.round(50.0 - 0.1 * np.arange(10), 1).astype(np.float64)  # 50.0 ... 49.1
+
+    # Repeatable
+    rng = np.random.default_rng(12345)
+
+    shape_3d = (len(longitude), len(latitude), len(time))
+
+    # Base mean temperature field
+    tmean = rng.normal(loc=12.0, scale=6.0, size=shape_3d).astype(np.float32)
+
+    # Positive daily range
+    trange = rng.uniform(3.0, 12.0, size=shape_3d).astype(np.float32)
+    tasmin = (tmean - 0.5 * trange).astype(np.float32)
+    tasmax = (tmean + 0.5 * trange).astype(np.float32)
+
+    # Altitude (m), repeatable
+    altitude = rng.uniform(0, 1200, size=(len(longitude), len(latitude))).astype(np.float64)
+
+    # Build dataset
+    ds = xr.Dataset(
+        data_vars={
+            "tasmax": (("longitude", "latitude", "time"), tasmax),
+            "tasmin": (("longitude", "latitude", "time"), tasmin),
+            "altitude": (("longitude", "latitude"), altitude),
+        },
+        coords={
+            "longitude": longitude,
+            "latitude": latitude,
+            "time": time,
+        },
+    )
+
+    # Write the file to a temp location as well.
+    tmpfile_temperature = tempfile.mkdtemp()
+    path_nc_grid_temperature = tmpfile_temperature + "/subset_grid_temperature.nc"
+    ds.to_netcdf(path_nc_grid_temperature)
+
+    # Intermediate data for reordered stations
+    stn = np.arange(100, dtype=np.int64)
+    time = pd.date_range("1970-01-01", periods=15, freq="D")  # datetime64[ns]
+    # Build station lat/lon from a 10x10 grid (same values/ranges as the previous grid example)
+    lon_1d = np.round(-78.0 + 0.1 * np.arange(10), 1).astype(np.float64)
+    lat_1d = np.round(50.0 - 0.1 * np.arange(10), 1).astype(np.float64)
+
+    # IMPORTANT: indexing="ij" matches the grid dim order (longitude, latitude)
+    lon2d, lat2d = np.meshgrid(lon_1d, lat_1d, indexing="ij")  # both shape (10, 10)
+
+    # Flatten to stations (100 points). order="C" means latitude varies fastest within each longitude.
+    longitude = lon2d.ravel(order="C")
+    latitude = lat2d.ravel(order="C")
+
+    # Station IDs
+    stnid = stn.astype(np.int64)
+
+    # Repeatable data
+    rng = np.random.default_rng(12345)  # seed -> repeatable
+    shape = (len(stn), len(time))
+
+    # Precip
+    precip = rng.gamma(shape=2.0, scale=2.0, size=shape).astype(np.float64)
+
+    # Temperatures
+    tmean = rng.normal(loc=12.0, scale=6.0, size=shape).astype(np.float32)  # mean °C
+    trange = rng.uniform(3.0, 12.0, size=shape).astype(np.float32)  # daily range °C (>0)
+    tmin = (tmean - 0.5 * trange).astype(np.float32)
+    tmax = (tmean + 0.5 * trange).astype(np.float32)
+
+    # Build Dataset
+    ds_stn = xr.Dataset(
+        data_vars={
+            "precip": (("stn", "time"), precip),
+            "tmax": (("stn", "time"), tmax),
+            "tmin": (("stn", "time"), tmin),
+            "latitude": (("stn",), latitude),
+            "longitude": (("stn",), longitude),
+            "stnid": (("stn",), stnid),
+        },
+        coords={
+            "stn": stn,
+            "time": time,
+        },
+    )
+
+    # Write the file to a temp location as well.
+    tmpfile_reordered = tempfile.mkdtemp()
+    path_nc_grid_reordered = tmpfile_reordered + "/subset_reordered_grids_to_stations.nc"
+    ds_stn.to_netcdf(path_nc_grid_reordered)
+
+    # tempout folder
+    tempout = tempfile.mkdtemp()
+
+    def test_convert_2d_to_1d(self):
+        # Run the code
+        convert_2d_nc_to_1d_stations(
+            self.path_nc_oi_precip,
+            self.path_nc_grid_temperature,
+            self.tempout + "/subset_reordered_grids_to_stations.nc",
+        )
+
+        ds = xr.open_dataset(self.tempout + "/subset_reordered_grids_to_stations.nc")
+        assert_almost_equal(ds.precip.isel(stn=5, time=5).values, 0.2762543, 5)
+        assert_almost_equal(ds.tmax.isel(stn=6, time=5).values, 13.288241, 5)
+        assert ds.stn.shape[0] == 100
+        assert ds.stn[-1] == 99
+
+    def test_subsample_stations_for_pygmet(self):
+        # Take roughly 1 out of every ~10 stations
+        isel_every_about_n(
+            path_to_nc=self.path_nc_grid_reordered,
+            dim="stn",
+            outpath=self.tempout + "/subset_oi_vector_format_subsampled.nc",
+            rng=42,
+            threshold=0.1,
+            n=10,
+            jitter=3,
+        )
+
+        ds = xr.open_dataset(self.tempout + "/subset_oi_vector_format_subsampled.nc")
+        assert_almost_equal(ds.precip.isel(stn=5, time=5).values, 3.057068, 5)
+        assert_almost_equal(ds.tmax.isel(stn=6, time=5).values, 10.385146, 5)
+        assert ds.stn.shape[0] == 11
+        assert ds.stn[-1] == 96
+
+    def test_make_target_pygmet_grid(self):
+        make_target_pygmet_grid(
+            self.path_nc_grid_temperature,
+            self.tempout + "/new_grid_output_shape.nc",
+        )
+
+        ds = xr.open_dataset(self.tempout + "/new_grid_output_shape.nc")
+        assert ds.y[0] == 50
+        assert ds.mask[5, 5].values == 1
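Reviewer note: a possible follow-up assertion for `test_make_pygmet_config`, not part of this PR: round-trip the written file through the standard-library parser to confirm it is valid TOML (`tomllib` requires Python 3.11+; `outpath` is the variable built in that test):

import tomllib

with open(outpath, "rb") as f:  # outpath as built in test_make_pygmet_config
    cfg = tomllib.load(f)
assert cfg["case_name"] == "My_case_01"
assert cfg["ensemble_end"] == 100
assert cfg["clen"] == [150, 800, 800]  # codespell:ignore clen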