diff --git a/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc
new file mode 100644
index 00000000..783fd1cd
Binary files /dev/null and b/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc differ
diff --git a/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taNobs_AIRS-2-1_gn_200209-201609.nc b/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taNobs_AIRS-2-1_gn_200209-201609.nc
new file mode 100644
index 00000000..b05734ee
Binary files /dev/null and b/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taNobs_AIRS-2-1_gn_200209-201609.nc differ
diff --git a/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taStderr_AIRS-2-1_gn_200209-201609.nc b/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taStderr_AIRS-2-1_gn_200209-201609.nc
new file mode 100644
index 00000000..03b869be
Binary files /dev/null and b/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taStderr_AIRS-2-1_gn_200209-201609.nc differ
diff --git a/registry.txt b/registry.txt
index ca6fd453..0b4cdaa2 100644
--- a/registry.txt
+++ b/registry.txt
@@ -14,6 +14,7 @@ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/fx/areacella/gn/v20191115/are
 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rlut/gn/v20210316/rlut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 4bcdb5108c884a13299cc855a3a78ff1af1a28c45b00dc9a388d510035328886
 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsdt/gn/v20210316/rsdt_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc bc1ba4b5b91dc40f80318b73dedffd75daec25e317167119794366ea7d48779d
 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsut/gn/v20210316/rsut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 9c236e425fc94a067888f5ae48377eb75cc29f9b4a697621ea68a6c46c1aa0f4
+CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 994e32564109768dcd7343663f7113ecdaefa91a519ecc442af6fcff877f528e
 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-018012.nc cacab3637a4cc94e5cc6d7267ec4c6f96cd5d618b0f8a097005dbfe89b2c5eb9
 CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/fx/areacella/gn/v20210316/areacella_fx_ACCESS-ESM1-5_piControl_r1i1p1f1_gn.nc fbdf118bd3677eef2a3a63993cf6492a74c34b2a6650bdb6018711ad66e36594
 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 187d291702e4a969792fedca9db832657d96e385ea67747cf6432e7eab1af779
@@ -21,4 +22,6 @@ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsu
 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 3124671936cb2554af0a1f48b814fa8bb186a0ee2af6bcc86b5cb126b107d7a2
 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Omon/tos/gn/v20210318/tos_Omon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 10d13b1250f5483e5d6105b0dd811658849324c03f27539b83642062a1151b93
 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc 064b48e5b2971cb4e8edad95b27fbbfc2f6dcdc2de99e2df2944d9c2b0db4910
+obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taNobs_AIRS-2-1_gn_200209-201609.nc 0d166cf478089a910bb856d41cd8493e63a7fcd05e252345a285c374868d418a
+obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taStderr_AIRS-2-1_gn_200209-201609.nc 807fedb2e3f55939de0d894d4902fee0d476c533277f57064bc3b7c66b6b57ee
 obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc 689d9f175fab93428f1431f6625c8fd7f66700eaf332053485faf31992980b2a
diff --git a/scripts/fetch_test_data.py b/scripts/fetch_test_data.py
index 1bd47872..e46c4431 100644
--- a/scripts/fetch_test_data.py
+++ b/scripts/fetch_test_data.py
@@ -1,11 +1,6 @@
-"""
-Fetch test data
-
-Fetch and downscale test data from ESGF.
-"""
-
 import os
 import pathlib
+from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any
 
@@ -17,9 +12,261 @@
 OUTPUT_PATH = Path("data")
 
 
-def fetch_datasets(
-    search_facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None
-) -> pd.DataFrame:
+class DataRequest(ABC):
+    """
+    Represents a request for a dataset
+
+    A polymorphic association is used to capture the different types of datasets as each
+    dataset type may have different metadata fields and may need to be handled
+    differently to generate the sample data.
+    """
+
+    def __init__(self, remove_ensembles: bool, time_span: tuple[str, str]):
+        self.remove_ensembles = remove_ensembles
+        self.time_span = time_span
+
+    @abstractmethod
+    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
+        """Downscale the dataset to a smaller size."""
+        pass
+
+    @abstractmethod
+    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
+        """Create the output filename for the dataset."""
+        pass
+
+
+class CMIP6Request(DataRequest):
+    """
+    Represents a CMIP6 dataset request
+
+    """
+
+    def __init__(self, facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None):
+        self.avail_facets = [
+            "mip_era",
+            "activity_drs",
+            "institution_id",
+            "source_id",
+            "experiment_id",
+            "member_id",
+            "table_id",
+            "variable_id",
+            "grid_label",
+            "version",
+            "data_node",
+        ]
+
+        self.facets = facets
+
+        super().__init__(remove_ensembles, time_span)
+
+        self.cmip6_path_items = [
+            "mip_era",
+            "activity_drs",
+            "institution_id",
+            "source_id",
+            "experiment_id",
+            "member_id",
+            "table_id",
+            "variable_id",
+            "grid_label",
+        ]
+
+        self.cmip6_filename_paths = [
+            "variable_id",
+            "table_id",
+            "source_id",
+            "experiment_id",
+            "member_id",
+            "grid_label",
+        ]
+
+        assert all(key in self.avail_facets for key in self.cmip6_path_items), "Error message"
+        assert all(key in self.avail_facets for key in self.cmip6_filename_paths), "Error message"
+
+    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
+        """
+        Downscale the dataset to a smaller size.
+
+        Parameters
+        ----------
+        dataset
+            The dataset to downscale
+        time_span
+            The time span to extract from a dataset
+
+        Returns
+        -------
+        xr.Dataset
+            The downscaled dataset
+        """
+        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
+        has_ij = "i" in dataset.dims and "j" in dataset.dims
+
+        if has_latlon:
+            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
+            result = dataset.interp(lat=dataset.lat[:10], lon=dataset.lon[:10])
+        elif has_ij:
+            # 2d lat/lon grid (generally ocean variables)
+            # Choose a starting point around the middle of the grid to maximise chance that it has values
+            # TODO: Be smarter about this?
+            j_midpoint = len(dataset.j) // 2
+            result = dataset.interp(i=dataset.i[:10], j=dataset.j[j_midpoint : j_midpoint + 10])
+        else:
+            raise ValueError("Cannot decimate this grid: too many dimensions")
+
+        if "time" in dataset.dims and time_span is not None:
+            result = result.sel(time=slice(*time_span))
+            if result.time.size == 0:
+                result = None
+
+        return result
+
+    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
+        """
+        Create the output filename for the dataset.
+
+        Parameters
+        ----------
+        ds
+            Loaded dataset
+
+        Returns
+        -------
+        The output filename
+        """
+        output_path = (
+            Path(os.path.join(*[metadata[item] for item in self.cmip6_path_items]))
+            / f"v{metadata['version']}"
+        )
+        filename_prefix = "_".join([metadata[item] for item in self.cmip6_filename_paths])
+
+        if "time" in ds.dims:
+            time_range = (
+                f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
+            )
+            filename = f"{filename_prefix}_{time_range}.nc"
+        else:
+            filename = f"{filename_prefix}.nc"
+
+        return output_path / filename
+
+
+class Obs4MIPsRequest(DataRequest):
+    """
+    Represents a Obs4MIPs dataset request
+
+    """
+
+    def __init__(self, facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None):
+        self.avail_facets = [
+            "activity_id",
+            "institution_id",
+            "source_id",
+            "frequency",
+            "variable_id",
+            "grid_label",
+            "version",
+            "data_node",
+        ]
+
+        self.facets = facets
+
+        super().__init__(remove_ensembles, time_span)
+
+        self.obs4mips_path_items = [
+            "activity_id",
+            "institution_id",
+            "source_id",
+            "variable_id",
+            "grid_label",
+        ]
+
+        self.obs4mips_filename_paths = [
+            "variable_id",
+            "source_id",
+            "grid_label",
+        ]
+
+        assert all(key in self.avail_facets for key in self.obs4mips_path_items), "Error message"
+        assert all(key in self.avail_facets for key in self.obs4mips_filename_paths), "Error message"
+
+    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
+        """
+        Downscale the dataset to a smaller size.
+
+        Parameters
+        ----------
+        dataset
+            The dataset to downscale
+        time_span
+            The time span to extract from a dataset
+
+        Returns
+        -------
+        xr.Dataset
+            The downscaled dataset
+        """
+        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
+        has_ij = "i" in dataset.dims and "j" in dataset.dims
+
+        if has_latlon:
+            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
+            result = dataset.interp(lat=dataset.lat[:10], lon=dataset.lon[:10])
+        elif has_ij:
+            # 2d lat/lon grid (generally ocean variables)
+            # Choose a starting point around the middle of the grid to maximise chance that it has values
+            # TODO: Be smarter about this?
+            j_midpoint = len(dataset.j) // 2
+            result = dataset.interp(i=dataset.i[:10], j=dataset.j[j_midpoint : j_midpoint + 10])
+        else:
+            raise ValueError("Cannot decimate this grid: too many dimensions")
+
+        if "time" in dataset.dims and time_span is not None:
+            result = result.sel(time=slice(*time_span))
+            if result.time.size == 0:
+                result = None
+
+        return result
+
+    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
+        """
+        Create the output filename for the dataset.
+
+        Parameters
+        ----------
+        ds
+            Loaded dataset
+
+        Returns
+        -------
+        The output filename
+        """
+        output_path = (
+            Path(os.path.join(*[metadata[item] for item in self.obs4mips_path_items]))
+            / f"v{metadata['version']}"
+        )
+        if ds_filename.name.split("_")[0] == ds.variable_id:
+            filename_prefix = "_".join([metadata[item] for item in self.obs4mips_filename_paths])
+        else:
+            filename_prefix = ds_filename.name.split("_")[0] + "_"
+            filename_prefix += "_".join(
+                [metadata[item] for item in self.obs4mips_filename_paths if item != "variable_id"]
+            )
+
+        if "time" in ds.dims:
+            time_range = (
+                f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
+            )
+            filename = f"{filename_prefix}_{time_range}.nc"
+        else:
+            filename = f"{filename_prefix}.nc"
+
+        return output_path / filename
+
+
+def fetch_datasets(request: DataRequest) -> pd.DataFrame:
     """
     Fetch the datasets from ESGF.
 
@@ -37,20 +284,19 @@
     """
     cat = ESGFCatalog()
-    cat.search(**search_facets)
-    if remove_ensembles:
+    cat.search(**request.facets)
+    if request.remove_ensembles:
         cat.remove_ensembles()
     path_dict = cat.to_path_dict(prefer_streaming=False, minimal_keys=False)
 
     merged_df = cat.df.merge(pd.Series(path_dict, name="files"), left_on="key", right_index=True)
 
-    if time_span:
-        merged_df["time_start"] = time_span[0]
-        merged_df["time_end"] = time_span[1]
-
+    if request.time_span:
+        merged_df["time_start"] = request.time_span[0]
+        merged_df["time_end"] = request.time_span[1]
     return merged_df
 
 
-def deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
+def deduplicate_datasets(request: DataRequest) -> pd.DataFrame:
     """
     Deduplicate a dataset collection.
 
@@ -67,6 +313,7 @@ def deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
     pd.DataFrame
         The deduplicated dataset collection spanning the times requested
     """
+    datasets = fetch_datasets(request)
 
     def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
         first = group.iloc[0].copy()
@@ -78,46 +325,7 @@ def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
     return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
 
 
-def decimate_dataset(dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
-    """
-    Downscale the dataset to a smaller size.
-
-    Parameters
-    ----------
-    dataset
-        The dataset to downscale
-    time_span
-        The time span to extract from a dataset
-
-    Returns
-    -------
-    xr.Dataset
-        The downscaled dataset
-    """
-    has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
-    has_ij = "i" in dataset.dims and "j" in dataset.dims
-
-    if has_latlon:
-        assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
-        result = dataset.interp(lat=dataset.lat[:10], lon=dataset.lon[:10])
-    elif has_ij:
-        # 2d lat/lon grid (generally ocean variables)
-        # Choose a starting point around the middle of the grid to maximise chance that it has values
-        # TODO: Be smarter about this?
-        j_midpoint = len(dataset.j) // 2
-        result = dataset.interp(i=dataset.i[:10], j=dataset.j[j_midpoint : j_midpoint + 10])
-    else:
-        raise ValueError("Cannot decimate this grid: too many dimensions")
-
-    if "time" in dataset.dims and time_span is not None:
-        result = result.sel(time=slice(*time_span))
-        if result.time.size == 0:
-            result = None
-
-    return result
-
-
-def create_out_filename(metadata: pd.Series, ds: xr.Dataset) -> pathlib.Path:
+def create_sample_dataset(request: DataRequest):
     """
     Create the output filename for the dataset.
 
@@ -130,126 +338,70 @@
     -------
     The output filename
     """
-    cmip6_path_items = [
-        "mip_era",
-        "activity_drs",
-        "institution_id",
-        "source_id",
-        "experiment_id",
-        "member_id",
-        "table_id",
-        "variable_id",
-        "grid_label",
-    ]
-
-    cmip6_filename_paths = [
-        "variable_id",
-        "table_id",
-        "source_id",
-        "experiment_id",
-        "member_id",
-        "grid_label",
-    ]
-
-    obs4mips_path_items = [
-        "activity_id",
-        "institution_id",
-        "source_id",
-        "variable_id",
-        "grid_label",
-    ]
-
-    obs4mips_filename_paths = [
-        "variable_id",
-        "source_id",
-        "grid_label",
-    ]
+    datasets = deduplicate_datasets(request)
+    for _, dataset in datasets.iterrows():
+        for ds_filename in dataset["files"]:
+            ds_orig = xr.open_dataset(ds_filename)
+            ds_decimated = request.decimate_dataset(ds_orig, request.time_span)
+            if ds_decimated is None:
+                continue
 
-    if metadata.project == "obs4MIPs":
-        output_path = (
-            Path(os.path.join(*[metadata[item] for item in obs4mips_path_items])) / f"v{metadata['version']}"
-        )
-        filename_prefix = "_".join([metadata[item] for item in obs4mips_filename_paths])
-    else:
-        output_path = (
-            Path(os.path.join(*[metadata[item] for item in cmip6_path_items])) / f"v{metadata['version']}"
-        )
-        filename_prefix = "_".join([metadata[item] for item in cmip6_filename_paths])
+            output_filename = OUTPUT_PATH / request.create_out_filename(dataset, ds_decimated, ds_filename)
+            output_filename.parent.mkdir(parents=True, exist_ok=True)
+            ds_decimated.to_netcdf(output_filename)
 
-    if "time" in ds.dims:
-        time_range = f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
-        filename = f"{filename_prefix}_{time_range}.nc"
-    else:
-        filename = f"{filename_prefix}.nc"
-    return output_path / filename
+
+    # Regenerate the registry.txt file
+    pooch.make_registry(OUTPUT_PATH, "registry.txt")
 
 
 if __name__ == "__main__":
-    facets_to_fetch = [
+    datasets_to_fetch = [
         # Example metric data
-        dict(
-            source_id="ACCESS-ESM1-5",
-            frequency=["fx", "mon"],
-            variable_id=["areacella", "tas", "tos", "rsut", "rlut", "rsdt"],
-            experiment_id=["ssp126", "historical"],
+        CMIP6Request(
+            facets=dict(
+                source_id="ACCESS-ESM1-5",
+                frequency=["fx", "mon"],
+                variable_id=["areacella", "tas", "tos", "rsut", "rlut", "rsdt"],
+                experiment_id=["ssp126", "historical"],
+            ),
             remove_ensembles=True,
             time_span=("2000", "2025"),
         ),
         # ESMValTool ECS data
-        dict(
-            source_id="ACCESS-ESM1-5",
-            frequency=["fx", "mon"],
-            variable_id=["areacella", "rlut", "rsdt", "rsut", "tas"],
-            experiment_id=["abrupt-4xCO2", "piControl"],
+        CMIP6Request(
+            facets=dict(
+                source_id="ACCESS-ESM1-5",
+                frequency=["fx", "mon"],
+                variable_id=["areacella", "rlut", "rsdt", "rsut", "tas"],
+                experiment_id=["abrupt-4xCO2", "piControl"],
+            ),
            remove_ensembles=True,
             time_span=("0101", "0125"),
         ),
         # ESMValTool TCR data
-        dict(
-            source_id="ACCESS-ESM1-5",
-            frequency=["fx", "mon"],
-            variable_id=["areacella", "tas"],
-            experiment_id=["1pctCO2", "piControl"],
+        CMIP6Request(
+            facets=dict(
+                source_id="ACCESS-ESM1-5",
+                frequency=["fx", "mon"],
+                variable_id=["areacella", "tas"],
+                experiment_id=["1pctCO2", "piControl"],
+            ),
             remove_ensembles=True,
             time_span=("0101", "0180"),
         ),
         # Obs4MIPs AIRS data
-        dict(
-            project="obs4MIPs",
-            institution_id="NASA-JPL",
-            frequency="mon",
-            source_id="AIRS-2-1",
-            variable_id="ta",
+        Obs4MIPsRequest(
+            facets=dict(
+                project="obs4MIPs",
+                institution_id="NASA-JPL",
+                frequency="mon",
+                source_id="AIRS-2-1",
+                variable_id="ta",
+            ),
             remove_ensembles=False,
             time_span=("2002", "2016"),
         ),
     ]
 
-    dataset_metadata_collection: list[pd.DataFrame] = []
-    for facets in facets_to_fetch:
-        dataset_metadata_collection.append(
-            fetch_datasets(
-                facets,
-                remove_ensembles=facets.pop("remove_ensembles", False),
-                time_span=facets.pop("time_span", None),
-            )
-        )
-
-    # Combine all datasets
-    datasets = deduplicate_datasets(pd.concat(dataset_metadata_collection))
-
-    for _, dataset in datasets.iterrows():
-        for ds_filename in dataset["files"]:
-            if ds_filename.name.split("_")[0] != dataset.variable_id:
-                continue
-            ds_orig = xr.open_dataset(ds_filename)
-            ds_decimated = decimate_dataset(ds_orig, time_span=(dataset["time_start"], dataset["time_end"]))
-            if ds_decimated is None:
-                continue
-
-            output_filename = OUTPUT_PATH / create_out_filename(dataset, ds_decimated)
-            output_filename.parent.mkdir(parents=True, exist_ok=True)
-            ds_decimated.to_netcdf(output_filename)
-
-    # Regenerate the registry.txt file
-    pooch.make_registry(OUTPUT_PATH, "registry.txt")
+    for dataset_requested in datasets_to_fetch:
+        create_sample_dataset(dataset_requested)