diff --git a/changelog/11.feature.md b/changelog/11.feature.md new file mode 100644 index 00000000..03f5011b --- /dev/null +++ b/changelog/11.feature.md @@ -0,0 +1 @@ +Added Obs4MIPs sample data to the fetching script. diff --git a/changelog/12.feature.md b/changelog/12.feature.md new file mode 100644 index 00000000..2204412d --- /dev/null +++ b/changelog/12.feature.md @@ -0,0 +1 @@ +Restructured code into classes so that additional sample datasets can be fetched easily in the future. diff --git a/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc b/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc new file mode 100644 index 00000000..b5200e4e Binary files /dev/null and b/data/obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc differ diff --git a/registry.txt b/registry.txt index bd3e3a92..ca6fd453 100644 --- a/registry.txt +++ b/registry.txt @@ -21,3 +21,4 @@ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsu CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 3124671936cb2554af0a1f48b814fa8bb186a0ee2af6bcc86b5cb126b107d7a2 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Omon/tos/gn/v20210318/tos_Omon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 10d13b1250f5483e5d6105b0dd811658849324c03f27539b83642062a1151b93 CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc 064b48e5b2971cb4e8edad95b27fbbfc2f6dcdc2de99e2df2944d9c2b0db4910 +obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc 689d9f175fab93428f1431f6625c8fd7f66700eaf332053485faf31992980b2a diff --git a/scripts/fetch_test_data.py b/scripts/fetch_test_data.py index 489143fc..1bd47872 100644 --- a/scripts/fetch_test_data.py +++ b/scripts/fetch_test_data.py @@ -42,7 +42,6 @@ def fetch_datasets( cat.remove_ensembles() path_dict = cat.to_path_dict(prefer_streaming=False, minimal_keys=False) - merged_df = cat.df.merge(pd.Series(path_dict, name="files"), left_on="key", right_index=True) if time_span: merged_df["time_start"] = time_span[0] @@ -152,10 +151,30 @@ def create_out_filename(metadata: pd.Series, ds: xr.Dataset) -> pathlib.Path: "grid_label", ] - output_path = ( - Path(os.path.join(*[metadata[item] for item in cmip6_path_items])) / f"v{metadata['version']}" - ) - filename_prefix = "_".join([metadata[item] for item in cmip6_filename_paths]) + obs4mips_path_items = [ + "activity_id", + "institution_id", + "source_id", + "variable_id", + "grid_label", + ] + + obs4mips_filename_paths = [ + "variable_id", + "source_id", + "grid_label", + ] + + if metadata.project == "obs4MIPs": + output_path = ( + Path(os.path.join(*[metadata[item] for item in obs4mips_path_items])) / f"v{metadata['version']}" + ) + filename_prefix = "_".join([metadata[item] for item in obs4mips_filename_paths]) + else: + output_path = ( + Path(os.path.join(*[metadata[item] for item in cmip6_path_items])) / f"v{metadata['version']}" + ) + filename_prefix = "_".join([metadata[item] for item in cmip6_filename_paths]) if "time" in ds.dims: time_range = f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}" @@ -194,6 +213,16 @@ def create_out_filename(metadata: pd.Series, ds: xr.Dataset) -> pathlib.Path: remove_ensembles=True, time_span=("0101", "0180"), ), + # Obs4MIPs AIRS data + dict( + project="obs4MIPs", + institution_id="NASA-JPL", + frequency="mon", + source_id="AIRS-2-1", + variable_id="ta", + remove_ensembles=False, + time_span=("2002", "2016"), + ), ] dataset_metadata_collection: list[pd.DataFrame] = [] @@ -210,10 +239,10 @@ def create_out_filename(metadata: pd.Series, ds: xr.Dataset) -> pathlib.Path: datasets = deduplicate_datasets(pd.concat(dataset_metadata_collection)) for _, dataset in datasets.iterrows(): - print(dataset.key) for ds_filename in dataset["files"]: + if ds_filename.name.split("_")[0] != dataset.variable_id: + continue ds_orig = xr.open_dataset(ds_filename) - ds_decimated = decimate_dataset(ds_orig, time_span=(dataset["time_start"], dataset["time_end"])) if ds_decimated is None: continue