diff --git a/CHANGELOG.md b/CHANGELOG.md index a1875c2..9cf6b2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [0.6.2 - 20260-04-30] +## [0.6.3 - 2026-05-07] +### CHANGED +- Update gebco fetchez module to fetch new 2026 grid, and allow subsetting! +- Moved recipe validation from cli to fetchez.recipe +- Updated verbosity in fetchez.core + + +## [0.6.2 - 2026-04-30] ### ADDED - ProfileRegistry - streams - ReaderRegistry - streams @@ -21,7 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - core.run_fetchez now returns the final list of entries for use in the api and elsewhere. - Update run cli to allow for inherited options for modules. -## BUGFIX +### BUGFIX - fix bug in earthdata.icesat2 for harmony fetching. ## [0.5.5 - 2026-04-24] diff --git a/CITATION.cff b/CITATION.cff index 19fa8a9..d0027f1 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -16,6 +16,6 @@ authors: website: https://ciresdem.github.io/fetchez/ title: "Fetchez" -version: 0.6.2 -date-released: 2026-04-24 +version: 0.6.3 +date-released: 2026-05-07 url: "https://github.com/ciresdem/fetchez" diff --git a/README.md b/README.md index 64706bf..fdd13cd 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@

Fetch geospatial data with ease.

- Version + Version License Python PyPI version diff --git a/src/fetchez/cli/recipes.py b/src/fetchez/cli/recipes.py index 42ec2c4..54477ef 100644 --- a/src/fetchez/cli/recipes.py +++ b/src/fetchez/cli/recipes.py @@ -23,39 +23,6 @@ RECIPE_COMMANDS = ["copy", "dump", "info", "list", "validate", "run", "schemas"] -def validate_dependencies(recipe_obj): - """Interrogates all modules and hooks to ensure heavy dependencies exist.""" - - errors = [] - - # Check all global hooks - for hook in getattr(recipe_obj, "global_hooks", []): - if hasattr(hook, "_validate_deps"): - passed, msg = hook._validate_deps() - if not passed: - errors.append(f"[{hook.name}] {msg}") - - # Check all streaming hooks inside the modules - for mod in getattr(recipe_obj, "modules", []): - for hook in getattr(mod, "hooks", []): - if hasattr(hook, "_validate_deps"): - passed, msg = hook._validate_deps() - if not passed: - errors.append(f"[{mod.name} -> {hook.name}] {msg}") - - if errors: - click.secho("\n[ DEPENDENCY VALIDATION CHECK FAILED ]", fg="red", bold=True) - click.secho( - "The following dependencies are missing for this recipe:", fg="yellow" - ) - for error in errors: - click.echo(f" {error}") - click.echo( - "\nPlease install the required packages or modify the recipe and try again.\n" - ) - sys.exit(1) - - def _load_yaml(target): base_config = None if os.path.exists(target) and not os.path.isdir(target): @@ -186,12 +153,7 @@ def copy_recipe(name): @recipes_group.command("validate", cls=FetchezMainCommand) @click.argument("name") def recipe_validate(name): - """Check a recipe for syntax errors and missing modules/hooks.""" - - from fetchez.registry import ModuleRegistry, HookRegistry - - ModuleRegistry.load_all() - HookRegistry.load_all() + """Check a recipe for syntax errors, logical issues, and missing dependencies.""" base_config = _load_yaml(name) if not base_config: @@ -200,57 +162,20 @@ def recipe_validate(name): ) sys.exit(1) - errors = 0 click.secho(f"Validating {name}...", 
fg="blue") - validate_dependencies(base_config) - - for mod in base_config.get("modules", []): - mod_name = mod.get("module") - mod_keys = mod.keys() - valid_keys = ["module", "hooks", "args"] - - for key in mod_keys: - if key not in valid_keys: - click.secho( - f" Module `{mod_name}` has unexpected reference to `{key}`", - fg="red", - ) - errors += 1 - - if not ModuleRegistry.get_class(mod_name) and mod_name not in [ - "file", - "local_fs", - ]: - click.secho(f" Missing Module: '{mod_name}'", fg="red") - errors += 1 - else: - click.secho(f" Valid Module: '{mod_name}'", fg="green") - - for hook in mod.get("hooks", []): - if not HookRegistry.get_class(hook.get("name")): - click.secho( - f" Missing Hook: '{hook.get('name')}' (in module {mod_name})", - fg="red", - ) - errors += 1 - else: - click.secho( - f" Valid Hook: '{hook.get('name')}' (in module {mod_name})", - fg="green", - ) - - for hook in base_config.get("global_hooks", []): - if not HookRegistry.get_class(hook.get("name")): - click.secho(f" Missing Global Hook: '{hook.get('name')}'", fg="red") - errors += 1 - else: - click.secho(f" Valid Hook: '{hook.get('name')}'", fg="green") - - if errors == 0: + recipe_obj = Recipe.from_dict(base_config) + is_valid, errors = recipe_obj.validate() + + if is_valid: click.secho("Recipe appears valid!", fg="green", bold=True) else: - click.secho(f"Failed validation with {errors} errors.", fg="red", bold=True) + click.secho( + f"\n[ VALIDATION FAILED WITH {len(errors)} ERRORS ]", fg="red", bold=True + ) + for error in errors: + click.echo(f" {error}") + click.echo("\nPlease modify the recipe and try again.\n") sys.exit(1) diff --git a/src/fetchez/core.py b/src/fetchez/core.py index a423bb6..32c63b3 100644 --- a/src/fetchez/core.py +++ b/src/fetchez/core.py @@ -468,7 +468,7 @@ def fetch_req( return req except Exception as e: - logger.warning(f"Attempt {attempt + 1}/{tries} failed: {e}") + logger.debug(f"Attempt {attempt + 1}/{tries} failed: {e}") if current_timeout: 
current_timeout *= 2 if current_read_timeout: @@ -610,7 +610,7 @@ def fetch_file( if req.status_code == 416: # Range No Good: Local file is likely corrupt. # Delete .part and retry from scratch (next loop iteration) - logger.warning( + logger.debug( f"Invalid Range for {os.path.basename(dst_fn)}. Restarting..." ) if os.path.exists(part_fn): @@ -704,7 +704,7 @@ def fetch_file( ) as e: if attempt < tries - 1: wait_time = (attempt + 1) * 2 - logger.warning(f"Download failed: {e}. Retrying in {wait_time}s...") + logger.debug(f"Download failed: {e}. Retrying in {wait_time}s...") time.sleep(wait_time) else: logger.warning(f"Failed to download {self.url}: {e}") @@ -888,7 +888,7 @@ def run_fetchez(modules: List["FetchModule"], threads: int = 3, global_hooks=Non original_entry.update({"status": status}) if status != 0: logger.error( - f"Failed: {os.path.basename(original_entry['dst_fn'])}" + f"Failed to fetch: {os.path.basename(original_entry['dst_fn'])}" ) except Exception as e: logger.error(f"Worker exception: {e}") @@ -932,7 +932,7 @@ def run_fetchez(modules: List["FetchModule"], threads: int = 3, global_hooks=Non ) collections.deque(stream, maxlen=0) except Exception as e: - logger.error( + logger.exception( f"Stream processing error in {os.path.basename(item.get('dst_fn', ''))}: {e}" ) diff --git a/src/fetchez/hooks/sidecar.py b/src/fetchez/hooks/sidecar.py index 45f2dc0..1277cb1 100644 --- a/src/fetchez/hooks/sidecar.py +++ b/src/fetchez/hooks/sidecar.py @@ -58,6 +58,9 @@ def run(self, entries): try: with open(meta_fn, "w") as f: json.dump(meta_data, f, indent=2) + + # Add sidecar as an artifact to the entry + entry.setdefault("artifacts", {})[self.name] = meta_fn except Exception as e: logger.warning(f"Failed to write sidecar for {filepath}: {e}") diff --git a/src/fetchez/modules/gebco.py b/src/fetchez/modules/gebco.py index fa2e700..505995f 100644 --- a/src/fetchez/modules/gebco.py +++ b/src/fetchez/modules/gebco.py @@ -13,24 +13,34 @@ :license: MIT, see LICENSE for 
more details. """ +import math import logging +import urllib.parse from fetchez.modules import FetchModule logger = logging.getLogger(__name__) -# ============================================================================= -# Data Sources -# ============================================================================= +# Base NCSS endpoints for GEBCO 2026 +GEBCO_NCSS_URLS = { + "grid": "https://dap.ceda.ac.uk/thredds/ncss/bodc/gebco/global/gebco_2026/ice_surface_elevation/netcdf/GEBCO_2026.nc", + "tid": "https://dap.ceda.ac.uk/thredds/ncss/bodc/gebco/global/gebco_2026/type_identifier_grid/netcdf/gebco_2026_tid.nc", + "sub_ice": "https://dap.ceda.ac.uk/thredds/ncss/bodc/gebco/global/gebco_2026/sub_ice_topo/netcdf/GEBCO_2026_sub_ice_topo.nc", +} -# BODC / Official GEBCO - Global Zipped Downloads -GEBCO_GLOBAL_URLS = { - "grid": "https://www.bodc.ac.uk/data/open_download/gebco/gebco_2024/geotiff/", - "tid": "https://www.bodc.ac.uk/data/open_download/gebco/gebco_2024_tid/geotiff/", - "sub_ice": "https://www.bodc.ac.uk/data/open_download/gebco/gebco_2024_sub_ice_topo/geotiff/", +# Base OpenDAP (DODS) endpoints for GEBCO 2026 +GEBCO_DAP_URLS = { + "grid": "https://dap.ceda.ac.uk/thredds/dodsC/bodc/gebco/global/gebco_2026/ice_surface_elevation/netcdf/GEBCO_2026.nc", + "tid": "https://dap.ceda.ac.uk/thredds/dodsC/bodc/gebco/global/gebco_2026/type_identifier_grid/netcdf/gebco_2026_tid.nc", + "sub_ice": "https://dap.ceda.ac.uk/thredds/dodsC/bodc/gebco/global/gebco_2026/sub_ice_topo/netcdf/GEBCO_2026_sub_ice_topo.nc", } class GEBCO(FetchModule): + """Fetch GEBCO 2026 bathymetry dynamically via THREDDS NCSS. + Requires ZERO external dependencies (No GDAL required). + Downloads true, mathematically cropped .nc files! 
+ """ + name = "gebco" meta_category = "Bathymetry" meta_desc = "General Bathymetric Chart of the Oceans (GEBCO)" @@ -41,50 +51,128 @@ class GEBCO(FetchModule): meta_license = "Public Domain / Attribution" meta_urls = {"home": "https://www.gebco.net/"} - """Fetch GEBCO global bathymetry data. + def __init__(self, layer="grid", include_tid=False, **kwargs): + super().__init__(name="gebco_opendap", **kwargs) + self.layer = layer.lower() + self.include_tid = str(include_tid).lower() in ["true", "1", "yes"] - GEBCO provides a global terrain model at ~15 arc-seconds (~500m). + def run(self): + if not getattr(self, "region", None) or self.region.to_list() == [ + -180, + 180, + -90, + 90, + ]: + logger.error( + "You must provide a strict bounding region (-R) to use the NCSS subsetter!" + ) + return - Layers: - - grid: Standard bathymetry/topography (Ice Surface). Default. - - sub_ice: Bedrock elevation (Ice removed). - - tid: Type Identifier (Source of data per pixel). + w, e, s, n = ( + self.region.xmin, + self.region.xmax, + self.region.ymin, + self.region.ymax, + ) - Examples: - fetchez gebco -R -90/-89/25/26 - fetchez gebco --layer tid -R ... - fetchez gebco --layer sub_ice - """ + base_query = { + "north": n, + "south": s, + "west": w, + "east": e, + "horizStride": 1, + "addLatLon": "true", + "accept": "netcdf", + } - def __init__(self, layer="grid", **kwargs): - """ - Args: - layer (str): 'grid', 'tid', or 'sub_ice'. - global_grid (bool): Legacy flag, forces source='global'. 
- """ + grid_base = GEBCO_NCSS_URLS.get(self.layer, GEBCO_NCSS_URLS["grid"]) + grid_query = base_query.copy() + grid_query["var"] = "elevation" - super().__init__(name="gebco", **kwargs) + grid_url = f"{grid_base}?{urllib.parse.urlencode(grid_query)}" + grid_fn = f"gebco_2026_{self.layer}_{w}_{e}_{s}_{n}.nc" + + self.add_entry_to_results(url=grid_url, dst_fn=grid_fn, data_type="netcdf") + + if self.include_tid: + tid_base = GEBCO_NCSS_URLS["tid"] + tid_query = base_query.copy() + tid_query["var"] = "tid" + + tid_url = f"{tid_base}?{urllib.parse.urlencode(tid_query)}" + tid_fn = f"gebco_2026_tid_{w}_{e}_{s}_{n}.nc" + + self.add_entry_to_results(url=tid_url, dst_fn=tid_fn, data_type="netcdf") + + +class GEBCO_OpenDAP(FetchModule): + """Fetch GEBCO 2026 bathymetry dynamically via OpenDAP HTTP Subsetting. + Requires ZERO external dependencies (No GDAL required). + """ + + name = "gebco_opendap" + meta_category = "Bathymetry" + meta_desc = "GEBCO 2026 via OpenDAP Array Subsetting" + meta_agency = "GEBCO / IHO / IOC" + meta_tags = ["gebco", "bathymetry", "global", "wcs", "tid"] + meta_region = "Global" + meta_resolution = "15 arc-seconds (~500m)" + meta_license = "Public Domain / Attribution" + meta_urls = {"home": "https://www.gebco.net/"} + def __init__(self, layer="grid", include_tid=False, **kwargs): + super().__init__(name="gebco_opendap", **kwargs) self.layer = layer.lower() + # Ensure boolean parsing from YAML + self.include_tid = str(include_tid).lower() in ["true", "1", "yes"] - if self.layer not in GEBCO_GLOBAL_URLS: - valid = ", ".join(GEBCO_GLOBAL_URLS.keys()) - logger.warning( - f"Unknown GEBCO layer '{self.layer}'. Defaulting to 'grid'. Valid: {valid}" + def run(self): + if not getattr(self, "region", None) or self.region.to_list() == [ + -180, + 180, + -90, + 90, + ]: + logger.error( + "You must provide a strict bounding region (-R) to use the OpenDAP subsetter!" 
) - self.layer = "grid" + return - def run(self): - """Setup for Official Global Download.""" + w, e, s, n = ( + self.region.xmin, + self.region.xmax, + self.region.ymin, + self.region.ymax, + ) - url = GEBCO_GLOBAL_URLS.get(self.layer) - if not url: - logger.error(f"No Global URL available for layer '{self.layer}'.") - return + # GEBCO is 15 arc-seconds (240 pixels per degree) + # Grid Extents: -180 to 180 (X), -90 to 90 (Y) + + x1 = max(0, int(math.floor((w + 180) * 240))) + x2 = min(86400, int(math.ceil((e + 180) * 240))) + + y1 = max(0, int(math.floor((s + 90) * 240))) + y2 = min(43200, int(math.ceil((n + 90) * 240))) + + # Query format: ?variable[y1:1:y2][x1:1:x2],lat[y1:1:y2],lon[x1:1:x2] + grid_base = GEBCO_DAP_URLS.get(self.layer, GEBCO_DAP_URLS["grid"]) + z_var = "tid" if self.layer == "tid" else "elevation" - dst_fn = f"gebco_2024_{self.layer}_global.zip" - self.add_entry_to_results( - url=url, - dst_fn=dst_fn, - data_type="archive", + grid_query = ( + f"?{z_var}[{y1}:1:{y2}][{x1}:1:{x2}],lat[{y1}:1:{y2}],lon[{x1}:1:{x2}]" + ) + # grid_query = f"?{z_var}[0:1:0][0:1:0],lat[{y1}:1:{y2}],lon[{x1}:1:{x2}]" + grid_url = f"{grid_base}.dods{grid_query}" + + grid_fn = f"gebco_2026_{self.layer}_{w}_{e}_{s}_{n}.nc" + + self.add_entry_to_results(url=grid_url, dst_fn=grid_fn, data_type="netcdf") + + if self.include_tid: + tid_base = GEBCO_DAP_URLS["tid"] + tid_query = ( + f"?tid[{y1}:1:{y2}][{x1}:1:{x2}],lat[{y1}:1:{y2}],lon[{x1}:1:{x2}]" + ) + tid_url = f"{tid_base}.dods{tid_query}" + tid_fn = f"gebco_2026_tid_{w}_{e}_{s}_{n}.nc" + self.add_entry_to_results(url=tid_url, dst_fn=tid_fn, data_type="netcdf") diff --git a/src/fetchez/modules/local_fs.py b/src/fetchez/modules/local_fs.py index a360f4a..ddfa245 100644 --- a/src/fetchez/modules/local_fs.py +++ b/src/fetchez/modules/local_fs.py @@ -40,7 +40,7 @@ class LocalFS(FetchModule): """Local data path Datalists.""" - def __init__(self, path=".", ext=".tif", datatype="raster", **kwargs): + def __init__(self, path=".", ext=".tif", datatype=None, **kwargs): 
super().__init__(name="local_fs", **kwargs) self.path = os.path.abspath(path) self.ext = ext if ext.startswith(".") else f".{ext}" diff --git a/src/fetchez/recipe.py b/src/fetchez/recipe.py index 36221e3..6e1ff74 100644 --- a/src/fetchez/recipe.py +++ b/src/fetchez/recipe.py @@ -402,3 +402,86 @@ def _generate_receipt(self): logger.info(f"📄 Full processing receipt saved to: {receipt_filename}") logger.info("=" * 67) + + def validate(self): + """Validates the recipe for syntax, missing plugins, dependencies, and logical errors. + + Returns: + bool: True if valid, False if errors exist. + list: List of error messages. + """ + + ModuleRegistry.load_all() + HookRegistry.load_all() + + errors = [] + claimed_outputs = set() + + def check_output_collision(hook_dict, context_name): + """Helper to check if a hook is clobbering an existing file.""" + + out_file = hook_dict.get("args", {}).get("output") + if out_file: + if out_file in claimed_outputs: + errors.append( + f"[{context_name}] Output Collision: Multiple hooks are attempting to write to '{out_file}'." 
+ ) + claimed_outputs.add(out_file) + + # Validate Modules + for mod in self.config.get("modules", []): + mod_name = mod.get("module") + mod_keys = mod.keys() + valid_keys = ["module", "bundle", "hooks", "args", "region"] + + for key in mod_keys: + if key not in valid_keys: + errors.append( + f"Module `{mod_name}` has unexpected reference to `{key}`" + ) + + if not ModuleRegistry.get_class(mod_name) and mod_name not in [ + "file", + "local_fs", + ]: + errors.append(f"Missing Module: '{mod_name}'") + + # Check Module-level Hooks + # mod_hook_counts = {} + for hook in mod.get("hooks", []): + h_name = hook.get("name") + HookCls = HookRegistry.get_class(h_name) + + if not HookCls: + errors.append(f"Missing Hook: '{h_name}' (in module {mod_name})") + continue + + # Dependency Check + if hasattr(HookCls, "_validate_deps"): + passed, msg = HookCls()._validate_deps() + if not passed: + errors.append( + f"[{mod_name} -> {h_name}] Missing Dependency: {msg}" + ) + + check_output_collision(hook, f"Module: {mod_name}") + + # Validate Global Hooks + # global_hook_counts = {} + for hook in self.config.get("global_hooks", []): + h_name = hook.get("name") + HookCls = HookRegistry.get_class(h_name) + + if not HookCls: + errors.append(f"Missing Global Hook: '{h_name}'") + continue + + # Dependency Check + if hasattr(HookCls, "_validate_deps"): + passed, msg = HookCls()._validate_deps() + if not passed: + errors.append(f"[Global -> {h_name}] Missing Dependency: {msg}") + + check_output_collision(hook, "Global Hooks") + + return len(errors) == 0, errors diff --git a/src/fetchez/utils.py b/src/fetchez/utils.py index 637e5d3..a425ee9 100644 --- a/src/fetchez/utils.py +++ b/src/fetchez/utils.py @@ -323,6 +323,7 @@ def fn_url_p(fn): def inc2str(inc): """Convert a WGS84 geographic increment to a string identifier.""" + import fractions return str(fractions.Fraction(str(inc * 3600)).limit_denominator(10)).replace(