diff --git a/CHANGELOG.md b/CHANGELOG.md index a1875c2..9cf6b2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [0.6.2 - 20260-04-30] +## [0.6.3 - 2026-05-07] +### CHANGED +- Update gebco fetchez module to fetch new 2026 grid, and allow subsetting! +- Moved recipe validation from cli to fetchez.recipe +- Updated verbosity in fetchez.core + + +## [0.6.2 - 2026-04-30] ### ADDED - ProfileRegistry - streams - ReaderRegistry - streams @@ -21,7 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - core.run_fetchez now returns the final list of entries for use in the api and elsewhere. - Update run cli to allow for inherited options for modules. -## BUGFIX +### BUGFIX - fix bug in earthdata.icesat2 for harmony fetching. ## [0.5.5 - 2026-04-24] diff --git a/CITATION.cff b/CITATION.cff index 19fa8a9..d0027f1 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -16,6 +16,6 @@ authors: website: https://ciresdem.github.io/fetchez/ title: "Fetchez" -version: 0.6.2 -date-released: 2026-04-24 +version: 0.6.3 +date-released: 2026-05-07 url: "https://github.com/ciresdem/fetchez" diff --git a/README.md b/README.md index 64706bf..fdd13cd 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@
Fetch geospatial data with ease.
-
+
diff --git a/src/fetchez/cli/recipes.py b/src/fetchez/cli/recipes.py
index 42ec2c4..54477ef 100644
--- a/src/fetchez/cli/recipes.py
+++ b/src/fetchez/cli/recipes.py
@@ -23,39 +23,6 @@
RECIPE_COMMANDS = ["copy", "dump", "info", "list", "validate", "run", "schemas"]
-def validate_dependencies(recipe_obj):
- """Interrogates all modules and hooks to ensure heavy dependencies exist."""
-
- errors = []
-
- # Check all global hooks
- for hook in getattr(recipe_obj, "global_hooks", []):
- if hasattr(hook, "_validate_deps"):
- passed, msg = hook._validate_deps()
- if not passed:
- errors.append(f"[{hook.name}] {msg}")
-
- # Check all streaming hooks inside the modules
- for mod in getattr(recipe_obj, "modules", []):
- for hook in getattr(mod, "hooks", []):
- if hasattr(hook, "_validate_deps"):
- passed, msg = hook._validate_deps()
- if not passed:
- errors.append(f"[{mod.name} -> {hook.name}] {msg}")
-
- if errors:
- click.secho("\n[ DEPENDENCY VALIDATION CHECK FAILED ]", fg="red", bold=True)
- click.secho(
- "The following dependencies are missing for this recipe:", fg="yellow"
- )
- for error in errors:
- click.echo(f" {error}")
- click.echo(
- "\nPlease install the required packages or modify the recipe and try again.\n"
- )
- sys.exit(1)
-
-
def _load_yaml(target):
base_config = None
if os.path.exists(target) and not os.path.isdir(target):
@@ -186,12 +153,7 @@ def copy_recipe(name):
@recipes_group.command("validate", cls=FetchezMainCommand)
@click.argument("name")
def recipe_validate(name):
- """Check a recipe for syntax errors and missing modules/hooks."""
-
- from fetchez.registry import ModuleRegistry, HookRegistry
-
- ModuleRegistry.load_all()
- HookRegistry.load_all()
+ """Check a recipe for syntax errors, logical issues, and missing dependencies."""
base_config = _load_yaml(name)
if not base_config:
@@ -200,57 +162,20 @@ def recipe_validate(name):
)
sys.exit(1)
- errors = 0
click.secho(f"Validating {name}...", fg="blue")
- validate_dependencies(base_config)
-
- for mod in base_config.get("modules", []):
- mod_name = mod.get("module")
- mod_keys = mod.keys()
- valid_keys = ["module", "hooks", "args"]
-
- for key in mod_keys:
- if key not in valid_keys:
- click.secho(
- f" Module `{mod_name}` has unexpected reference to `{key}`",
- fg="red",
- )
- errors += 1
-
- if not ModuleRegistry.get_class(mod_name) and mod_name not in [
- "file",
- "local_fs",
- ]:
- click.secho(f" Missing Module: '{mod_name}'", fg="red")
- errors += 1
- else:
- click.secho(f" Valid Module: '{mod_name}'", fg="green")
-
- for hook in mod.get("hooks", []):
- if not HookRegistry.get_class(hook.get("name")):
- click.secho(
- f" Missing Hook: '{hook.get('name')}' (in module {mod_name})",
- fg="red",
- )
- errors += 1
- else:
- click.secho(
- f" Valid Hook: '{hook.get('name')}' (in module {mod_name})",
- fg="green",
- )
-
- for hook in base_config.get("global_hooks", []):
- if not HookRegistry.get_class(hook.get("name")):
- click.secho(f" Missing Global Hook: '{hook.get('name')}'", fg="red")
- errors += 1
- else:
- click.secho(f" Valid Hook: '{hook.get('name')}'", fg="green")
-
- if errors == 0:
+ recipe_obj = Recipe.from_dict(base_config)
+ is_valid, errors = recipe_obj.validate()
+
+ if is_valid:
click.secho("Recipe appears valid!", fg="green", bold=True)
else:
- click.secho(f"Failed validation with {errors} errors.", fg="red", bold=True)
+ click.secho(
+ f"\n[ VALIDATION FAILED WITH {len(errors)} ERRORS ]", fg="red", bold=True
+ )
+ for error in errors:
+ click.echo(f" {error}")
+ click.echo("\nPlease modify the recipe and try again.\n")
sys.exit(1)
diff --git a/src/fetchez/core.py b/src/fetchez/core.py
index a423bb6..32c63b3 100644
--- a/src/fetchez/core.py
+++ b/src/fetchez/core.py
@@ -468,7 +468,7 @@ def fetch_req(
return req
except Exception as e:
- logger.warning(f"Attempt {attempt + 1}/{tries} failed: {e}")
+ logger.debug(f"Attempt {attempt + 1}/{tries} failed: {e}")
if current_timeout:
current_timeout *= 2
if current_read_timeout:
@@ -610,7 +610,7 @@ def fetch_file(
if req.status_code == 416:
# Range No Good: Local file is likely corrupt.
# Delete .part and retry from scratch (next loop iteration)
- logger.warning(
+ logger.debug(
f"Invalid Range for {os.path.basename(dst_fn)}. Restarting..."
)
if os.path.exists(part_fn):
@@ -704,7 +704,7 @@ def fetch_file(
) as e:
if attempt < tries - 1:
wait_time = (attempt + 1) * 2
- logger.warning(f"Download failed: {e}. Retrying in {wait_time}s...")
+ logger.debug(f"Download failed: {e}. Retrying in {wait_time}s...")
time.sleep(wait_time)
else:
logger.warning(f"Failed to download {self.url}: {e}")
@@ -888,7 +888,7 @@ def run_fetchez(modules: List["FetchModule"], threads: int = 3, global_hooks=Non
original_entry.update({"status": status})
if status != 0:
logger.error(
- f"Failed: {os.path.basename(original_entry['dst_fn'])}"
+ f"Failed to fetch: {os.path.basename(original_entry['dst_fn'])}"
)
except Exception as e:
logger.error(f"Worker exception: {e}")
@@ -932,7 +932,7 @@ def run_fetchez(modules: List["FetchModule"], threads: int = 3, global_hooks=Non
)
collections.deque(stream, maxlen=0)
except Exception as e:
- logger.error(
+ logger.exception(
f"Stream processing error in {os.path.basename(item.get('dst_fn', ''))}: {e}"
)
diff --git a/src/fetchez/hooks/sidecar.py b/src/fetchez/hooks/sidecar.py
index 45f2dc0..1277cb1 100644
--- a/src/fetchez/hooks/sidecar.py
+++ b/src/fetchez/hooks/sidecar.py
@@ -58,6 +58,9 @@ def run(self, entries):
try:
with open(meta_fn, "w") as f:
json.dump(meta_data, f, indent=2)
+
+ # Add sidecar as an artifact to the entry
+ entry.setdefault("artifacts", {})[self.name] = meta_fn
except Exception as e:
logger.warning(f"Failed to write sidecar for {filepath}: {e}")
diff --git a/src/fetchez/modules/gebco.py b/src/fetchez/modules/gebco.py
index fa2e700..505995f 100644
--- a/src/fetchez/modules/gebco.py
+++ b/src/fetchez/modules/gebco.py
@@ -13,24 +13,34 @@
:license: MIT, see LICENSE for more details.
"""
+import math
import logging
+import urllib.parse
from fetchez.modules import FetchModule
logger = logging.getLogger(__name__)
-# =============================================================================
-# Data Sources
-# =============================================================================
+# Base NCSS endpoints for GEBCO 2026
+GEBCO_NCSS_URLS = {
+ "grid": "https://dap.ceda.ac.uk/thredds/ncss/bodc/gebco/global/gebco_2026/ice_surface_elevation/netcdf/GEBCO_2026.nc",
+ "tid": "https://dap.ceda.ac.uk/thredds/ncss/bodc/gebco/global/gebco_2026/type_identifier_grid/netcdf/gebco_2026_tid.nc",
+ "sub_ice": "https://dap.ceda.ac.uk/thredds/ncss/bodc/gebco/global/gebco_2026/sub_ice_topo/netcdf/GEBCO_2026_sub_ice_topo.nc",
+}
-# BODC / Official GEBCO - Global Zipped Downloads
-GEBCO_GLOBAL_URLS = {
- "grid": "https://www.bodc.ac.uk/data/open_download/gebco/gebco_2024/geotiff/",
- "tid": "https://www.bodc.ac.uk/data/open_download/gebco/gebco_2024_tid/geotiff/",
- "sub_ice": "https://www.bodc.ac.uk/data/open_download/gebco/gebco_2024_sub_ice_topo/geotiff/",
+# Base OpenDAP (DODS) endpoints for GEBCO 2026
+GEBCO_DAP_URLS = {
+ "grid": "https://dap.ceda.ac.uk/thredds/dodsC/bodc/gebco/global/gebco_2026/ice_surface_elevation/netcdf/GEBCO_2026.nc",
+ "tid": "https://dap.ceda.ac.uk/thredds/dodsC/bodc/gebco/global/gebco_2026/type_identifier_grid/netcdf/gebco_2026_tid.nc",
+ "sub_ice": "https://dap.ceda.ac.uk/thredds/dodsC/bodc/gebco/global/gebco_2026/sub_ice_topo/netcdf/GEBCO_2026_sub_ice_topo.nc",
}
class GEBCO(FetchModule):
+ """Fetch GEBCO 2026 bathymetry dynamically via THREDDS NCSS.
+ Requires ZERO external dependencies (No GDAL required).
+ Downloads true, mathematically cropped .nc files!
+ """
+
name = "gebco"
meta_category = "Bathymetry"
meta_desc = "General Bathymetric Chart of the Oceans (GEBCO)"
@@ -41,50 +51,128 @@ class GEBCO(FetchModule):
meta_license = "Public Domain / Attribution"
meta_urls = {"home": "https://www.gebco.net/"}
- """Fetch GEBCO global bathymetry data.
+ def __init__(self, layer="grid", include_tid=False, **kwargs):
+        super().__init__(name="gebco", **kwargs)
+ self.layer = layer.lower()
+ self.include_tid = str(include_tid).lower() in ["true", "1", "yes"]
- GEBCO provides a global terrain model at ~15 arc-seconds (~500m).
+ def run(self):
+ if not getattr(self, "region", None) or self.region.to_list() == [
+ -180,
+ 180,
+ -90,
+ 90,
+ ]:
+ logger.error(
+ "You must provide a strict bounding region (-R) to use the NCSS subsetter!"
+ )
+ return
- Layers:
- - grid: Standard bathymetry/topography (Ice Surface). Default.
- - sub_ice: Bedrock elevation (Ice removed).
- - tid: Type Identifier (Source of data per pixel).
+ w, e, s, n = (
+ self.region.xmin,
+ self.region.xmax,
+ self.region.ymin,
+ self.region.ymax,
+ )
- Examples:
- fetchez gebco -R -90/-89/25/26
- fetchez gebco --layer tid -R ...
- fetchez gebco --layer sub_ice
- """
+ base_query = {
+ "north": n,
+ "south": s,
+ "west": w,
+ "east": e,
+ "horizStride": 1,
+ "addLatLon": "true",
+ "accept": "netcdf",
+ }
- def __init__(self, layer="grid", **kwargs):
- """
- Args:
- layer (str): 'grid', 'tid', or 'sub_ice'.
- global_grid (bool): Legacy flag, forces source='global'.
- """
+ grid_base = GEBCO_NCSS_URLS.get(self.layer, GEBCO_NCSS_URLS["grid"])
+ grid_query = base_query.copy()
+ grid_query["var"] = "elevation"
- super().__init__(name="gebco", **kwargs)
+ grid_url = f"{grid_base}?{urllib.parse.urlencode(grid_query)}"
+ grid_fn = f"gebco_2026_{self.layer}_{w}_{e}_{s}_{n}.nc"
+
+ self.add_entry_to_results(url=grid_url, dst_fn=grid_fn, data_type="netcdf")
+
+ if self.include_tid:
+ tid_base = GEBCO_NCSS_URLS["tid"]
+ tid_query = base_query.copy()
+ tid_query["var"] = "tid"
+
+ tid_url = f"{tid_base}?{urllib.parse.urlencode(tid_query)}"
+ tid_fn = f"gebco_2026_tid_{w}_{e}_{s}_{n}.nc"
+
+ self.add_entry_to_results(url=tid_url, dst_fn=tid_fn, data_type="netcdf")
+
+
+class GEBCO_OpenDAP(FetchModule):
+ """Fetch GEBCO 2026 bathymetry dynamically via OpenDAP HTTP Subsetting.
+ Requires ZERO external dependencies (No GDAL required).
+ """
+
+ name = "gebco_opendap"
+ meta_category = "Bathymetry"
+ meta_desc = "GEBCO 2026 via OpenDAP Array Subsetting"
+ meta_agency = "GEBCO / IHO / IOC"
+ meta_tags = ["gebco", "bathymetry", "global", "wcs", "tid"]
+ meta_region = "Global"
+ meta_resolution = "15 arc-seconds (~500m)"
+ meta_license = "Public Domain / Attribution"
+ meta_urls = {"home": "https://www.gebco.net/"}
+ def __init__(self, layer="grid", include_tid=False, **kwargs):
+ super().__init__(name="gebco_opendap", **kwargs)
self.layer = layer.lower()
+ # Ensure boolean parsing from YAML
+ self.include_tid = str(include_tid).lower() in ["true", "1", "yes"]
- if self.layer not in GEBCO_GLOBAL_URLS:
- valid = ", ".join(GEBCO_GLOBAL_URLS.keys())
- logger.warning(
- f"Unknown GEBCO layer '{self.layer}'. Defaulting to 'grid'. Valid: {valid}"
+ def run(self):
+ if not getattr(self, "region", None) or self.region.to_list() == [
+ -180,
+ 180,
+ -90,
+ 90,
+ ]:
+ logger.error(
+ "You must provide a strict bounding region (-R) to use the OpenDAP subsetter!"
)
- self.layer = "grid"
+ return
- def run(self):
- """Setup for Official Global Download."""
+ w, e, s, n = (
+ self.region.xmin,
+ self.region.xmax,
+ self.region.ymin,
+ self.region.ymax,
+ )
- url = GEBCO_GLOBAL_URLS.get(self.layer)
- if not url:
- logger.error(f"No Global URL available for layer '{self.layer}'.")
- return
+ # GEBCO is 15 arc-seconds (240 pixels per degree)
+ # Grid Extents: -180 to 180 (X), -90 to 90 (Y)
+
+        x1 = max(0, int(math.floor((w + 180) * 240)))
+        x2 = min(86399, int(math.ceil((e + 180) * 240)))
+
+        y1 = max(0, int(math.floor((s + 90) * 240)))
+        y2 = min(43199, int(math.ceil((n + 90) * 240)))
+
+ # Query format: ?variable[y1:1:y2][x1:1:x2],lat[y1:1:y2],lon[x1:1:x2]
+ grid_base = GEBCO_DAP_URLS.get(self.layer, GEBCO_DAP_URLS["grid"])
+ z_var = "elevation"
- dst_fn = f"gebco_2024_{self.layer}_global.zip"
- self.add_entry_to_results(
- url=url,
- dst_fn=dst_fn,
- data_type="archive",
+ grid_query = (
+ f"?{z_var}[{y1}:1:{y2}][{x1}:1:{x2}],lat[{y1}:1:{y2}],lon[{x1}:1:{x2}]"
)
+
+ grid_url = f"{grid_base}.dods{grid_query}"
+
+ grid_fn = f"gebco_2026_{self.layer}_{w}_{e}_{s}_{n}.nc"
+
+ self.add_entry_to_results(url=grid_url, dst_fn=grid_fn, data_type="netcdf")
+
+ if self.include_tid:
+ tid_base = GEBCO_DAP_URLS["tid"]
+ tid_query = (
+ f"?tid[{y1}:1:{y2}][{x1}:1:{x2}],lat[{y1}:1:{y2}],lon[{x1}:1:{x2}]"
+ )
+ tid_url = f"{tid_base}.nc{tid_query}"
+ tid_fn = f"gebco_2026_tid_{w}_{e}_{s}_{n}.nc"
+ self.add_entry_to_results(url=tid_url, dst_fn=tid_fn, data_type="netcdf")
diff --git a/src/fetchez/modules/local_fs.py b/src/fetchez/modules/local_fs.py
index a360f4a..ddfa245 100644
--- a/src/fetchez/modules/local_fs.py
+++ b/src/fetchez/modules/local_fs.py
@@ -40,7 +40,7 @@ class LocalFS(FetchModule):
"""Local data path Datalists."""
- def __init__(self, path=".", ext=".tif", datatype="raster", **kwargs):
+ def __init__(self, path=".", ext=".tif", datatype=None, **kwargs):
super().__init__(name="local_fs", **kwargs)
self.path = os.path.abspath(path)
self.ext = ext if ext.startswith(".") else f".{ext}"
diff --git a/src/fetchez/recipe.py b/src/fetchez/recipe.py
index 36221e3..6e1ff74 100644
--- a/src/fetchez/recipe.py
+++ b/src/fetchez/recipe.py
@@ -402,3 +402,86 @@ def _generate_receipt(self):
logger.info(f"📄 Full processing receipt saved to: {receipt_filename}")
logger.info("=" * 67)
+
+ def validate(self):
+ """Validates the recipe for syntax, missing plugins, dependencies, and logical errors.
+
+ Returns:
+ bool: True if valid, False if errors exist.
+ list: List of error messages.
+ """
+
+ ModuleRegistry.load_all()
+ HookRegistry.load_all()
+
+ errors = []
+ claimed_outputs = set()
+
+ def check_output_collision(hook_dict, context_name):
+ """Helper to check if a hook is clobbering an existing file."""
+
+ out_file = hook_dict.get("args", {}).get("output")
+ if out_file:
+ if out_file in claimed_outputs:
+ errors.append(
+ f"[{context_name}] Output Collision: Multiple hooks are attempting to write to '{out_file}'."
+ )
+ claimed_outputs.add(out_file)
+
+ # Validate Modules
+ for mod in self.config.get("modules", []):
+ mod_name = mod.get("module")
+ mod_keys = mod.keys()
+ valid_keys = ["module", "bundle", "hooks", "args", "region"]
+
+ for key in mod_keys:
+ if key not in valid_keys:
+ errors.append(
+ f"Module `{mod_name}` has unexpected reference to `{key}`"
+ )
+
+ if not ModuleRegistry.get_class(mod_name) and mod_name not in [
+ "file",
+ "local_fs",
+ ]:
+ errors.append(f"Missing Module: '{mod_name}'")
+
+ # Check Module-level Hooks
+ # mod_hook_counts = {}
+ for hook in mod.get("hooks", []):
+ h_name = hook.get("name")
+ HookCls = HookRegistry.get_class(h_name)
+
+ if not HookCls:
+ errors.append(f"Missing Hook: '{h_name}' (in module {mod_name})")
+ continue
+
+ # Dependency Check
+ if hasattr(HookCls, "_validate_deps"):
+ passed, msg = HookCls()._validate_deps()
+ if not passed:
+ errors.append(
+ f"[{mod_name} -> {h_name}] Missing Dependency: {msg}"
+ )
+
+ check_output_collision(hook, f"Module: {mod_name}")
+
+ # Validate Global Hooks
+ # global_hook_counts = {}
+ for hook in self.config.get("global_hooks", []):
+ h_name = hook.get("name")
+ HookCls = HookRegistry.get_class(h_name)
+
+ if not HookCls:
+ errors.append(f"Missing Global Hook: '{h_name}'")
+ continue
+
+ # Dependency Check
+ if hasattr(HookCls, "_validate_deps"):
+ passed, msg = HookCls()._validate_deps()
+ if not passed:
+ errors.append(f"[Global -> {h_name}] Missing Dependency: {msg}")
+
+ check_output_collision(hook, "Global Hooks")
+
+ return len(errors) == 0, errors
diff --git a/src/fetchez/utils.py b/src/fetchez/utils.py
index 637e5d3..a425ee9 100644
--- a/src/fetchez/utils.py
+++ b/src/fetchez/utils.py
@@ -323,6 +323,7 @@ def fn_url_p(fn):
def inc2str(inc):
"""Convert a WGS84 geographic increment to a string identifier."""
+
import fractions
return str(fractions.Fraction(str(inc * 3600)).limit_denominator(10)).replace(