BUG: AWS ParamValidationError in lsdb notebook #172

Description

@bsipocz

Example failing job: https://github.com/Caltech-IPAC/irsa-tutorials/actions/runs/19221240496/job/54939630573#step:5:1105

It's persistent enough to still show after a manual restart two hours after the first run. I would expect, though, that the cause is something upstream.

 _____ tutorials\parquet-catalog-demos\irsa-hats-with-lsdb.ipynb::Cell 30 ______
Notebook cell execution failed
Cell 30: Cell execution caused an exception

Input:
def get_nworkers(catalog):
    return min(os.cpu_count(), catalog.npartitions + 1)

with Client(n_workers=get_nworkers(euclid_x_ztf), 
            threads_per_worker=2, 
            memory_limit='auto') as client:
    print(f"This may take more than a few minutes to complete. You can monitor progress in Dask dashboard at {client.dashboard_link}")
    euclid_x_ztf_df = euclid_x_ztf.compute() # this will load the data into memory finally

Traceback:

---------------------------------------------------------------------------
ParamValidationError                      Traceback (most recent call last)
Cell In[1], line 8
      4 with Client(n_workers=get_nworkers(euclid_x_ztf), 
      5             threads_per_worker=2, 
      6             memory_limit='auto') as client:
      7     print(f"This may take more than a few minutes to complete. You can monitor progress in Dask dashboard at {client.dashboard_link}")
----> 8     euclid_x_ztf_df = euclid_x_ztf.compute() # this will load the data into memory finally

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\lsdb\catalog\dataset\dataset.py:62, in Dataset.compute(self)
     60 def compute(self) -> npd.NestedFrame:
     61     """Compute dask distributed dataframe to pandas dataframe"""
---> 62     return self._ddf.compute()

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\lsdb\nested\core.py:436, in NestedFrame.compute(self, **kwargs)
    434 def compute(self, **kwargs):
    435     """Compute this Dask collection, returning the underlying dataframe or series."""
--> 436     return npd.NestedFrame(super().compute(**kwargs))

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\dask\base.py:373, in DaskMethodsMixin.compute(self, **kwargs)
    349 def compute(self, **kwargs):
    350     """Compute this dask collection
    351 
    352     This turns a lazy Dask collection into its in-memory equivalent.
   (...)    371     dask.compute
    372     """
--> 373     (result,) = compute(self, traverse=False, **kwargs)
    374     return result

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\dask\base.py:681, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    678     expr = expr.optimize()
    679     keys = list(flatten(expr.__dask_keys__()))
--> 681     results = schedule(expr, keys, **kwargs)
    683 return repack(results)

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\lsdb\loaders\hats\read_hats.py:428, in read_pixel()
    393 def read_pixel(
    394     pixel: HealpixPixel,
    395     *,
   (...)    403     **kwargs,
    404 ) -> npd.NestedFrame:
    405     """Utility method to read a single pixel's parquet file from disk.
    406 
    407     NB: `columns` is necessary as an argument, even if None, so that dask-expr
   (...)    426         The pixel data, as read from its parquet file.
    427     """
--> 428     return _read_parquet_file(
    429         path_generator(
    430             catalog_base_dir,
    431             pixel,
    432             query_url_params,
    433             npix_suffix,
    434         ),
    435         columns=columns,
    436         schema=schema,
    437         index_column=index_column,
    438         **kwargs,
    439     )

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\lsdb\loaders\hats\read_hats.py:457, in _read_parquet_file()
    450 if (
    451     columns is not None
    452     and schema is not None
   (...)

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\aiobotocore\client.py:365, in _make_api_call()
    353     logger.debug(
    354         'Warning: %s.%s() is deprecated', service_name, operation_name
    355     )
    356 request_context = {
    357     'client_region': self.meta.region_name,
    358     'client_config': self.meta.config,
   (...)    362     'auth_options': self._service_model.metadata.get('auth'),
    363 }
--> 365 api_params = await self._emit_api_params(
    366     api_params=api_params,
    367     operation_model=operation_model,
    368     context=request_context,
    369 )
    370 (
    371     endpoint_url,
    372     additional_headers,
   (...)    375     operation_model, api_params, request_context
    376 )
    377 if properties:
    378     # Pass arbitrary endpoint info with the Request
    379     # for use during construction.
File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\aiobotocore\client.py:488, in _emit_api_params()
    480 responses = await self.meta.events.emit(
    481     f'provide-client-params.{service_id}.{operation_name}',
    482     params=api_params,
    483     model=operation_model,
    484     context=context,
    485 )
    486 api_params = first_non_none_response(responses, default=api_params)
--> 488 await self.meta.events.emit(
    489     f'before-parameter-build.{service_id}.{operation_name}',
    490     params=api_params,
    491     model=operation_model,
    492     context=context,
    493 )
    494 return api_params

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\aiobotocore\hooks.py:68, in _emit()
     65 logger.debug('Event %s: calling handler %s', event_name, handler)
     67 # Await the handler if its a coroutine.
---> 68 response = await resolve_awaitable(handler(**kwargs))
     69 responses.append((handler, response))
     70 if stop_on_response and response is not None:

File D:\a\irsa-tutorials\irsa-tutorials\.tox\py312-test\Lib\site-packages\botocore\handlers.py:322, in validate_bucket_name()
    316 if not VALID_BUCKET.search(bucket) and not VALID_S3_ARN.search(bucket):
    317     error_msg = (
    318         f'Invalid bucket name "{bucket}": Bucket name must match '
    319         f'the regex "{VALID_BUCKET.pattern}" or be an ARN matching '
    320         f'the regex "{VALID_S3_ARN.pattern}"'
    321     )
--> 322     raise ParamValidationError(report=error_msg)

ParamValidationError: Parameter validation failed:
Invalid bucket name "ipac-irsa-ztf\contributed\dr23\objects\hats\ztf_dr23_objects-hats_margin_10arcsec\dataset\Norder=4\Dir=0\Npix=639.parquet": Bucket name must match the regex "^[a-zA-Z0-9.\-_]{1,255}$" or be an ARN matching the regex "^arn:(aws).*:(s3|s3-object-lambda):[a-z\-0-9]*:[0-9]{12}:accesspoint[/:][a-zA-Z0-9\-.]{1,63}$|^arn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:][a-zA-Z0-9\-]{1,63}[/:]accesspoint[/:][a-zA-Z0-9\-]{1,63}$"
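
The rejected "bucket name" is the full object key, but with Windows backslashes, and this job runs on a Windows runner (note the D:\a\... paths). So it looks like the key is joined with os.sep somewhere upstream before s3fs splits the bucket off the path. A minimal sketch of that suspected failure mode, using the bucket regex copied from botocore's error above (the variable names path and win_path below are illustrative, not from lsdb):

import re

# botocore's bucket-name pattern, copied from the error message above.
VALID_BUCKET = re.compile(r"^[a-zA-Z0-9.\-_]{1,255}$")

# The failing object key, written with the forward slashes S3 expects.
path = ("ipac-irsa-ztf/contributed/dr23/objects/hats/"
        "ztf_dr23_objects-hats_margin_10arcsec/dataset/"
        "Norder=4/Dir=0/Npix=639.parquet")

# s3fs treats everything before the first "/" as the bucket name.
assert VALID_BUCKET.search(path.split("/", 1)[0])  # "ipac-irsa-ztf": valid

# If the key is assembled with os.sep on Windows, every separator becomes
# "\", there is no "/" left to split on, and the whole key is taken as the
# bucket name -- failing validation exactly as in the traceback.
win_path = path.replace("/", "\\")
assert not VALID_BUCKET.search(win_path.split("/", 1)[0])

If that diagnosis is right, the fix belongs upstream (path handling in the lsdb/hats stack on Windows) rather than in the notebook itself.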

Metadata

    Labels

    bug (Something isn't working)
    content: cloud (Content related issues/PRs for notebooks with cloud hosted data)
    relevance: upstream
