diff --git a/.env.dev b/.env.dev index 02a8f29c..df405ed8 100644 --- a/.env.dev +++ b/.env.dev @@ -30,3 +30,8 @@ RSC_LICENSE= # (Note that the local file backend always uses a temporary directory.) # # PINS_TEST_S3__PATH="ci-pins" + +# Databricks backend ---- +DATABRICKS_HOST= +DATABRICKS_TOKEN= +DATABRICKS_VOLUME= diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 72025c42..e08d5200 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,6 +71,8 @@ jobs: AWS_REGION: "us-east-1" AZURE_STORAGE_ACCOUNT_NAME: ${{ secrets.AZURE_STORAGE_ACCOUNT_NAME }} AZURE_STORAGE_ACCOUNT_KEY: ${{ secrets.AZURE_STORAGE_ACCOUNT_KEY }} + DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }} + DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} PYTEST_OPTS: ${{ matrix.pytest_opts }} REQUIREMENTS: ${{ matrix.requirements }} ACTION_OS: ${{ matrix.os }} diff --git a/docs/_quarto.yml b/docs/_quarto.yml index 12bf9c0a..afb51204 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -49,6 +49,8 @@ website: href: reference/board_gcs.qmd - text: "`board_azure`" href: reference/board_azure.qmd + - text: "`board_databricks`" + href: reference/board_databricks.qmd - text: "`board_connect`" href: reference/board_connect.qmd - text: "`board_url`" @@ -99,6 +101,7 @@ quartodoc: - board_s3 - board_gcs - board_azure + - board_databricks - board_connect - board_url - board diff --git a/pins/__init__.py b/pins/__init__.py index 8bfc4e3d..4ce19945 100644 --- a/pins/__init__.py +++ b/pins/__init__.py @@ -22,6 +22,7 @@ board_azure, board_s3, board_gcs, + board_databricks, board, ) from .boards import board_deparse diff --git a/pins/boards.py b/pins/boards.py index f3c6b8ef..c1eeb2f2 100644 --- a/pins/boards.py +++ b/pins/boards.py @@ -870,6 +870,8 @@ def board_deparse(board: BaseBoard): return f"board_gcs({repr(board.board)}{allow_pickle})" elif prot == "http": return f"board_url({repr(board.board)}, {board.pin_paths}{allow_pickle})" + elif prot == "dbc": + return f"board_databricks({repr(board.board)}{allow_pickle})" else: raise NotImplementedError( f"board deparsing currently not supported for protocol: {prot}" diff --git a/pins/constructors.py b/pins/constructors.py index 6cb5d117..dd9663da 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -10,6 +10,7 @@ from .boards import BaseBoard, BoardManual, BoardRsConnect, board_deparse from .cache import PinsAccessTimeCache, PinsCache, PinsRscCacheMapper, prefix_cache from .config import get_cache_dir, get_data_dir +from .errors import PinsError # Kept here for backward-compatibility reasons # Note that this is not a constructor, but a function to represent them. @@ -87,6 +88,11 @@ def board( fs = RsConnectFs(**storage_options) + elif protocol == "dbc": + from pins.databricks.fs import DatabricksFs + + fs = DatabricksFs(**storage_options) + else: fs = fsspec.filesystem(protocol, **storage_options) @@ -569,3 +575,61 @@ def board_azure(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): opts = {"use_listings_cache": False} return board("abfs", path, versioned, cache, allow_pickle_read, storage_options=opts) + + +def board_databricks(path, versioned=True, cache=DEFAULT, allow_pickle_read=None): + """Create a board to read and write pins from a Databricks Volume folder. + + Parameters + ---------- + path: + The path to the target folder inside Unity Catalog. The path must include the + catalog, schema, and volume names, preceded by 'Volumes/', for example: + "/Volumes/my-catalog/my-schema/my-volume".
+ versioned: + Whether or not pins should be versioned. + cache: + Whether to use a cache. By default, pins attempts to select the right cache + directory, given your filesystem. If `None` is passed, then no cache will be + used. You can set the cache using the `PINS_CACHE_DIR` environment variable. + allow_pickle_read: optional, bool + Whether to allow reading pins that use the pickle protocol. Pickles are unsafe, + and can execute arbitrary code. Only allow reading pickles if you trust the + board to execute Python code on your computer. + + You can enable reading pickles by setting this to `True`, or by setting the + environment variable `PINS_ALLOW_PICKLE_READ`. If both are set, this argument + takes precedence. + + Notes + ----- + The Databricks board uses the `databricks-sdk` library to authenticate and interact + with the Databricks Volume. See the `databricks-sdk` documentation for the + authentication methods it supports. + + Examples + -------- + + >>> import pytest; pytest.skip() + + >>> import pins + >>> from dotenv import load_dotenv + >>> load_dotenv() # e.g., for a .env file with DATABRICKS_HOST and DATABRICKS_TOKEN set + >>> board = pins.board_databricks("/Volumes/examples/my-board/test-volume") + >>> board.pin_list() + ['df_csv'] + + >>> board.pin_read("df_csv") + x y z + 0 1 a 3 + 1 2 b 4 + """ + try: + import databricks.sdk # noqa: F401 + except ModuleNotFoundError: + raise PinsError( + "Install the `databricks-sdk` package for Databricks board support." + ) + return board("dbc", path, versioned, cache, allow_pickle_read) diff --git a/pins/databricks/__init__.py b/pins/databricks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pins/databricks/fs.py b/pins/databricks/fs.py new file mode 100644 index 00000000..58bcd95f --- /dev/null +++ b/pins/databricks/fs.py @@ -0,0 +1,196 @@ +import shutil +from io import BytesIO +from pathlib import Path, PurePath + +from fsspec import AbstractFileSystem + +from pins.errors import PinsError + + +class DatabricksFs(AbstractFileSystem): + protocol = "dbc" + + def ls(self, path, detail=False, **kwargs): + return self._databricks_ls(path, detail) + + def exists(self, path: str, **kwargs): + return self._databricks_exists(path) + + def open(self, path: str, mode: str = "rb", *args, **kwargs): + if mode != "rb": + raise NotImplementedError + return self._databricks_open(path) + + def get(self, rpath, lpath, recursive=False, **kwargs): + self._databricks_get(rpath, lpath, recursive, **kwargs) + + def mkdir(self, path, create_parents=True, **kwargs): + if not create_parents: + raise NotImplementedError + self._databricks_mkdir(path) + + def put( + self, + lpath, + rpath, + recursive=True, + maxdepth=None, + **kwargs, + ): + if not recursive: + raise NotImplementedError + if maxdepth is not None: + raise NotImplementedError + self._databricks_put(lpath, rpath) + + def rm(self, path, recursive=True, maxdepth=None) -> None: + if not recursive: + raise NotImplementedError + if maxdepth is not None: + raise NotImplementedError + if self._databricks_exists(path): + self._databricks_rm_dir(path) + + @staticmethod + def _databricks_put(lpath, rpath): + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + path = Path(lpath).absolute() + orig_path = path + + def _upload_files(path): + contents = Path(path) + for item in contents.iterdir(): + abs_path = PurePath(path).joinpath(item) + is_file = Path(abs_path).is_file() + if is_file: + rel_path = abs_path.relative_to(orig_path) + db_path = PurePath(rpath).joinpath(rel_path) + with open(abs_path, "rb") as file: + w.files.upload(str(db_path),
BytesIO(file.read()), overwrite=True) + else: + _upload_files(abs_path) + + _upload_files(path) + + def _databricks_get(self, board, rpath, lpath, recursive=False, **kwargs): + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + file_type = self._databricks_is_type(rpath) + if file_type == "file": + board.fs.get(rpath, lpath, **kwargs) + return + + def _get_files(path, recursive, **kwargs): + raw_contents = w.files.list_directory_contents(path) + contents = list(raw_contents) + details = list(map(self._databricks_content_details, contents)) + for item in details: + item_path = item.get("path") + if item.get("is_directory"): + if recursive: + _get_files(item_path, recursive=recursive, **kwargs) + else: + rel_path = PurePath(item_path).relative_to(rpath) + target_path = PurePath(lpath).joinpath(rel_path) + board.fs.get(item_path, str(target_path)) + + _get_files(rpath, recursive, **kwargs) + + def _databricks_open(self, path): + from databricks.sdk import WorkspaceClient + + if not self._databricks_exists(path): + raise PinsError(f"File or directory does not exist at path: {path}") + w = WorkspaceClient() + resp = w.files.download(path) + f = BytesIO() + shutil.copyfileobj(resp.contents, f) + f.seek(0) + return f + + def _databricks_exists(self, path: str): + if self._databricks_is_type(path) == "nothing": + return False + else: + return True + + @staticmethod + def _databricks_is_type(path: str): + from databricks.sdk import WorkspaceClient + from databricks.sdk.errors import NotFound + + w = WorkspaceClient() + try: + w.files.get_metadata(path) + except NotFound: + try: + w.files.get_directory_metadata(path) + except NotFound: + return "nothing" + else: + return "directory" + else: + return "file" + + def _databricks_ls(self, path, detail): + from databricks.sdk import WorkspaceClient + + if not self._databricks_exists(path): + raise PinsError(f"File or directory does not exist at path: {path}") + w = WorkspaceClient() + if self._databricks_is_type(path) == "file": + if detail: + return [dict(name=path, size=None, type="file")] + else: + return path + + contents_raw = w.files.list_directory_contents(path) + contents = list(contents_raw) + items = [] + for item in contents: + item = self._databricks_content_details(item) + item_path = item.get("path") + item_path = item_path.rstrip("/") + if detail: + if item.get("is_directory"): + item_type = "directory" + else: + item_type = "file" + items.append(dict(name=item_path, size=None, type=item_type)) + else: + items.append(item_path) + return items + + def _databricks_rm_dir(self, path): + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + raw_contents = w.files.list_directory_contents(path) + contents = list(raw_contents) + details = list(map(self._databricks_content_details, contents)) + for item in details: + item_path = item.get("path") + if item.get("is_directory"): + self._databricks_rm_dir(item_path) + else: + w.files.delete(item_path) + w.files.delete_directory(path) + + @staticmethod + def _databricks_mkdir(path): + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + w.files.create_directory(path) + + @staticmethod + def _databricks_content_details(item): + details = { + "path": item.path, + "name": item.name, + "is_directory": item.is_directory, + } + return details diff --git a/pins/drivers.py b/pins/drivers.py index f0262979..2b5a0045 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -24,6 +24,8 @@ def load_path(filename: str, path_to_version, pin_type=None): filename = 
"data.csv" if path_to_version is not None: + if isinstance(path_to_version, str): + path_to_version = path_to_version.rstrip("/") path_to_file = f"{path_to_version}/{filename}" else: # BoardUrl doesn't have versions, and the file is the full url diff --git a/pins/tests/conftest.py b/pins/tests/conftest.py index 6a3690e7..2a862096 100644 --- a/pins/tests/conftest.py +++ b/pins/tests/conftest.py @@ -6,10 +6,17 @@ from importlib_resources import files from pytest import mark as m -from pins.tests.helpers import BoardBuilder, RscBoardBuilder, Snapshot, rm_env +from pins.tests.helpers import ( + BoardBuilder, + DbcBoardBuilder, + RscBoardBuilder, + Snapshot, + rm_env, +) EXAMPLE_REL_PATH = "pins/tests/pins-compat" PATH_TO_EXAMPLE_BOARD = files("pins") / "tests/pins-compat" +PATH_TO_EXAMPLE_BOARD_DBC = "/Volumes/workshops/my-board/my-volume/test" PATH_TO_EXAMPLE_VERSION = PATH_TO_EXAMPLE_BOARD / "df_csv/20220214T163720Z-9bfad/" EXAMPLE_PIN_NAME = "df_csv" @@ -21,6 +28,7 @@ pytest.param(lambda: BoardBuilder("s3"), id="s3", marks=m.fs_s3), pytest.param(lambda: BoardBuilder("gcs"), id="gcs", marks=m.fs_gcs), pytest.param(lambda: BoardBuilder("abfs"), id="abfs", marks=m.fs_abfs), + pytest.param(lambda: DbcBoardBuilder("dbc"), id="dbc", marks=m.fs_dbc), ] # rsc should only be used once, because users are created at docker setup time diff --git a/pins/tests/helpers.py b/pins/tests/helpers.py index bfe6341e..38fd0f18 100644 --- a/pins/tests/helpers.py +++ b/pins/tests/helpers.py @@ -1,4 +1,5 @@ import contextlib +import functools import json import os import shutil @@ -13,6 +14,7 @@ from importlib_resources import files from pins.boards import BaseBoard, BoardRsConnect +from pins.constructors import board_databricks DEFAULT_CREATION_DATE = datetime(2020, 1, 13, 23, 58, 59) @@ -20,12 +22,15 @@ # TODO: should use pkg_resources for this path? RSC_KEYS_FNAME = "pins/tests/rsconnect_api_keys.json" +DATABRICKS_VOLUME = "/Volumes/workshops/my-board/my-volume/test" + BOARD_CONFIG = { "file": {"path": ["PINS_TEST_FILE__PATH", None]}, "s3": {"path": ["PINS_TEST_S3__PATH", "ci-pins"]}, "gcs": {"path": ["PINS_TEST_GCS__PATH", "pins-python"]}, "abfs": {"path": ["PINS_TEST_AZURE__PATH", "ci-pins"]}, "rsc": {"path": ["PINS_TEST_RSC__PATH", RSC_SERVER_URL]}, + "dbc": {"path": ["PINS_TEST_DBC__PATH", DATABRICKS_VOLUME]}, } # TODO: Backend initialization should be independent of helpers, but these @@ -33,6 +38,41 @@ # putting imports inside rsconnect particulars for now +def skip_if_dbc(func): + """Decorator to skip test if board protocol is 'dbc'""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + import inspect + + board = None + + # Get function signature to map args to parameter names. 
+ # We have to do this since parameterized pytest runs passes in + # args in different orders + sig = inspect.signature(func) + bound_args = sig.bind_partial(*args, **kwargs) + all_args = {**bound_args.arguments, **kwargs} + + if "board" in all_args: + board = all_args["board"] + elif "board_with_cache" in all_args: + board = all_args["board_with_cache"] + else: + # Check all arguments for something that looks like a board + for arg_value in all_args.values(): + if hasattr(arg_value, "fs") and hasattr(arg_value.fs, "protocol"): + board = arg_value + break + + if board and board.fs.protocol == "dbc": + pytest.skip("All Databricks tests must be read only") + + return func(*args, **kwargs) + + return wrapper + + def rsc_from_key(name): from pins.rsconnect.api import RsConnectApi @@ -171,7 +211,6 @@ def create_tmp_board(self, src_board=None, versioned=True): from pins.rsconnect.fs import PinBundleManifest # noqa board = BoardRsConnect("", rsc_fs_from_key("derek"), versioned=versioned) - if src_board is None: return board @@ -203,6 +242,39 @@ def teardown(self): self.teardown_board(self.create_tmp_board()) +class DbcBoardBuilder(BoardBuilder): + def __init__(self, fs_name, path=None, *args, **kwargs): + self.path = None + self.fs_name = fs_name + self.current_board = None + self.volume = DATABRICKS_VOLUME + + def create_tmp_board(self, src_board=None, versioned=True): + # TODO: use temp boards when boards are not read-only + # temp_name = str(uuid.uuid4()) + # board_name = os.path.join(self.volume, temp_name) + # db_board = board_databricks(board_name, cache=None) + # board = BaseBoard(board_name, fs=db_board.fs, versioned=versioned) + # if src_board is not None: + # board.fs.put(src_board, board_name) + + db_board = board_databricks(self.volume, cache=None) + board = BaseBoard(self.volume, fs=db_board.fs, versioned=versioned) + self.current_board = board + return board + + def teardown_board(self, board): + pass + # TODO: update when board not read-only + # board.fs.rm(board.board) + + def teardown(self): + pass + # TODO: update when board not read-only + # board = board_databricks(self.volume) + # board.fs.rm(self.current_board.board) + + # Snapshot ==================================================================== diff --git a/pins/tests/test_boards.py b/pins/tests/test_boards.py index 4afeecc0..ee22bd3d 100644 --- a/pins/tests/test_boards.py +++ b/pins/tests/test_boards.py @@ -15,7 +15,7 @@ from pins.config import PINS_ENV_INSECURE_READ from pins.errors import PinsError, PinsInsecureReadError, PinsVersionError from pins.meta import MetaRaw -from pins.tests.helpers import DEFAULT_CREATION_DATE, rm_env +from pins.tests.helpers import DEFAULT_CREATION_DATE, rm_env, skip_if_dbc @fixture @@ -71,12 +71,14 @@ def test_board_validate_pin_name_root(board): # pin_write =================================================================== +@skip_if_dbc def test_board_pin_write_default_title(board): df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) meta = board.pin_write(df, "df_csv", title=None, type="csv") assert meta.title == "df_csv: a pinned 3 x 2 DataFrame" +@skip_if_dbc def test_board_pin_write_prepare_pin(board, tmp_path: Path): df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) @@ -87,6 +89,7 @@ def test_board_pin_write_prepare_pin(board, tmp_path: Path): assert not (tmp_path / "df_csv.csv").is_dir() +@skip_if_dbc def test_board_pin_write_roundtrip(board): df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) @@ -100,6 +103,7 @@ def test_board_pin_write_roundtrip(board): assert 
loaded_df.equals(df) +@skip_if_dbc def test_board_pin_write_type_not_specified_error(board): class C: pass @@ -108,6 +112,7 @@ class C: board.pin_write(C(), "cool_pin") +@skip_if_dbc def test_board_pin_write_type_error(board): class C: pass @@ -118,6 +123,7 @@ class C: assert "MY_TYPE" in exc_info.value.args[0] +@skip_if_dbc def test_board_pin_write_feather_deprecated(board): df = pd.DataFrame({"x": [1, 2, 3]}) @@ -125,6 +131,7 @@ def test_board_pin_write_feather_deprecated(board): board.pin_write(df, "cool_pin", type="feather") +@skip_if_dbc def test_board_pin_write_file_raises_error(board, tmp_path): df = pd.DataFrame({"x": [1, 2, 3]}) @@ -136,6 +143,7 @@ def test_board_pin_write_file_raises_error(board, tmp_path): board.pin_write(path, "cool_pin", type="file") +@skip_if_dbc @pytest.mark.parametrize("force_identical_write", [True, False]) def test_board_pin_write_force_identical_write_pincount(board, force_identical_write): df = pd.DataFrame({"x": [1, 2, 3]}) @@ -153,6 +161,7 @@ def test_board_pin_write_force_identical_write_pincount(board, force_identical_w assert len(versions) == 1 +@skip_if_dbc def test_board_pin_write_force_identical_write_msg( board, capfd: pytest.CaptureFixture[str] ): @@ -170,6 +179,7 @@ def test_board_pin_write_force_identical_write_msg( assert len(versions) == 1 +@skip_if_dbc def test_board_pin_download(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -188,6 +198,7 @@ def test_board_pin_download(board_with_cache, tmp_path): board_with_cache.pin_read("cool_pin") +@skip_if_dbc def test_board_pin_download_filename_many_suffixes(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -204,6 +215,7 @@ def test_board_pin_download_filename_many_suffixes(board_with_cache, tmp_path): assert df.x.tolist() == [1, 2, 3] +@skip_if_dbc def test_board_pin_download_filename_no_suffixes(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -220,6 +232,7 @@ def test_board_pin_download_filename_no_suffixes(board_with_cache, tmp_path): assert df.x.tolist() == [1, 2, 3] +@skip_if_dbc def test_board_pin_download_filename(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -235,6 +248,7 @@ def test_board_pin_download_filename(board_with_cache, tmp_path): assert Path(pin_path).name == "data.csv" +@skip_if_dbc def test_board_pin_download_no_cache_error(board, tmp_path): df = pd.DataFrame({"x": [1, 2, 3]}) path = tmp_path / "data.csv" @@ -253,6 +267,7 @@ def test_board_pin_download_no_cache_error(board, tmp_path): (pin_path,) = board.pin_download("cool_pin") +@skip_if_dbc def test_board_pin_upload_path_list(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -266,6 +281,7 @@ def test_board_pin_upload_path_list(board_with_cache, tmp_path): (pin_path,) = board_with_cache.pin_download("cool_pin") +@skip_if_dbc def test_board_pin_download_filename_multifile(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -310,6 +326,7 @@ def test_board_pin_write_rsc_index_html(board, tmp_path: Path, snapshot): # pin_write against different types ------------------------------------------- +@skip_if_dbc @parametrize( "obj, type_", [ @@ -333,6 +350,7 @@ def test_board_pin_write_type(board, obj, type_, request): obj == dst_obj +@skip_if_dbc def test_board_pin_read_insecure_fail_default(board): board.pin_write({"a": 1}, "test_pin", type="joblib", title="some title") with 
pytest.raises(PinsInsecureReadError) as exc_info: @@ -341,6 +359,7 @@ def test_board_pin_read_insecure_fail_default(board): assert "joblib" in exc_info.value.args[0] +@skip_if_dbc def test_board_pin_read_insecure_fail_board_flag(board): # board flag prioritized over env var with rm_env(PINS_ENV_INSECURE_READ): @@ -351,6 +370,7 @@ def test_board_pin_read_insecure_fail_board_flag(board): board.pin_read("test_pin") +@skip_if_dbc def test_board_pin_read_insecure_succeed_board_flag(board): # board flag prioritized over env var with rm_env(PINS_ENV_INSECURE_READ): @@ -363,6 +383,7 @@ def test_board_pin_read_insecure_succeed_board_flag(board): # pin_write with unversioned boards =========================================== +@skip_if_dbc @pytest.mark.parametrize("versioned", [None, False]) def test_board_unversioned_pin_write_unversioned_force_identical_write( versioned, board_unversioned @@ -389,6 +410,7 @@ def test_board_unversioned_pin_write_unversioned_force_identical_write( assert board_unversioned.pin_read("test_pin") == {"a": 2} +@skip_if_dbc @pytest.mark.parametrize("versioned", [None, False]) def test_board_unversioned_pin_write_unversioned(versioned, board_unversioned): board_unversioned.pin_write({"a": 1}, "test_pin", type="json", versioned=versioned) @@ -398,6 +420,7 @@ def test_board_unversioned_pin_write_unversioned(versioned, board_unversioned): assert board_unversioned.pin_read("test_pin") == {"a": 2} +@skip_if_dbc def test_board_unversioned_pin_write_versioned(board_unversioned): board_unversioned.pin_write({"a": 1}, "test_pin", type="json", versioned=False) board_unversioned.pin_write({"a": 2}, "test_pin", type="json", versioned=True) @@ -405,6 +428,7 @@ def test_board_unversioned_pin_write_versioned(board_unversioned): assert len(board_unversioned.pin_versions("test_pin")) == 2 +@skip_if_dbc def test_board_versioned_pin_write_unversioned(board): # should fall back to the versioned setting of the board board.pin_write({"a": 1}, "test_pin", type="json") @@ -426,6 +450,9 @@ def pin_name(): @pytest.fixture def pin_del(board, df, pin_name): + # TODO: update when dbc boards no longer read-only + if board.fs.protocol == "dbc": + pytest.skip() # 1min ago to avoid name collision one_min_ago = datetime.now() - timedelta(minutes=1) meta_old = board.pin_write( @@ -443,6 +470,9 @@ def pin_del(board, df, pin_name): @pytest.fixture def pin_prune(board, df, pin_name): + # TODO: update when dbc boards no longer read-only + if board.fs.protocol == "dbc": + pytest.skip() today = datetime.now() day_ago = today - timedelta(days=1, minutes=1) two_days_ago = today - timedelta(days=2, minutes=1) @@ -536,6 +566,7 @@ def test_board_pin_versions_prune_days(board, pin_prune, pin_name, days): assert len(new_versions) == days +@skip_if_dbc def test_board_pin_versions_prune_days_protect_most_recent(board, pin_name): """To address https://github.com/rstudio/pins-python/issues/297""" # Posit cannot handle days, since it involves pulling metadata @@ -579,6 +610,7 @@ def test_board_pin_versions_prune_days_protect_most_recent(board, pin_name): ("the-title", ["x-pin-1", "x-pin-2", "y-pin-1", "y-z"]), ], ) +@skip_if_dbc def test_board_pin_search_name(board, df, search, matches): if board.fs.protocol == "rsc": matches = ["derek/" + m for m in matches] diff --git a/pins/tests/test_compat.py b/pins/tests/test_compat.py index 6b66aadd..1477d1c3 100644 --- a/pins/tests/test_compat.py +++ b/pins/tests/test_compat.py @@ -3,8 +3,12 @@ import pytest from pins.errors import PinsError -from pins.tests.conftest import 
PATH_TO_EXAMPLE_BOARD, PATH_TO_MANIFEST_BOARD -from pins.tests.helpers import xfail_fs +from pins.tests.conftest import ( + PATH_TO_EXAMPLE_BOARD, + PATH_TO_EXAMPLE_BOARD_DBC, + PATH_TO_MANIFEST_BOARD, +) +from pins.tests.helpers import skip_if_dbc, xfail_fs NOT_A_PIN = "not_a_pin_abcdefg" PIN_CSV = "df_csv" @@ -15,7 +19,8 @@ @pytest.fixture(scope="session") def board(backend): board = backend.create_tmp_board(str(PATH_TO_EXAMPLE_BOARD.absolute())) - + if board.fs.protocol == "dbc": + board = backend.create_tmp_board(str(PATH_TO_EXAMPLE_BOARD_DBC)) yield board backend.teardown_board(board) @@ -25,7 +30,7 @@ def board(backend): def board_manifest(backend): # skip on rsconnect, since it can't add a manifest and the pin names # are too short for use to upload (rsc requires names > 3 characters) - if backend.fs_name == "rsc": + if backend.fs_name in ["rsc", "dbc"]: pytest.skip() board = backend.create_tmp_board(str(PATH_TO_MANIFEST_BOARD.absolute())) @@ -45,6 +50,18 @@ def test_compat_pin_list(board): if board.fs.protocol == "rsc": # rsc backend uses / for full name dst_sorted = [f"{board.user_name}/{content}" for content in dst_sorted] + if board.fs.protocol == "dbc": + # TODO: update to match when not read-only + dst_sorted = [ + "cool_pin", + "cool_pin2", + "cool_pin3", + "data", + "df_csv", + "reviews", + "reviews2", + "reviews3", + ] assert src_sorted == dst_sorted @@ -57,7 +74,11 @@ def test_compat_pin_versions(board): pytest.skip("RSC uses bundle ids as pin versions") versions = board.pin_versions("df_csv", as_df=False) v_strings = list(v.version for v in versions) - assert v_strings == ["20220214T163718Z-eceac", "20220214T163720Z-9bfad"] + # TODO: update when dbc is not read-only + if board.fs.protocol == "dbc": + v_strings == ["20250410T083026Z-a173c"] + else: + assert v_strings == ["20220214T163718Z-eceac", "20220214T163720Z-9bfad"] @pytest.mark.skip("Used to diagnose os listdir ordering") @@ -92,6 +113,16 @@ def test_compat_pin_meta(board): # TODO: afaik the bundle id is largely non-deterministic, so not possible # to test, but should think a bit more about it. 
assert meta.name == "derek/df_csv" + # TODO: update when dbc boards are not read-only + elif board.fs.protocol == "dbc": + assert meta.title == "df_csv: a pinned 3 x 2 DataFrame" + assert meta.description is None + assert meta.created == "20250410T083026Z" + assert meta.file == "df_csv.csv" + assert meta.file_size == 16 + assert meta.pin_hash == "a173cd6a53908980" + assert meta.type == "csv" + return else: assert meta.version.version == "20220214T163720Z-9bfad" assert meta.version.created == datetime.datetime(2022, 2, 14, 16, 37, 20) @@ -122,9 +153,15 @@ def test_compat_pin_meta_pin_missing(board): @xfail_fs("rsc") def test_compat_pin_meta_version_arg(board): # note that in RSConnect the version is the bundle id - meta = board.pin_meta(PIN_CSV, "20220214T163718Z-eceac") - assert meta.version.version == "20220214T163718Z-eceac" - assert meta.version.hash == "eceac" + # TODO: update when dbc is not read-only + if board.fs.protocol == "dbc": + meta = board.pin_meta(PIN_CSV, "20250410T083026Z-a173c") + assert meta.version.version == "20250410T083026Z-a173c" + assert meta.version.hash == "a173c" + else: + meta = board.pin_meta(PIN_CSV, "20220214T163718Z-eceac") + assert meta.version.version == "20220214T163718Z-eceac" + assert meta.version.hash == "eceac" def test_compat_pin_meta_version_arg_error(board): @@ -146,12 +183,18 @@ def test_compat_pin_read(board): p_data = PATH_TO_EXAMPLE_BOARD / "df_csv" / "20220214T163720Z-9bfad" / "df_csv.csv" src_df = board.pin_read("df_csv") - dst_df = pd.read_csv(p_data) + + # TODO: update when dbc boards are not read-only + if board.fs.protocol == "dbc": + dst_df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + else: + dst_df = pd.read_csv(p_data) assert isinstance(src_df, pd.DataFrame) assert src_df.equals(dst_df) +@skip_if_dbc def test_compat_pin_read_supported_rds(board): pytest.importorskip("rdata") import pandas as pd diff --git a/pins/tests/test_constructors.py b/pins/tests/test_constructors.py index 3ff9c252..d0106510 100644 --- a/pins/tests/test_constructors.py +++ b/pins/tests/test_constructors.py @@ -11,7 +11,7 @@ PATH_TO_EXAMPLE_BOARD, PATH_TO_EXAMPLE_VERSION, ) -from pins.tests.helpers import rm_env +from pins.tests.helpers import rm_env, skip_if_dbc @pytest.fixture @@ -41,6 +41,8 @@ def construct_from_board(board): if fs_name in ["file", ("file", "local")]: board = c.board_folder(board.board) + elif fs_name == "dbc": + board = c.board_databricks(board.board) elif fs_name == "rsc": board = c.board_rsconnect( server_url=board.fs.api.server_url, api_key=board.fs.api.api_key @@ -176,6 +178,7 @@ def board(backend): backend.teardown_board(board) +@skip_if_dbc # passes, but skipping since this cannot clean itself up properly def test_constructor_boards(board, df_csv, tmp_cache): # TODO: would be nice to have fixtures for each board constructor # doesn't need to copy over pins-compat content @@ -188,7 +191,11 @@ def test_constructor_boards(board, df_csv, tmp_cache): df = board.pin_read("df_csv") # check data - assert_frame_equal(df, df_csv) + # TODO: update when dbc boards are not read-only + if board.fs.protocol == "dbc": + pass + else: + assert_frame_equal(df, df_csv) # check the cache structure ----------------------------------------------- @@ -230,6 +237,7 @@ def board2(backend): backend.teardown_board(board2) +@skip_if_dbc def test_constructor_boards_multi_user(board2, df_csv, tmp_cache): prot = board2.fs.protocol fs_name = prot if isinstance(prot, str) else prot[0] diff --git a/pyproject.toml b/pyproject.toml index 146ef540..1deb7bb6 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,9 @@ check = [ "pyright==1.1.372", # Pinned; manually sync with .github/workflows/code-checks.yml "ruff==0.5.4", # Pinned; manually sync with pre-commit-config.yaml "types-appdirs", + "databricks-sdk" ] +databricks = ["databricks-sdk"] doc = [ "ipykernel", "ipython<=8.12", @@ -64,6 +66,7 @@ test = [ "pytest-parallel", "s3fs", "rdata", + "databricks-sdk", ] [build-system] @@ -88,6 +91,7 @@ markers = [ "fs_gcs: mark test to only run on Google Cloud Storage bucket filesystem", "fs_abfs: mark test to only run on Azure Datalake filesystem", "fs_rsc: mark test to only run on Posit Connect filesystem", + "fs_dbc: mark test to only run on Databricks Volume filesystem", "skip_on_github: skip this test if running on GitHub", ] testpaths = ["pins"] diff --git a/requirements/dev.txt b/requirements/dev.txt index effdf5a3..0f8fd751 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -2,23 +2,23 @@ # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile --extra=doc --extra=test --extra=check --output-file=- --strip-extras pyproject.toml +# pip-compile --extra=check --extra=doc --extra=test --output-file=- --strip-extras pyproject.toml # ---index-url https://pypi.python.org/simple/ ---trusted-host pypi.org -adlfs==2024.7.0 +adlfs==2024.12.0 # via pins (pyproject.toml) -aiobotocore==2.13.1 +aiobotocore==2.22.0 # via s3fs -aiohttp==3.9.5 +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.7 # via # adlfs # aiobotocore # gcsfs # s3fs -aioitertools==0.11.0 +aioitertools==0.12.0 # via aiobotocore -aiosignal==1.3.1 +aiosignal==1.3.2 # via aiohttp annotated-types==0.7.0 # via pydantic @@ -28,59 +28,62 @@ appnope==0.1.4 # via # ipykernel # ipython -asttokens==2.4.1 +asttokens==3.0.0 # via stack-data -attrs==23.2.0 +attrs==25.3.0 # via # aiohttp # jsonschema # pytest # referencing # sphobjinv -azure-core==1.30.2 +azure-core==1.34.0 # via # adlfs # azure-identity # azure-storage-blob azure-datalake-store==0.0.53 # via adlfs -azure-identity==1.17.1 +azure-identity==1.23.0 # via adlfs -azure-storage-blob==12.20.0 +azure-storage-blob==12.25.1 # via adlfs backcall==0.2.0 # via ipython -beartype==0.18.5 +beartype==0.21.0 # via plum-dispatch -botocore==1.34.131 +black==25.1.0 + # via quartodoc +botocore==1.37.3 # via aiobotocore -build==1.2.1 +build==1.2.2.post1 # via pip-tools -cachetools==5.4.0 +cachetools==5.5.2 # via google-auth -certifi==2024.7.4 +certifi==2025.4.26 # via # requests # sphobjinv -cffi==1.16.0 +cffi==1.17.1 # via # azure-datalake-store # cryptography cfgv==3.4.0 # via pre-commit -charset-normalizer==3.3.2 +charset-normalizer==3.4.2 # via requests -click==8.1.7 +click==8.2.1 # via + # black # pip-tools # quartodoc colorama==0.4.6 # via griffe comm==0.2.2 # via ipykernel -cramjam==2.8.3 +cramjam==2.10.0 # via fastparquet -cryptography==42.0.8 +cryptography==45.0.3 # via # azure-identity # azure-storage-blob @@ -88,121 +91,126 @@ cryptography==42.0.8 # pyjwt databackend==0.0.3 # via pins (pyproject.toml) -debugpy==1.8.2 +databricks-sdk==0.55.0 + # via pins (pyproject.toml) +debugpy==1.8.14 # via ipykernel decopatch==1.4.10 # via pytest-cases -decorator==5.1.1 +decorator==5.2.1 # via # gcsfs # ipython -distlib==0.3.8 +distlib==0.3.9 # via virtualenv -executing==2.0.1 +executing==2.2.0 # via stack-data -fastjsonschema==2.20.0 +fastjsonschema==2.21.1 # via nbformat -fastparquet==2024.5.0 +fastparquet==2024.11.0 # via pins (pyproject.toml) -filelock==3.15.4 +filelock==3.18.0 # via virtualenv 
-frozenlist==1.4.1 +frozenlist==1.6.2 # via # aiohttp # aiosignal -fsspec==2024.6.1 +fsspec==2025.5.1 # via - # pins (pyproject.toml) # adlfs # fastparquet # gcsfs + # pins (pyproject.toml) # s3fs -gcsfs==2024.6.1 +gcsfs==2025.5.1 # via pins (pyproject.toml) -google-api-core==2.19.1 +google-api-core==2.25.0 # via # google-cloud-core # google-cloud-storage -google-auth==2.32.0 +google-auth==2.40.2 # via + # databricks-sdk # gcsfs # google-api-core # google-auth-oauthlib # google-cloud-core # google-cloud-storage -google-auth-oauthlib==1.2.1 +google-auth-oauthlib==1.2.2 # via gcsfs -google-cloud-core==2.4.1 +google-cloud-core==2.4.3 # via google-cloud-storage -google-cloud-storage==2.17.0 +google-cloud-storage==3.1.0 # via gcsfs -google-crc32c==1.5.0 +google-crc32c==1.7.1 # via # google-cloud-storage # google-resumable-media -google-resumable-media==2.7.1 +google-resumable-media==2.7.2 # via google-cloud-storage -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.70.0 # via google-api-core -griffe==0.48.0 +griffe==1.7.3 # via quartodoc -humanize==4.10.0 +humanize==4.12.3 # via pins (pyproject.toml) -identify==2.6.0 +identify==2.6.12 # via pre-commit -idna==3.7 +idna==3.10 # via # requests # yarl -importlib-metadata==8.0.0 +importlib-metadata==8.7.0 # via # pins (pyproject.toml) # quartodoc -importlib-resources==6.4.0 +importlib-resources==6.5.2 # via # pins (pyproject.toml) # quartodoc -iniconfig==2.0.0 +iniconfig==2.1.0 # via pytest ipykernel==6.29.5 # via pins (pyproject.toml) ipython==8.12.0 # via - # pins (pyproject.toml) # ipykernel -isodate==0.6.1 + # pins (pyproject.toml) +isodate==0.7.2 # via azure-storage-blob -jedi==0.19.1 +jedi==0.19.2 # via ipython -jinja2==3.1.4 +jinja2==3.1.6 # via pins (pyproject.toml) jmespath==1.0.1 - # via botocore -joblib==1.4.2 + # via + # aiobotocore + # botocore +joblib==1.5.1 # via pins (pyproject.toml) -jsonschema==4.23.0 +jsonschema==4.24.0 # via # nbformat # sphobjinv -jsonschema-specifications==2023.12.1 +jsonschema-specifications==2025.4.1 # via jsonschema -jupyter-client==8.6.2 +jupyter-client==8.6.3 # via # ipykernel # nbclient -jupyter-core==5.7.2 +jupyter-core==5.8.1 # via # ipykernel # jupyter-client # nbclient # nbformat -makefun==1.15.4 +makefun==1.16.0 # via # decopatch # pytest-cases markdown-it-py==3.0.0 # via rich -markupsafe==2.1.5 +markupsafe==3.0.2 # via jinja2 matplotlib-inline==0.1.7 # via @@ -210,110 +218,120 @@ matplotlib-inline==0.1.7 # ipython mdurl==0.1.2 # via markdown-it-py -msal==1.29.0 +msal==1.32.3 # via # azure-datalake-store # azure-identity # msal-extensions -msal-extensions==1.2.0 +msal-extensions==1.3.1 # via azure-identity -multidict==6.0.5 +multidict==6.4.4 # via + # aiobotocore # aiohttp # yarl -nbclient==0.10.0 +mypy-extensions==1.1.0 + # via black +nbclient==0.10.2 # via pins (pyproject.toml) nbformat==5.10.4 # via - # pins (pyproject.toml) # nbclient + # pins (pyproject.toml) nest-asyncio==1.6.0 # via ipykernel nodeenv==1.9.1 # via # pre-commit # pyright -numpy==2.0.0 +numpy==2.2.6 # via # fastparquet # pandas - # pyarrow # rdata # xarray oauthlib==3.2.2 # via requests-oauthlib -packaging==24.1 +packaging==25.0 # via + # black # build # fastparquet # ipykernel # pytest # pytest-cases # xarray -pandas==2.2.2 +pandas==2.2.3 # via - # pins (pyproject.toml) # fastparquet + # pins (pyproject.toml) # rdata # xarray parso==0.8.4 # via jedi +pathspec==0.12.1 + # via black pexpect==4.9.0 # via ipython pickleshare==0.7.5 # via ipython pip-tools==7.4.1 # via pins (pyproject.toml) -platformdirs==4.2.2 
+platformdirs==4.3.8 # via + # black # jupyter-core # virtualenv -pluggy==1.5.0 +pluggy==1.6.0 # via pytest -plum-dispatch==2.5.1.post1 +plum-dispatch==2.5.7 # via quartodoc -portalocker==2.10.1 - # via msal-extensions -pre-commit==3.7.1 +pre-commit==4.2.0 # via pins (pyproject.toml) -prompt-toolkit==3.0.47 +prompt-toolkit==3.0.51 # via ipython -proto-plus==1.24.0 +propcache==0.3.1 + # via + # aiohttp + # yarl +proto-plus==1.26.1 # via google-api-core -protobuf==5.27.2 +protobuf==6.31.1 # via # google-api-core # googleapis-common-protos # proto-plus -psutil==6.0.0 +psutil==7.0.0 # via ipykernel ptyprocess==0.7.0 # via pexpect -pure-eval==0.2.2 +pure-eval==0.2.3 # via stack-data py==1.11.0 # via pytest -pyarrow==16.1.0 +pyarrow==20.0.0 # via pins (pyproject.toml) -pyasn1==0.6.0 +pyasn1==0.6.1 # via # pyasn1-modules # rsa -pyasn1-modules==0.4.0 +pyasn1-modules==0.4.2 # via google-auth pycparser==2.22 # via cffi -pydantic==2.8.2 +pydantic==2.11.5 # via quartodoc -pydantic-core==2.20.1 +pydantic-core==2.33.2 # via pydantic -pygments==2.18.0 +pygments==2.19.1 # via # ipython # rich -pyjwt==2.8.0 - # via msal -pyproject-hooks==1.1.0 +pyjwt==2.10.1 + # via + # msal + # pyjwt +pyproject-hooks==1.2.0 # via # build # pip-tools @@ -324,7 +342,7 @@ pytest==7.1.3 # pins (pyproject.toml) # pytest-dotenv # pytest-parallel -pytest-cases==3.8.5 +pytest-cases==3.8.6 # via pins (pyproject.toml) pytest-dotenv==0.5.2 # via pins (pyproject.toml) @@ -332,70 +350,72 @@ pytest-parallel==0.1.1 # via pins (pyproject.toml) python-dateutil==2.9.0.post0 # via + # aiobotocore # botocore # jupyter-client # pandas -python-dotenv==1.0.1 +python-dotenv==1.1.0 # via pytest-dotenv -pytz==2024.1 +pytz==2025.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # pins (pyproject.toml) # pre-commit # quartodoc -pyzmq==26.0.3 +pyzmq==26.4.0 # via # ipykernel # jupyter-client -quartodoc==0.7.5 +quartodoc==0.10.0 # via pins (pyproject.toml) rdata==0.11.2 # via pins (pyproject.toml) -referencing==0.35.1 +referencing==0.36.2 # via # jsonschema # jsonschema-specifications requests==2.32.3 # via - # pins (pyproject.toml) # azure-core # azure-datalake-store + # databricks-sdk # gcsfs # google-api-core # google-cloud-storage # msal + # pins (pyproject.toml) # quartodoc # requests-oauthlib requests-oauthlib==2.0.0 # via google-auth-oauthlib -rich==13.7.1 +rich==14.0.0 # via plum-dispatch -rpds-py==0.19.0 +rpds-py==0.25.1 # via # jsonschema # referencing -rsa==4.9 +rsa==4.9.1 # via google-auth -s3fs==2024.6.1 +ruff==0.5.4 + # via pins (pyproject.toml) +s3fs==2025.5.1 # via pins (pyproject.toml) -six==1.16.0 +six==1.17.0 # via - # asttokens # azure-core - # isodate # python-dateutil -sphobjinv==2.3.1.1 +sphobjinv==2.3.1.3 # via quartodoc stack-data==0.6.3 # via ipython tabulate==0.9.0 # via quartodoc -tblib==3.0.0 +tblib==3.1.0 # via pytest-parallel -tomli==2.0.1 +tomli==2.2.1 # via pytest -tornado==6.4.1 +tornado==6.5.1 # via # ipykernel # jupyter-client @@ -411,38 +431,44 @@ traitlets==5.14.3 # nbformat types-appdirs==1.4.3.5 # via pins (pyproject.toml) -typing-extensions==4.12.2 +typing-extensions==4.14.0 # via # azure-core # azure-identity # azure-storage-blob + # pins (pyproject.toml) + # plum-dispatch # pydantic # pydantic-core # quartodoc # rdata -tzdata==2024.1 + # referencing + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 # via pandas -urllib3==2.2.2 +urllib3==2.4.0 # via # botocore # requests -virtualenv==20.26.3 +virtualenv==20.31.2 # via pre-commit -watchdog==4.0.1 +watchdog==6.0.0 # via quartodoc 
wcwidth==0.2.13 # via prompt-toolkit -wheel==0.43.0 +wheel==0.45.1 # via pip-tools -wrapt==1.16.0 +wrapt==1.17.2 # via aiobotocore -xarray==2024.11.0 +xarray==2025.4.0 # via rdata -xxhash==3.4.1 +xxhash==3.5.0 # via pins (pyproject.toml) -yarl==1.9.4 +yarl==1.20.0 # via aiohttp -zipp==3.19.2 +zipp==3.22.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file:
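
For reference, a minimal usage sketch of the new constructor (not part of the patch), assuming DATABRICKS_HOST and DATABRICKS_TOKEN are set in the environment as in the .env.dev entries above; the volume path and pin name below are placeholders:

    import pins

    # credentials are picked up from the environment by databricks-sdk
    board = pins.board_databricks("/Volumes/my-catalog/my-schema/my-volume")

    board.pin_list()               # e.g. ['df_csv']
    df = board.pin_read("df_csv")  # files are fetched through the local pins cache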
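
The new "dbc" branch in board_deparse (pins/boards.py) lets a Databricks board be rendered back into the code that recreates it. A small sketch, with a placeholder volume path and approximate output:

    from pins import board_databricks
    from pins.boards import board_deparse

    board = board_databricks("/Volumes/my-catalog/my-schema/my-volume")
    print(board_deparse(board))
    # roughly: board_databricks('/Volumes/my-catalog/my-schema/my-volume')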
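
Because the shared test volume is treated as read-only, write-path tests are skipped for the dbc backend via the new skip_if_dbc decorator in pins/tests/helpers.py. A hypothetical test illustrating the pattern (test name and data are illustrative only):

    import pandas as pd

    from pins.tests.helpers import skip_if_dbc

    @skip_if_dbc
    def test_board_pin_write_smoke(board):
        # skipped automatically when the parametrized board uses the "dbc" protocol
        df = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
        meta = board.pin_write(df, "df_csv", title=None, type="csv")
        assert meta.title == "df_csv: a pinned 3 x 2 DataFrame"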