diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index 4cc38db5dc..49b6f8a666 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -146,16 +146,20 @@ For the FileIO there are several configuration options available:

-| Key | Example | Description |
-| ---------------------- | ------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| adls.connection-string | AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqF...;BlobEndpoint= | A [connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string). This could be used to use FileIO with any adls-compatible object storage service that has a different endpoint (like [azurite](https://github.com/azure/azurite)). |
-| adls.account-name | devstoreaccount1 | The account that you want to connect to |
-| adls.account-key | Eby8vdM02xNOcqF... | The key to authentication against the account. |
-| adls.sas-token | NuHOuuzdQN7VRM%2FOpOeqBlawRCA845IY05h9eu1Yte4%3D | The shared access signature |
-| adls.tenant-id | ad667be4-b811-11ed-afa1-0242ac120002 | The tenant-id |
-| adls.client-id | ad667be4-b811-11ed-afa1-0242ac120002 | The client-id |
-| adls.client-secret | oCA3R6P\*ka#oa1Sms2J74z... | The client-secret |
-| adls.account-host | accountname1.blob.core.windows.net | The storage account host. See [AzureBlobFileSystem](https://github.com/fsspec/adlfs/blob/adb9c53b74a0d420625b86dd00fbe615b43201d2/adlfs/spec.py#L125) for reference |
+| Key | Example | Description |
+|-----------------------------|---------|-------------|
+| adls.connection-string | AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqF...;BlobEndpoint= | A [connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string). This can be used to point FileIO at any ADLS-compatible object storage service that exposes a different endpoint (like [azurite](https://github.com/azure/azurite)). |
+| adls.account-name | devstoreaccount1 | The account that you want to connect to |
+| adls.account-key | Eby8vdM02xNOcqF... | The key to authenticate against the account. |
+| adls.sas-token | NuHOuuzdQN7VRM%2FOpOeqBlawRCA845IY05h9eu1Yte4%3D | The shared access signature |
+| adls.tenant-id | ad667be4-b811-11ed-afa1-0242ac120002 | The tenant ID |
+| adls.client-id | ad667be4-b811-11ed-afa1-0242ac120002 | The client ID |
+| adls.client-secret | oCA3R6P\*ka#oa1Sms2J74z... | The client secret |
+| adls.account-host | accountname1.blob.core.windows.net | The storage account host. See [AzureBlobFileSystem](https://github.com/fsspec/adlfs/blob/adb9c53b74a0d420625b86dd00fbe615b43201d2/adlfs/spec.py#L125) for reference |
+| adls.blob-storage-authority | .blob.core.windows.net | The hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
+| adls.dfs-storage-authority | .dfs.core.windows.net | The hostname[:port] of the Data Lake Gen 2 Service. Defaults to `.dfs.core.windows.net`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
+| adls.blob-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
+| adls.dfs-storage-scheme | https | Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like [azurite](https://github.com/azure/azurite). See [AzureFileSystem](https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system) for reference |
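
Taken together, the new authority and scheme options let FileIO target a local emulator end to end. A minimal sketch, assuming the catalog connection itself is configured elsewhere (for example in `.pyiceberg.yaml`) and a local Azurite instance on its default blob port; the catalog name is a placeholder, and the account name and truncated key are Azurite's published development defaults:

```python
from pyiceberg.catalog import load_catalog

# Hypothetical catalog configuration pointing the ADLS FileIO at Azurite.
catalog = load_catalog(
    "default",  # placeholder catalog name
    **{
        "adls.account-name": "devstoreaccount1",
        "adls.account-key": "Eby8vdM02xNOcqF...",  # truncated development key
        "adls.blob-storage-authority": "127.0.0.1:10000",
        "adls.dfs-storage-authority": "127.0.0.1:10000",
        "adls.blob-storage-scheme": "http",
        "adls.dfs-storage-scheme": "http",
    },
)
```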
diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py
index b6fa934fdd..a5e6d7c6c4 100644
--- a/pyiceberg/io/__init__.py
+++ b/pyiceberg/io/__init__.py
@@ -82,6 +82,10 @@
 ADLS_CLIENT_ID = "adls.client-id"
 ADLS_CLIENT_SECRET = "adls.client-secret"
 ADLS_ACCOUNT_HOST = "adls.account-host"
+ADLS_BLOB_STORAGE_AUTHORITY = "adls.blob-storage-authority"
+ADLS_DFS_STORAGE_AUTHORITY = "adls.dfs-storage-authority"
+ADLS_BLOB_STORAGE_SCHEME = "adls.blob-storage-scheme"
+ADLS_DFS_STORAGE_SCHEME = "adls.dfs-storage-scheme"
 GCS_TOKEN = "gcs.oauth2.token"
 GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at"
 GCS_PROJECT_ID = "gcs.project-id"
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 1aaab32dbe..744c9a2118 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -85,6 +85,13 @@
 )
 from pyiceberg.expressions.visitors import visit as boolean_expression_visit
 from pyiceberg.io import (
+    ADLS_ACCOUNT_KEY,
+    ADLS_ACCOUNT_NAME,
+    ADLS_BLOB_STORAGE_AUTHORITY,
+    ADLS_BLOB_STORAGE_SCHEME,
+    ADLS_DFS_STORAGE_AUTHORITY,
+    ADLS_DFS_STORAGE_SCHEME,
+    ADLS_SAS_TOKEN,
     AWS_ACCESS_KEY_ID,
     AWS_REGION,
     AWS_ROLE_ARN,
@@ -394,6 +401,9 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste
         elif scheme in {"gs", "gcs"}:
             return self._initialize_gcs_fs()

+        elif scheme in {"abfs", "abfss", "wasb", "wasbs"}:
+            return self._initialize_azure_fs()
+
         elif scheme in {"file"}:
             return self._initialize_local_fs()

@@ -475,6 +485,43 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem:

         return S3FileSystem(**client_kwargs)

+    def _initialize_azure_fs(self) -> FileSystem:
+        from packaging import version
+
+        MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS = "20.0.0"
+        if version.parse(pyarrow.__version__) < version.parse(MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS):
+            raise ImportError(
+                f"pyarrow version >= {MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS} required for AzureFileSystem support, "
+                f"but found version {pyarrow.__version__}."
+            )
+
+        from pyarrow.fs import AzureFileSystem
+
+        client_kwargs: Dict[str, str] = {}
+
+        if account_name := self.properties.get(ADLS_ACCOUNT_NAME):
+            client_kwargs["account_name"] = account_name
+
+        if account_key := self.properties.get(ADLS_ACCOUNT_KEY):
+            client_kwargs["account_key"] = account_key
+
+        if blob_storage_authority := self.properties.get(ADLS_BLOB_STORAGE_AUTHORITY):
+            client_kwargs["blob_storage_authority"] = blob_storage_authority
+
+        if dfs_storage_authority := self.properties.get(ADLS_DFS_STORAGE_AUTHORITY):
+            client_kwargs["dfs_storage_authority"] = dfs_storage_authority
+
+        if blob_storage_scheme := self.properties.get(ADLS_BLOB_STORAGE_SCHEME):
+            client_kwargs["blob_storage_scheme"] = blob_storage_scheme
+
+        if dfs_storage_scheme := self.properties.get(ADLS_DFS_STORAGE_SCHEME):
+            client_kwargs["dfs_storage_scheme"] = dfs_storage_scheme
+
+        if sas_token := self.properties.get(ADLS_SAS_TOKEN):
+            client_kwargs["sas_token"] = sas_token
+
+        return AzureFileSystem(**client_kwargs)
+
     def _initialize_hdfs_fs(self, scheme: str, netloc: Optional[str]) -> FileSystem:
         from pyarrow.fs import HadoopFileSystem
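
`_initialize_azure_fs` forwards each property to `pyarrow.fs.AzureFileSystem` only when it is set, so anything left unconfigured falls back to Arrow's own defaults. As a rough sketch, an account-key setup against an emulator amounts to the construction below; the literal values are illustrative, not defaults introduced by this patch:

```python
from pyarrow.fs import AzureFileSystem  # needs pyarrow >= 20.0.0, per the version check above

# Hand-built equivalent of the kwargs assembled from self.properties;
# a property missing from the configuration is simply omitted.
fs = AzureFileSystem(
    account_name="devstoreaccount1",           # adls.account-name
    account_key="Eby8vdM02xNOcqF...",          # adls.account-key (truncated)
    blob_storage_authority="127.0.0.1:10000",  # adls.blob-storage-authority
    dfs_storage_authority="127.0.0.1:10000",   # adls.dfs-storage-authority
    blob_storage_scheme="http",                # adls.blob-storage-scheme
    dfs_storage_scheme="http",                 # adls.dfs-storage-scheme
)
```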
diff --git a/tests/conftest.py b/tests/conftest.py
index 729e29cb0c..a584f98c10 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -52,6 +52,12 @@
 from pyiceberg.catalog.noop import NoopCatalog
 from pyiceberg.expressions import BoundReference
 from pyiceberg.io import (
+    ADLS_ACCOUNT_KEY,
+    ADLS_ACCOUNT_NAME,
+    ADLS_BLOB_STORAGE_AUTHORITY,
+    ADLS_BLOB_STORAGE_SCHEME,
+    ADLS_DFS_STORAGE_AUTHORITY,
+    ADLS_DFS_STORAGE_SCHEME,
     GCS_PROJECT_ID,
     GCS_SERVICE_HOST,
     GCS_TOKEN,
@@ -348,6 +354,11 @@ def table_schema_with_all_types() -> Schema:
     )


+@pytest.fixture(params=["abfs", "abfss", "wasb", "wasbs"])
+def adls_scheme(request: pytest.FixtureRequest) -> str:
+    return request.param
+
+
 @pytest.fixture(scope="session")
 def pyarrow_schema_simple_without_ids() -> "pa.Schema":
     import pyarrow as pa
@@ -2088,6 +2099,26 @@ def fsspec_fileio_gcs(request: pytest.FixtureRequest) -> FsspecFileIO:
     return fsspec.FsspecFileIO(properties=properties)


+@pytest.fixture
+def adls_fsspec_fileio(request: pytest.FixtureRequest) -> Generator[FsspecFileIO, None, None]:
+    from azure.storage.blob import BlobServiceClient
+
+    azurite_url = request.config.getoption("--adls.endpoint")
+    azurite_account_name = request.config.getoption("--adls.account-name")
+    azurite_account_key = request.config.getoption("--adls.account-key")
+    azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};"
+    properties = {
+        "adls.connection-string": azurite_connection_string,
+        "adls.account-name": azurite_account_name,
+    }
+
+    bbs = BlobServiceClient.from_connection_string(conn_str=azurite_connection_string)
+    bbs.create_container("tests")
+    yield fsspec.FsspecFileIO(properties=properties)
+    bbs.delete_container("tests")
+    bbs.close()
+
+
 @pytest.fixture
 def pyarrow_fileio_gcs(request: pytest.FixtureRequest) -> "PyArrowFileIO":
     from pyiceberg.io.pyarrow import PyArrowFileIO
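
For reference, the connection string the fixture hands to `BlobServiceClient.from_connection_string` follows the standard Azure Storage format; a small sketch with Azurite's usual local endpoint (the URL and truncated key are illustrative stand-ins for the pytest options above):

```python
# Illustrative values: Azurite's default blob endpoint and the well-known
# development-storage account; the account key is truncated here.
azurite_url = "http://127.0.0.1:10000"
account_name = "devstoreaccount1"
account_key = "Eby8vdM02xNOcqF..."

conn_str = (
    "DefaultEndpointsProtocol=http;"
    f"AccountName={account_name};"
    f"AccountKey={account_key};"
    f"BlobEndpoint={azurite_url}/{account_name};"
)
```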
@@ -2101,6 +2132,34 @@ def pyarrow_fileio_gcs(request: pytest.FixtureRequest) -> "PyArrowFileIO":
     return PyArrowFileIO(properties=properties)


+@pytest.fixture
+def pyarrow_fileio_adls(request: pytest.FixtureRequest) -> Generator[Any, None, None]:
+    from azure.storage.blob import BlobServiceClient
+
+    from pyiceberg.io.pyarrow import PyArrowFileIO
+
+    azurite_url = request.config.getoption("--adls.endpoint")
+    azurite_scheme, azurite_authority = azurite_url.split("://", 1)
+
+    azurite_account_name = request.config.getoption("--adls.account-name")
+    azurite_account_key = request.config.getoption("--adls.account-key")
+    azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};"
+    properties = {
+        ADLS_ACCOUNT_NAME: azurite_account_name,
+        ADLS_ACCOUNT_KEY: azurite_account_key,
+        ADLS_BLOB_STORAGE_AUTHORITY: azurite_authority,
+        ADLS_DFS_STORAGE_AUTHORITY: azurite_authority,
+        ADLS_BLOB_STORAGE_SCHEME: azurite_scheme,
+        ADLS_DFS_STORAGE_SCHEME: azurite_scheme,
+    }
+
+    bbs = BlobServiceClient.from_connection_string(conn_str=azurite_connection_string)
+    bbs.create_container("warehouse")
+    yield PyArrowFileIO(properties=properties)
+    bbs.delete_container("warehouse")
+    bbs.close()
+
+
 def aws_credentials() -> None:
     os.environ["AWS_ACCESS_KEY_ID"] = "testing"
     os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
@@ -2162,26 +2221,6 @@ def fixture_dynamodb(_aws_credentials: None) -> Generator[boto3.client, None, No
     yield boto3.client("dynamodb", region_name="us-east-1")


-@pytest.fixture
-def adls_fsspec_fileio(request: pytest.FixtureRequest) -> Generator[FsspecFileIO, None, None]:
-    from azure.storage.blob import BlobServiceClient
-
-    azurite_url = request.config.getoption("--adls.endpoint")
-    azurite_account_name = request.config.getoption("--adls.account-name")
-    azurite_account_key = request.config.getoption("--adls.account-key")
-    azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};"
-    properties = {
-        "adls.connection-string": azurite_connection_string,
-        "adls.account-name": azurite_account_name,
-    }
-
-    bbs = BlobServiceClient.from_connection_string(conn_str=azurite_connection_string)
-    bbs.create_container("tests")
-    yield fsspec.FsspecFileIO(properties=properties)
-    bbs.delete_container("tests")
-    bbs.close()
-
-
 @pytest.fixture(scope="session")
 def empty_home_dir_path(tmp_path_factory: pytest.TempPathFactory) -> str:
     home_path = str(tmp_path_factory.mktemp("home"))
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index e90f3a46fc..a114f5521e 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -24,9 +24,11 @@
 from unittest.mock import MagicMock, patch
 from uuid import uuid4

+import pyarrow
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pytest
+from packaging import version
 from pyarrow.fs import FileType, LocalFileSystem, S3FileSystem

 from pyiceberg.exceptions import ResolveError
@@ -106,6 +108,11 @@
 from tests.catalog.test_base import InMemoryCatalog
 from tests.conftest import UNIFIED_AWS_SESSION_PROPERTIES

+skip_if_pyarrow_too_old = pytest.mark.skipif(
+    version.parse(pyarrow.__version__) < version.parse("20.0.0"),
+    reason="Requires pyarrow version >= 20.0.0",
+)
+

 def test_pyarrow_infer_local_fs_from_path() -> None:
     """Test path with `file` scheme and no scheme both use LocalFileSystem"""
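
Since only the ADLS tests need PyArrow's `AzureFileSystem`, the hand-rolled `skipif` marker is applied per test. A module-level `pytest.importorskip` would be terser, but it is not a drop-in replacement here because it would skip the whole module (including the S3 and GCS tests) on older PyArrow; shown only for contrast:

```python
import pytest

# Skips an entire module at collection time when pyarrow is absent or too old.
pyarrow = pytest.importorskip("pyarrow", minversion="20.0.0")
```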
f"gs://warehouse/{uuid4()}.txt" output_file = pyarrow_fileio_gcs.new_output(location=location) with output_file.create() as f: @@ -1689,7 +1696,7 @@ def test_write_and_read_file_gcs(pyarrow_fileio_gcs: PyArrowFileIO) -> None: @pytest.mark.gcs def test_getting_length_of_file_gcs(pyarrow_fileio_gcs: PyArrowFileIO) -> None: - """Test getting the length of an FsspecInputFile and FsspecOutputFile""" + """Test getting the length of PyArrowFile""" filename = str(uuid4()) output_file = pyarrow_fileio_gcs.new_output(location=f"gs://warehouse/{filename}") @@ -1753,7 +1760,7 @@ def test_read_specified_bytes_for_file_gcs(pyarrow_fileio_gcs: PyArrowFileIO) -> @pytest.mark.gcs @pytest.mark.skip(reason="Open issue on Arrow: https://github.com/apache/arrow/issues/36993") def test_raise_on_opening_file_not_found_gcs(pyarrow_fileio_gcs: PyArrowFileIO) -> None: - """Test that an fsspec input file raises appropriately when the gcs file is not found""" + """Test that PyArrowFile raises appropriately when the gcs file is not found""" filename = str(uuid4()) input_file = pyarrow_fileio_gcs.new_input(location=f"gs://warehouse/{filename}") @@ -1815,7 +1822,7 @@ def test_converting_an_outputfile_to_an_inputfile_gcs(pyarrow_fileio_gcs: PyArro @pytest.mark.gcs @pytest.mark.skip(reason="Open issue on Arrow: https://github.com/apache/arrow/issues/36993") def test_writing_avro_file_gcs(generated_manifest_entry_file: str, pyarrow_fileio_gcs: PyArrowFileIO) -> None: - """Test that bytes match when reading a local avro file, writing it using fsspec file-io, and then reading it again""" + """Test that bytes match when reading a local avro file, writing it using pyarrow file-io, and then reading it again""" filename = str(uuid4()) with PyArrowFileIO().new_input(location=generated_manifest_entry_file).open() as f: b1 = f.read() @@ -1828,6 +1835,192 @@ def test_writing_avro_file_gcs(generated_manifest_entry_file: str, pyarrow_filei pyarrow_fileio_gcs.delete(f"gs://warehouse/{filename}") +@pytest.mark.adls +@skip_if_pyarrow_too_old +def test_new_input_file_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None: + """Test creating a new input file from pyarrow file-io""" + filename = str(uuid4()) + + input_file = pyarrow_fileio_adls.new_input(f"{adls_scheme}://warehouse/{filename}") + + assert isinstance(input_file, PyArrowFile) + assert input_file.location == f"{adls_scheme}://warehouse/{filename}" + + +@pytest.mark.adls +@skip_if_pyarrow_too_old +def test_new_output_file_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None: + """Test creating a new output file from pyarrow file-io""" + filename = str(uuid4()) + + output_file = pyarrow_fileio_adls.new_output(f"{adls_scheme}://warehouse/{filename}") + + assert isinstance(output_file, PyArrowFile) + assert output_file.location == f"{adls_scheme}://warehouse/{filename}" + + +@pytest.mark.adls +@skip_if_pyarrow_too_old +def test_write_and_read_file_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None: + """Test writing and reading a file using PyArrowFile""" + location = f"{adls_scheme}://warehouse/{uuid4()}.txt" + output_file = pyarrow_fileio_adls.new_output(location=location) + with output_file.create() as f: + assert f.write(b"foo") == 3 + + assert output_file.exists() + + input_file = pyarrow_fileio_adls.new_input(location=location) + with input_file.open() as f: + assert f.read() == b"foo" + + pyarrow_fileio_adls.delete(input_file) + + +@pytest.mark.adls +@skip_if_pyarrow_too_old +def test_getting_length_of_file_adls(pyarrow_fileio_adls: 
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_getting_length_of_file_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None:
+    """Test getting the length of a PyArrowFile"""
+    filename = str(uuid4())
+
+    output_file = pyarrow_fileio_adls.new_output(location=f"{adls_scheme}://warehouse/{filename}")
+    with output_file.create() as f:
+        f.write(b"foobar")
+
+    assert len(output_file) == 6
+
+    input_file = pyarrow_fileio_adls.new_input(location=f"{adls_scheme}://warehouse/{filename}")
+    assert len(input_file) == 6
+
+    pyarrow_fileio_adls.delete(output_file)
+
+
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_file_tell_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None:
+    location = f"{adls_scheme}://warehouse/{uuid4()}"
+
+    output_file = pyarrow_fileio_adls.new_output(location=location)
+    with output_file.create() as write_file:
+        write_file.write(b"foobar")
+
+    input_file = pyarrow_fileio_adls.new_input(location=location)
+    with input_file.open() as f:
+        f.seek(0)
+        assert f.tell() == 0
+        f.seek(1)
+        assert f.tell() == 1
+        f.seek(3)
+        assert f.tell() == 3
+        f.seek(0)
+        assert f.tell() == 0
+
+
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_read_specified_bytes_for_file_adls(pyarrow_fileio_adls: PyArrowFileIO) -> None:
+    location = f"abfss://warehouse/{uuid4()}"
+
+    output_file = pyarrow_fileio_adls.new_output(location=location)
+    with output_file.create() as write_file:
+        write_file.write(b"foo")
+
+    input_file = pyarrow_fileio_adls.new_input(location=location)
+    with input_file.open() as f:
+        f.seek(0)
+        assert b"f" == f.read(1)
+        f.seek(0)
+        assert b"fo" == f.read(2)
+        f.seek(1)
+        assert b"o" == f.read(1)
+        f.seek(1)
+        assert b"oo" == f.read(2)
+        f.seek(0)
+        assert b"foo" == f.read(999)  # test reading amount larger than entire content length
+
+    pyarrow_fileio_adls.delete(input_file)
+
+
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_raise_on_opening_file_not_found_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None:
+    """Test that PyArrowFile raises appropriately when the adls file is not found"""
+
+    filename = str(uuid4())
+    input_file = pyarrow_fileio_adls.new_input(location=f"{adls_scheme}://warehouse/{filename}")
+    with pytest.raises(FileNotFoundError) as exc_info:
+        input_file.open().read()
+
+    assert filename in str(exc_info.value)
+
+
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_checking_if_a_file_exists_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None:
+    """Test checking if a file exists"""
+    non_existent_file = pyarrow_fileio_adls.new_input(location=f"{adls_scheme}://warehouse/does-not-exist.txt")
+    assert not non_existent_file.exists()
+
+    location = f"{adls_scheme}://warehouse/{uuid4()}"
+    output_file = pyarrow_fileio_adls.new_output(location=location)
+    assert not output_file.exists()
+    with output_file.create() as f:
+        f.write(b"foo")
+
+    existing_input_file = pyarrow_fileio_adls.new_input(location=location)
+    assert existing_input_file.exists()
+
+    existing_output_file = pyarrow_fileio_adls.new_output(location=location)
+    assert existing_output_file.exists()
+
+    pyarrow_fileio_adls.delete(existing_output_file)
+
+
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_closing_a_file_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None:
+    """Test closing an output file and input file"""
+    filename = str(uuid4())
+    output_file = pyarrow_fileio_adls.new_output(location=f"{adls_scheme}://warehouse/{filename}")
+    with output_file.create() as write_file:
+        write_file.write(b"foo")
+        assert not write_file.closed  # type: ignore
+    assert write_file.closed  # type: ignore
+
+    input_file = pyarrow_fileio_adls.new_input(location=f"{adls_scheme}://warehouse/{filename}")
+    with input_file.open() as f:
+        assert not f.closed  # type: ignore
+    assert f.closed  # type: ignore
+
+    pyarrow_fileio_adls.delete(f"{adls_scheme}://warehouse/{filename}")
+
+
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_converting_an_outputfile_to_an_inputfile_adls(pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None:
+    """Test converting an output file to an input file"""
+    filename = str(uuid4())
+    output_file = pyarrow_fileio_adls.new_output(location=f"{adls_scheme}://warehouse/{filename}")
+    input_file = output_file.to_input_file()
+    assert input_file.location == output_file.location
+
+
+@pytest.mark.adls
+@skip_if_pyarrow_too_old
+def test_writing_avro_file_adls(generated_manifest_entry_file: str, pyarrow_fileio_adls: PyArrowFileIO, adls_scheme: str) -> None:
+    """Test that bytes match when reading a local avro file, writing it using pyarrow file-io, and then reading it again"""
+    filename = str(uuid4())
+    with PyArrowFileIO().new_input(location=generated_manifest_entry_file).open() as f:
+        b1 = f.read()
+        with pyarrow_fileio_adls.new_output(location=f"{adls_scheme}://warehouse/{filename}").create() as out_f:
+            out_f.write(b1)
+        with pyarrow_fileio_adls.new_input(location=f"{adls_scheme}://warehouse/{filename}").open() as in_f:
+            b2 = in_f.read()
+        assert b1 == b2  # Check that the bytes read from the local avro file match the bytes written to adls
+
+    pyarrow_fileio_adls.delete(f"{adls_scheme}://warehouse/{filename}")
+
+
 def test_parse_location() -> None:
     def check_results(location: str, expected_schema: str, expected_netloc: str, expected_uri: str) -> None:
         schema, netloc, uri = PyArrowFileIO.parse_location(location)
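
For orientation, `parse_location` (exercised by the surrounding context) is what hands `_initialize_fs` its scheme, so the new dispatch for an ADLS URI presumably plays out as below; the location string is illustrative:

```python
from pyiceberg.io.pyarrow import PyArrowFileIO

# "abfss" is among the schemes now routed to _initialize_azure_fs();
# "warehouse" is the container the test fixtures create in Azurite.
scheme, netloc, uri = PyArrowFileIO.parse_location("abfss://warehouse/foo.txt")
assert scheme == "abfss"
assert netloc == "warehouse"
```

Against a local Azurite instance, the new tests are selected with the `adls` marker together with the `--adls.endpoint`, `--adls.account-name`, and `--adls.account-key` options read in `conftest.py`.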