Skip to content

Commit 481aaa5

Browse files
authored
Fix Repository.total_chunks_storage for V1 repos. (#1407)
TODO in future PR: actually allow the creation of V1 repos. Currently the API is added, but the parameter is ignored. Closes: #1220
1 parent 787cb13 commit 481aaa5

22 files changed

+354
-88
lines changed

icechunk-python/python/icechunk/_icechunk_python.pyi

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,7 @@ class PyRepository:
15581558
*,
15591559
config: RepositoryConfig | None = None,
15601560
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
1561+
spec_version: int | None = None,
15611562
) -> PyRepository: ...
15621563
@classmethod
15631564
async def create_async(
@@ -1566,6 +1567,7 @@ class PyRepository:
15661567
*,
15671568
config: RepositoryConfig | None = None,
15681569
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
1570+
spec_version: int | None = None,
15691571
) -> PyRepository: ...
15701572
@classmethod
15711573
def open(
@@ -1590,6 +1592,7 @@ class PyRepository:
15901592
*,
15911593
config: RepositoryConfig | None = None,
15921594
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
1595+
create_version: int | None = None,
15931596
) -> PyRepository: ...
15941597
@classmethod
15951598
async def open_or_create_async(
@@ -1598,6 +1601,7 @@ class PyRepository:
15981601
*,
15991602
config: RepositoryConfig | None = None,
16001603
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
1604+
create_version: int | None = None,
16011605
) -> PyRepository: ...
16021606
@staticmethod
16031607
def exists(storage: Storage) -> bool: ...

icechunk-python/python/icechunk/repository.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def create(
3333
storage: Storage,
3434
config: RepositoryConfig | None = None,
3535
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
36+
spec_version: int | None = None,
3637
) -> Self:
3738
"""
3839
Create a new Icechunk repository.
@@ -55,6 +56,9 @@ def create(
5556
environment, or anonymous credentials will be used if the container allows it.
5657
As a security measure, Icechunk will block access to virtual chunks if the
5758
container is not authorized using this argument.
59+
spec_version : int, optional
60+
Use this version of the spec for the new repository. If not passed, the latest version
61+
of the spec that was available before the library version release will be used.
5862
5963
Returns
6064
-------
@@ -66,6 +70,7 @@ def create(
6670
storage,
6771
config=config,
6872
authorize_virtual_chunk_access=authorize_virtual_chunk_access,
73+
spec_version=spec_version,
6974
)
7075
)
7176

@@ -75,6 +80,7 @@ async def create_async(
7580
storage: Storage,
7681
config: RepositoryConfig | None = None,
7782
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
83+
spec_version: int | None = None,
7884
) -> Self:
7985
"""
8086
Create a new Icechunk repository asynchronously.
@@ -97,6 +103,9 @@ async def create_async(
97103
environment, or anonymous credentials will be used if the container allows it.
98104
As a security measure, Icechunk will block access to virtual chunks if the
99105
container is not authorized using this argument.
106+
spec_version : int, optional
107+
Use this version of the spec for the new repository. If not passed, the latest version
108+
of the spec that was available before the library version release will be used.
100109
101110
Returns
102111
-------
@@ -108,6 +117,7 @@ async def create_async(
108117
storage,
109118
config=config,
110119
authorize_virtual_chunk_access=authorize_virtual_chunk_access,
120+
spec_version=spec_version,
111121
)
112122
)
113123

@@ -205,6 +215,7 @@ def open_or_create(
205215
storage: Storage,
206216
config: RepositoryConfig | None = None,
207217
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
218+
create_version: int | None = None,
208219
) -> Self:
209220
"""
210221
Open an existing Icechunk repository or create a new one if it does not exist.
@@ -230,6 +241,11 @@ def open_or_create(
230241
environment, or anonymous credentials will be used if the container allows it.
231242
As a security measure, Icechunk will block access to virtual chunks if the
232243
container is not authorized using this argument.
244+
create_version : int, optional
245+
Use this version of the spec for the new repository, if it needs to be created.
246+
If not passed, the latest version of the spec that was available before the
247+
library version release will be used.
248+
233249
234250
Returns
235251
-------
@@ -241,6 +257,7 @@ def open_or_create(
241257
storage,
242258
config=config,
243259
authorize_virtual_chunk_access=authorize_virtual_chunk_access,
260+
create_version=create_version,
244261
)
245262
)
246263

@@ -250,6 +267,7 @@ async def open_or_create_async(
250267
storage: Storage,
251268
config: RepositoryConfig | None = None,
252269
authorize_virtual_chunk_access: dict[str, AnyCredential | None] | None = None,
270+
create_version: int | None = None,
253271
) -> Self:
254272
"""
255273
Open an existing Icechunk repository or create a new one if it does not exist (async version).
@@ -275,6 +293,10 @@ async def open_or_create_async(
275293
environment, or anonymous credentials will be used if the container allows it.
276294
As a security measure, Icechunk will block access to virtual chunks if the
277295
container is not authorized using this argument.
296+
create_version : int, optional
297+
Use this version of the spec for the new repository, if it needs to be created.
298+
If not passed, the latest version of the spec that was available before the
299+
library version release will be used.
278300
279301
Returns
280302
-------
@@ -286,6 +308,7 @@ async def open_or_create_async(
286308
storage,
287309
config=config,
288310
authorize_virtual_chunk_access=authorize_virtual_chunk_access,
311+
create_version=create_version,
289312
)
290313
)
291314

icechunk-python/src/repository.rs

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use icechunk::{
1414
config::Credentials,
1515
format::{
1616
SnapshotId,
17+
format_constants::SpecVersionBin,
1718
repo_info::UpdateType,
1819
snapshot::{ManifestFileInfo, SnapshotInfo, SnapshotProperties},
1920
transaction_log::Diff,
@@ -788,13 +789,14 @@ impl PyRepository {
788789
/// python threads can make progress in the case of an actual block
789790
impl PyRepository {
790791
#[classmethod]
791-
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None))]
792+
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None, spec_version = None))]
792793
fn create(
793794
_cls: &Bound<'_, PyType>,
794795
py: Python<'_>,
795796
storage: PyStorage,
796797
config: Option<&PyRepositoryConfig>,
797798
authorize_virtual_chunk_access: Option<HashMap<String, Option<PyCredentials>>>,
799+
spec_version: Option<u8>,
798800
) -> PyResult<Self> {
799801
// This function calls block_on, so we need to allow other thread python to make progress
800802
py.detach(move || {
@@ -803,10 +805,17 @@ impl PyRepository {
803805
let config = config
804806
.map(|c| c.try_into().map_err(PyValueError::new_err))
805807
.transpose()?;
808+
let version = match spec_version {
809+
Some(n) => Some(
810+
SpecVersionBin::try_from(n).map_err(PyValueError::new_err)?,
811+
),
812+
None => None,
813+
};
806814
Repository::create(
807815
config,
808816
storage.0,
809817
map_credentials(authorize_virtual_chunk_access),
818+
version,
810819
)
811820
.await
812821
.map_err(PyIcechunkStoreError::RepositoryError)
@@ -817,23 +826,34 @@ impl PyRepository {
817826
}
818827

819828
#[classmethod]
820-
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None))]
829+
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None, spec_version = None))]
821830
fn create_async<'py>(
822831
_cls: &Bound<'py, PyType>,
823832
py: Python<'py>,
824833
storage: PyStorage,
825834
config: Option<&PyRepositoryConfig>,
826835
authorize_virtual_chunk_access: Option<HashMap<String, Option<PyCredentials>>>,
836+
spec_version: Option<u8>,
827837
) -> PyResult<Bound<'py, PyAny>> {
828838
let config =
829839
config.map(|c| c.try_into().map_err(PyValueError::new_err)).transpose()?;
830840
let authorize_virtual_chunk_access =
831841
map_credentials(authorize_virtual_chunk_access);
832842
pyo3_async_runtimes::tokio::future_into_py(py, async move {
833-
let repository =
834-
Repository::create(config, storage.0, authorize_virtual_chunk_access)
835-
.await
836-
.map_err(PyIcechunkStoreError::RepositoryError)?;
843+
let version = match spec_version {
844+
Some(n) => {
845+
Some(SpecVersionBin::try_from(n).map_err(PyValueError::new_err)?)
846+
}
847+
None => None,
848+
};
849+
let repository = Repository::create(
850+
config,
851+
storage.0,
852+
authorize_virtual_chunk_access,
853+
version,
854+
)
855+
.await
856+
.map_err(PyIcechunkStoreError::RepositoryError)?;
837857

838858
Ok(Self(Arc::new(RwLock::new(repository))))
839859
})
@@ -891,13 +911,14 @@ impl PyRepository {
891911
}
892912

893913
#[classmethod]
894-
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None))]
914+
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None, create_version = None))]
895915
fn open_or_create(
896916
_cls: &Bound<'_, PyType>,
897917
py: Python<'_>,
898918
storage: PyStorage,
899919
config: Option<&PyRepositoryConfig>,
900920
authorize_virtual_chunk_access: Option<HashMap<String, Option<PyCredentials>>>,
921+
create_version: Option<u8>,
901922
) -> PyResult<Self> {
902923
// This function calls block_on, so we need to allow other thread python to make progress
903924
py.detach(move || {
@@ -906,11 +927,18 @@ impl PyRepository {
906927
let config = config
907928
.map(|c| c.try_into().map_err(PyValueError::new_err))
908929
.transpose()?;
930+
let version = match create_version {
931+
Some(n) => Some(
932+
SpecVersionBin::try_from(n).map_err(PyValueError::new_err)?,
933+
),
934+
None => None,
935+
};
909936
Ok::<_, PyErr>(
910937
Repository::open_or_create(
911938
config,
912939
storage.0,
913940
map_credentials(authorize_virtual_chunk_access),
941+
version,
914942
)
915943
.await
916944
.map_err(PyIcechunkStoreError::RepositoryError)?,
@@ -922,23 +950,31 @@ impl PyRepository {
922950
}
923951

924952
#[classmethod]
925-
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None))]
953+
#[pyo3(signature = (storage, *, config = None, authorize_virtual_chunk_access = None, create_version = None))]
926954
fn open_or_create_async<'py>(
927955
_cls: &Bound<'py, PyType>,
928956
py: Python<'py>,
929957
storage: PyStorage,
930958
config: Option<&PyRepositoryConfig>,
931959
authorize_virtual_chunk_access: Option<HashMap<String, Option<PyCredentials>>>,
960+
create_version: Option<u8>,
932961
) -> PyResult<Bound<'py, PyAny>> {
933962
let config =
934963
config.map(|c| c.try_into().map_err(PyValueError::new_err)).transpose()?;
935964
let authorize_virtual_chunk_access =
936965
map_credentials(authorize_virtual_chunk_access);
937966
pyo3_async_runtimes::tokio::future_into_py(py, async move {
967+
let version = match create_version {
968+
Some(n) => {
969+
Some(SpecVersionBin::try_from(n).map_err(PyValueError::new_err)?)
970+
}
971+
None => None,
972+
};
938973
let repository = Repository::open_or_create(
939974
config,
940975
storage.0,
941976
authorize_virtual_chunk_access,
977+
version,
942978
)
943979
.await
944980
.map_err(PyIcechunkStoreError::RepositoryError)?;

icechunk-python/tests/test_stats.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from pathlib import Path
2+
13
import pytest
24

35
import icechunk as ic
@@ -54,3 +56,17 @@ async def test_total_chunks_storage_async() -> None:
5456
await session.commit_async("commit 1")
5557

5658
assert await repo.total_chunks_storage_async() == 100 * 4
59+
60+
61+
@pytest.mark.parametrize(
62+
"dir", ["./tests/data/test-repo-v2", "./tests/data/test-repo-v1"]
63+
)
64+
def test_chunk_storage_on_filesystem(dir: str) -> None:
65+
repo = ic.Repository.open(
66+
storage=ic.local_filesystem_storage(dir),
67+
)
68+
actual = repo.total_chunks_storage()
69+
expected = sum(
70+
f.stat().st_size for f in (Path(dir) / "chunks").glob("*") if f.is_file()
71+
)
72+
assert actual == expected

icechunk/examples/low_level_dataset.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ let mut ds = Repository::create(Arc::clone(&storage));
2727
);
2828

2929
let storage: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;
30-
let repo = Repository::create(None, Arc::clone(&storage), HashMap::new()).await?;
30+
let repo =
31+
Repository::create(None, Arc::clone(&storage), HashMap::new(), None).await?;
3132
let mut ds = repo.writable_session("main").await?;
3233

3334
println!();

icechunk/examples/multithreaded_get_chunk_refs.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ async fn mk_repo(
5252
}),
5353
..RepositoryConfig::default()
5454
};
55-
let repo = Repository::open_or_create(Some(config), storage, HashMap::new()).await?;
55+
let repo =
56+
Repository::open_or_create(Some(config), storage, HashMap::new(), None).await?;
5657
Ok(repo)
5758
}
5859

icechunk/examples/multithreaded_store.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
1515
inline_chunk_threshold_bytes: Some(128),
1616
..Default::default()
1717
};
18-
let repo = Repository::create(Some(config), storage, HashMap::new()).await?;
18+
let repo = Repository::create(Some(config), storage, HashMap::new(), None).await?;
1919
let ds = Arc::new(RwLock::new(repo.writable_session("main").await?));
2020
let store = Store::from_session(Arc::clone(&ds)).await;
2121

0 commit comments

Comments
 (0)