Skip to content

Commit 7767537

Browse files
authored
TileDB-ML support for carrara (#221)
* Substitute tiledb-cloud with tiledb-client and change namespace semantics * Update tiledb-ml package to work with Python 3.12 * Fix annotations * Fix notebooks * Remove comments from CI * Fix shape mismatch * Change the torchdata upper version * Update torch datapipes * Fix test order because there are no ordering guarantees on sparse arrays * Remove caching * Fix MIME type when a TileDB URI is given directly
1 parent 5a62278 commit 7767537

File tree

20 files changed

+743
-389
lines changed

20 files changed

+743
-389
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,8 @@ jobs:
99
strategy:
1010
fail-fast: false
1111
matrix:
12-
python-version: ["3.9"]
13-
ml-deps:
14-
- "torch==2.1.2+cpu torchvision==0.16.2+cpu torchdata==0.7.1 tensorflow-cpu==2.15.1 "
15-
- "torch==2.3.1+cpu torchvision==0.18.1+cpu torchdata==0.8.0 'tensorflow-cpu<2.16'"
12+
python-version: ["3.12"]
13+
1614
env:
1715
run_coverage: ${{ github.ref == 'refs/heads/master' }}
1816

@@ -28,14 +26,13 @@ jobs:
2826
uses: actions/cache@v4
2927
with:
3028
path: ~/.cache/pip
31-
key: ${{ runner.os }}:ml-deps=[${{ matrix.ml-deps }}]
29+
key: ${{ runner.os }} # :ml-deps=[${{ matrix.ml-deps }}]
3230

3331
- name: Install dependencies
3432
run: |
35-
pip install --upgrade pip
36-
pip install --extra-index-url https://download.pytorch.org/whl/cpu ${{ matrix.ml-deps }}
37-
pip install pytest-mock pytest-cov scikit-learn==1.0.2
38-
pip install -e .[cloud]
33+
pip install --upgrade pip setuptools wheel
34+
pip install pytest-mock pytest-cov
35+
pip install -e .[full]
3936
4037
- name: Run pre-commit hooks
4138
run: |
@@ -45,6 +42,7 @@ jobs:
4542
- name: Run notebook examples
4643
run: |
4744
pip install pytest-xdist nbmake matplotlib idx2numpy "numpy<2"
45+
rm -rf examples/data/openml
4846
pytest --disable-warnings --nbmake examples/{models,readers}
4947
# Run tiledb-cloud in parallel
5048
if [[ "${{ secrets.TILEDB_API_TOKEN }}" != "" ]]; then

examples/cloud/serverless_training/pytorch/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Firstly, we ingest all training images and labels in TileDB arrays and register
66
and finally, we serverless-ly get some predictions using the trained model. In case you want to run the example, you will need a TileDB-Cloud account as described
77
[here](https://docs.tiledb.com/cloud/tutorials/start-here). After signing up, you should export your username and password
88
as environmental variables (**TILEDB_USER_NAME**, **TILEDB_PASSWD**), in order to run ingestion, model training and prediction UDFs. Moreover,
9-
please add your TileDB namespace and your **S3** bucket in each script.
9+
please add your TileDB teamspace and your **S3** bucket in each script.
1010

1111
# Steps
1212

examples/cloud/serverless_training/pytorch/data_ingestion.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,25 @@
22
from typing import Any
33

44
import numpy as np
5-
import tiledb.cloud
5+
import tiledb.client
66

77
# Your TileDB username and password, exported as environmental variables
88
TILEDB_USER_NAME = os.environ.get("TILEDB_USER_NAME")
99
TILEDB_PASSWD = os.environ.get("TILEDB_PASSWD")
1010

11-
# Your TileDB namespace
12-
TILEDB_NAMESPACE = "your_tiledb_namespace"
11+
# Your TileDB workspace/teamspace
12+
TILEDB_WORKSPACE = "your_tiledb_WORKSPACE"
13+
TILEDB_TEAMSPACE = "your_tiledb_TEAMSPACE"
1314

1415
# Your S3 bucket
1516
S3_BUCKET = "your_s3_bucket"
1617

17-
IMAGES_URI = "tiledb://{}/s3://{}/mnist_images".format(TILEDB_NAMESPACE, S3_BUCKET)
18-
LABELS_URI = "tiledb://{}/s3://{}/mnist_labels".format(TILEDB_NAMESPACE, S3_BUCKET)
18+
IMAGES_URI = (
19+
f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/s3://{S3_BUCKET}/mnist_images"
20+
)
21+
LABELS_URI = (
22+
f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/s3://{S3_BUCKET}/mnist_labels"
23+
)
1924

2025

2126
# Let's define an ingestion function
@@ -62,8 +67,11 @@ def mnist_ingest(ingestion_func: Any) -> None:
6267
ingestion_func(data=labels, batch_size=64, uri=LABELS_URI)
6368

6469

65-
tiledb.cloud.login(username=TILEDB_USER_NAME, password=TILEDB_PASSWD)
70+
tiledb.client.configure(
71+
username=TILEDB_USER_NAME, password=TILEDB_PASSWD, workspace=TILEDB_WORKSPACE
72+
)
73+
tiledb.client.login()
6674

67-
tiledb.cloud.udf.exec(mnist_ingest, ingestion_func=ingest_in_tiledb)
75+
tiledb.client.udf.exec(mnist_ingest, ingestion_func=ingest_in_tiledb)
6876

69-
print(tiledb.cloud.last_udf_task().logs)
77+
print(tiledb.client.last_udf_task().logs)

examples/cloud/serverless_training/pytorch/model_load_and_predict.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,28 @@
11
import os
22
from typing import List
33

4-
import tiledb.cloud
4+
import tiledb.client
55

66
# Your TileDB username and password, exported as environmental variables
77
TILEDB_USER_NAME = os.environ.get("TILEDB_USER_NAME")
88
TILEDB_PASSWD = os.environ.get("TILEDB_PASSWD")
99

1010
# Your TileDB workspace/teamspace
11-
TILEDB_NAMESPACE = "your_tiledb_namespace"
11+
TILEDB_WORKSPACE = "your_tiledb_WORKSPACE"
12+
TILEDB_TEAMSPACE = "your_tiledb_TEAMSPACE"
1213

1314
# Your S3 bucket
1415
S3_BUCKET = "your_s3_bucket"
1516

16-
IMAGES_URI = "tiledb://{}/s3://{}/mnist_images".format(TILEDB_NAMESPACE, S3_BUCKET)
17-
LABELS_URI = "tiledb://{}/s3://{}/mnist_labels".format(TILEDB_NAMESPACE, S3_BUCKET)
18-
MODEL_URI = "tiledb://{}/s3://{}/mnist_model".format(TILEDB_NAMESPACE, S3_BUCKET)
17+
IMAGES_URI = (
18+
f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/s3://{S3_BUCKET}/mnist_images"
19+
)
20+
LABELS_URI = (
21+
f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/s3://{S3_BUCKET}/mnist_labels"
22+
)
23+
MODEL_URI = (
24+
f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/s3://{S3_BUCKET}/mnist_model"
25+
)
1926

2027
IO_BATCH_SIZE = 20000
2128

@@ -70,8 +77,11 @@ def forward(self, x: torch.Tensor) -> Any:
7077
return [np.argmax(pred) for pred in output.numpy()]
7178

7279

73-
tiledb.cloud.login(username=TILEDB_USER_NAME, password=TILEDB_PASSWD)
80+
tiledb.client.configure(
81+
username=TILEDB_USER_NAME, password=TILEDB_PASSWD, workspace=TILEDB_WORKSPACE
82+
)
83+
tiledb.client.login()
7484

75-
predictions = tiledb.cloud.udf.exec(predict)
85+
predictions = tiledb.client.udf.exec(predict)
7686

7787
print(predictions)

examples/cloud/serverless_training/pytorch/model_training.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
import os
22

3-
import tiledb.cloud
3+
import tiledb.client
4+
5+
from tiledb.ml.readers.types import ArrayParams
46

57
# Your TileDB username and password, exported as environmental variables
68
TILEDB_USER_NAME = os.environ.get("TILEDB_USER_NAME")
79
TILEDB_PASSWD = os.environ.get("TILEDB_PASSWD")
810

911
# Your TileDB workspace/teamspace
10-
TILEDB_NAMESPACE = "your_tiledb_namespace"
11-
12-
# Your S3 bucket
13-
S3_BUCKET = "your_s3_bucket"
12+
TILEDB_WORKSPACE = "TileDB-Inc."
13+
TILEDB_TEAMSPACE = "tiledb-ml-ts"
1414

15-
IMAGES_URI = "tiledb://{}/s3://{}/mnist_images".format(TILEDB_NAMESPACE, S3_BUCKET)
16-
LABELS_URI = "tiledb://{}/s3://{}/mnist_labels".format(TILEDB_NAMESPACE, S3_BUCKET)
17-
MODEL_URI = "tiledb://{}/s3://{}/mnist_model".format(TILEDB_NAMESPACE, S3_BUCKET)
15+
IMAGES_URI = f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/mnist_images"
16+
LABELS_URI = f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/mnist_labels"
17+
MODEL_URI = f"tiledb://{TILEDB_WORKSPACE}/{TILEDB_TEAMSPACE}/mnist_model"
1818

1919
# The size of each slice read from the image and label TileDB arrays.
2020
IO_BATCH_SIZE = 20000
@@ -49,8 +49,20 @@ def forward(self, x: torch.Tensor) -> Any:
4949
logits = self.linear_relu_stack(x)
5050
return logits
5151

52+
def do_random_noise(img: np.ndarray, mag: float = 0.1) -> np.ndarray:
53+
noise = np.random.uniform(-1, 1, img.shape) * mag
54+
img = img + noise
55+
img = np.clip(img, 0, 1)
56+
return img
57+
5258
with tiledb.open(IMAGES_URI) as x, tiledb.open(LABELS_URI) as y:
53-
train_loader = PyTorchTileDBDataLoader(x, y, batch_size=IO_BATCH_SIZE)
59+
train_loader = PyTorchTileDBDataLoader(
60+
ArrayParams(x, fn=do_random_noise),
61+
ArrayParams(y),
62+
batch_size=IO_BATCH_SIZE,
63+
num_workers=0,
64+
shuffle_buffer_size=256,
65+
)
5466

5567
net = Net(shape=(28, 28))
5668
criterion = nn.CrossEntropyLoss()
@@ -95,7 +107,7 @@ def forward(self, x: torch.Tensor) -> Any:
95107

96108
model = PyTorchTileDBModel(
97109
uri="mnist_model",
98-
namespace=TILEDB_NAMESPACE,
110+
teamspace=TILEDB_TEAMSPACE,
99111
model=net,
100112
optimizer=optimizer,
101113
)
@@ -104,8 +116,11 @@ def forward(self, x: torch.Tensor) -> Any:
104116
model.save()
105117

106118

107-
tiledb.cloud.login(username=TILEDB_USER_NAME, password=TILEDB_PASSWD)
119+
# tiledb.client.configure(
120+
# username=TILEDB_USER_NAME, password=TILEDB_PASSWD, workspace=TILEDB_WORKSPACE
121+
# )
122+
tiledb.client.login()
108123

109-
tiledb.cloud.udf.exec(train)
124+
tiledb.client.udf.exec(train)
110125

111-
print(tiledb.cloud.last_udf_task().logs)
126+
print(tiledb.client.last_udf_task().logs)

0 commit comments

Comments
 (0)