Skip to content

Commit b843bc3

Browse files
authored
Deprecate emnist dataset (#260)
* rename _dataset to _collection and fix #259 * add deprecation improve documentation * add NEWS add deprecation test * fix test * improve documentation * add missing parameter
1 parent 4f098ff commit b843bc3

File tree

9 files changed

+109
-67
lines changed

9 files changed

+109
-67
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ export(coco_detection_dataset)
8484
export(draw_bounding_boxes)
8585
export(draw_keypoints)
8686
export(draw_segmentation_masks)
87+
export(emnist_collection)
8788
export(emnist_dataset)
8889
export(eurosat100_dataset)
8990
export(eurosat_all_bands_dataset)

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
* Breaking Change : Refactoring of `coco_*` dataset family now provides each `item$x` being an image array (for consistency with other datasets).
3535
You can use `transform = transform_to_tensor` to restore the previous x output to be a `torch_tensor()`.
3636
* `transform_` are now documented into 3 different categories: unitary transformations, random transformations and combining transformations. (@cregouby, #250)
37+
* Deprecation : `emnist_dataset()` is deprecated in favor of `emnist_collection()` (@cregouby, #260).
3738

3839
# torchvision 0.7.0
3940

R/dataset-mnist.R

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#' - **Fashion-MNIST**: Clothing item images for classification.
88
#' - **Kuzushiji-MNIST**: Japanese cursive character dataset.
99
#' - **QMNIST**: Extended MNIST with high-precision NIST data.
10-
#' - **EMNIST**: Letters and digits with multiple label splits.
10+
#' - **EMNIST**: A collection of letters and digits with multiple datasets and splits.
1111
#'
1212
#' @param root Root directory for dataset storage. The dataset will be stored under `root/<dataset-name>`. Defaults to `tempdir()`.
1313
#' @param train Logical. If TRUE, use the training set; otherwise, use the test set. Not applicable to all datasets.
@@ -18,7 +18,7 @@
1818
#'
1919
#' @return A torch dataset object, where each items is a list of `x` (image) and `y` (label).
2020
#'
21-
#' @section Supported `kind`s for `emnist_dataset()`:
21+
#' @section Supported `dataset`s for `emnist_collection()`:
2222
#' - `"byclass"`: 62 classes (digits + uppercase + lowercase)
2323
#' - `"bymerge"`: 47 classes (merged uppercase and lowercase)
2424
#' - `"balanced"`: 47 classes, balanced digits and letters
@@ -43,7 +43,7 @@
4343
#' item$x
4444
#' item$y
4545
#'
46-
#' emnist <- emnist_dataset(kind = "balanced", split = "test", download = TRUE)
46+
#' emnist <- emnist_collection(dataset = "balanced", split = "test", download = TRUE)
4747
#' item <- emnist[1]
4848
#' item$x
4949
#' item$y
@@ -354,19 +354,19 @@ fashion_mnist_dataset <- dataset(
354354
)
355355
)
356356

357-
#' @describeIn mnist_dataset EMNIST dataset with digits and letters and multiple split modes.
358-
#' @param kind change the classes into one of "byclass", "bymerge", "balanced" representing the kind of emnist dataset. You
359-
#' can look at dataset attribute `$classes` to see the actual classes.
357+
#' @describeIn mnist_dataset EMNIST collection with digits and letters arranged in multiple datasets.
358+
#' @param dataset one of "byclass", "bymerge", "balanced" representing the subset of emnist collection
359+
#' made of a set of classes. You can look at dataset attribute `$classes` to see the actual classes.
360360
#' @export
361-
emnist_dataset <- dataset(
362-
name = "emnist_dataset",
361+
emnist_collection <- dataset(
362+
name = "emnist_collection",
363363
archive_size = "540 MB",
364364

365365
resources = list(
366366
c("https://biometrics.nist.gov/cs_links/EMNIST/gzip.zip", "58c8d27c78d21e728a6bc7b3cc06412e")
367367
),
368368
rds_file = function(split, kind) paste0(split,"-",kind,".rds"),
369-
classes_all_kind = list(
369+
classes_all_dataset = list(
370370
byclass = c(
371371
"0","1","2","3","4","5","6","7","8","9",
372372
LETTERS,
@@ -390,34 +390,34 @@ emnist_dataset <- dataset(
390390
initialize = function(
391391
root = tempdir(),
392392
split = "test",
393-
kind = "balanced",
393+
dataset = "balanced",
394394
transform = NULL,
395395
target_transform = NULL,
396396
download = FALSE
397397
) {
398398

399399
self$split <- match.arg(split, choices = c("train", "test"))
400-
self$kind <- match.arg(kind, choices = names(self$classes_all_kind))
400+
self$dataset <- match.arg(dataset, choices = names(self$classes_all_dataset))
401401
self$root_path <- root
402402
self$raw_folder <- file.path(root, class(self)[1], "raw")
403403
self$processed_folder <- file.path(root, class(self)[1], "processed")
404404
self$transform <- transform
405405
self$target_transform <- target_transform
406-
self$class <- self$classes_all_kind[[self$kind]]
406+
self$class <- self$classes_all_dataset[[self$dataset]]
407407

408408
if (download) {
409-
cli_inform("Dataset {.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be downloaded and processed if not already available.")
409+
cli_inform("{.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be downloaded and processed if not already available.")
410410
self$download()
411411
}
412412

413413
if (!self$check_exists())
414414
runtime_error("Dataset not found. You can use `download = TRUE` to download it.")
415415

416-
dataset_lst <- readRDS(file.path(self$processed_folder, self$rds_file(self$split, self$kind)))
416+
dataset_lst <- readRDS(file.path(self$processed_folder, self$rds_file(self$split, self$dataset)))
417417
self$data <- dataset_lst[[1]]
418418
self$targets <- dataset_lst[[2]] + 1L
419419

420-
cli_inform("Split {.val {self$split}} of {.cls {class(self)[[1]]}} dataset of kind {.val {self$kind}} processed successfully!")
420+
cli_inform("Split {.val {self$split}} of dataset {.val {self$dataset}} from {.cls {class(self)[[1]]}} processed successfully!")
421421
},
422422

423423
download = function() {
@@ -440,15 +440,15 @@ emnist_dataset <- dataset(
440440
unzipped_root <- fs::dir_ls(unzip_dir, type = "directory", recurse = FALSE)[1]
441441

442442
# only manage extraction of the 2 ubyte.gz under interest
443-
img <- file.path(unzipped_root, glue::glue("emnist-{self$kind}-{self$split}-images-idx3-ubyte.gz"))
444-
lbl <- file.path(unzipped_root, glue::glue("emnist-{self$kind}-{self$split}-labels-idx1-ubyte.gz"))
443+
img <- file.path(unzipped_root, glue::glue("emnist-{self$dataset}-{self$split}-images-idx3-ubyte.gz"))
444+
lbl <- file.path(unzipped_root, glue::glue("emnist-{self$dataset}-{self$split}-labels-idx1-ubyte.gz"))
445445
dataset_set <- list(read_sn3_pascalvincent(img), read_sn3_pascalvincent(lbl))
446-
saveRDS(dataset_set, file.path(self$processed_folder, self$rds_file(self$split, self$kind)))
446+
saveRDS(dataset_set, file.path(self$processed_folder, self$rds_file(self$split, self$dataset)))
447447

448448
},
449449
# only manage existence of the rds file under interest
450450
check_exists = function() {
451-
fs::file_exists(file.path(self$processed_folder, self$rds_file(self$split, self$kind)))
451+
fs::file_exists(file.path(self$processed_folder, self$rds_file(self$split, self$dataset)))
452452
},
453453

454454
.getitem = function(index) {
@@ -492,3 +492,12 @@ read_sn3_pascalvincent <- function(path) {
492492
a <- aperm(a, perm = rev(seq_along(dim)))
493493
a
494494
}
495+
496+
#' @describeIn mnist_dataset Deprecated. Please use emnist_collection.
497+
#' @param kind the `dataset` in `emnist_collection`.
498+
#' @param ... the other `emnist_collection` parameters.
499+
#' @export
500+
emnist_dataset <- function(kind, ...){
501+
.Deprecated("emnist_collection")
502+
emnist_collection(dataset = kind, ...)
503+
}

cran-comments.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
Re-submission to fix function call removed from dependency.
289 Bytes
Binary file not shown.

man/mnist_dataset.Rd

Lines changed: 17 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

po/R-fr.po

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
msgid ""
22
msgstr ""
33
"Project-Id-Version: torchvision 0.7.0.9000\n"
4-
"POT-Creation-Date: 2025-09-13 23:15+0200\n"
5-
"PO-Revision-Date: 2025-09-13 23:20+0200\n"
4+
"POT-Creation-Date: 2025-09-28 10:02+0200\n"
5+
"PO-Revision-Date: 2025-09-28 10:08+0200\n"
66
"Last-Translator: Christophe Regouby <[email protected]>\n"
77
"Language-Team: \n"
88
"Language: fr\n"
@@ -12,7 +12,7 @@ msgstr ""
1212
"X-Generator: Poedit 3.7\n"
1313
"X-Poedit-SourceCharset: UTF-8\n"
1414

15-
#: collection-rf100-doc.R:142
15+
#: collection-rf100-doc.R:139
1616
msgid ""
1717
"Dataset {.val {self$dataset}} split {.val {self$split}} of {.cls {class(self)"
1818
"[[1]]}} (~{.emph {self$archive_size}}) will be downloaded and processed if "
@@ -22,7 +22,7 @@ msgstr ""
2222
"(de taille ~{.emph {self$archive_size}}) sera téléchargée et traitée si elle "
2323
"n'est pas déjà disponible."
2424

25-
#: collection-rf100-doc.R:147
25+
#: collection-rf100-doc.R:144
2626
msgid ""
2727
"Dataset not found. Use download=TRUE or check that parquet files exist at "
2828
"the expected paths."
@@ -31,19 +31,19 @@ msgstr ""
3131
"télécharger, ou vérifier que le fichier `.parquet` existe à l'emplacement "
3232
"attendu."
3333

34-
#: collection-rf100-doc.R:158
34+
#: collection-rf100-doc.R:156
3535
msgid ""
3636
"{.cls {class(self)[[1]]}} dataset loaded with {self$.length()} images for "
3737
"split {.val {self$split}}."
3838
msgstr ""
3939
"Le jeu de données {.cls {class(self)[[1]]}} est disponible avec "
4040
"{self$.length()} images pour la partition {.val {self$split}}."
4141

42-
#: collection-rf100-doc.R:163
42+
#: collection-rf100-doc.R:161
4343
msgid "Downloading {.val {self$dataset}}..."
4444
msgstr "Téléchargement de {.val {self$dataset}}..."
4545

46-
#: collection-rf100-doc.R:168
46+
#: collection-rf100-doc.R:166
4747
msgid "Corrupt file! Delete the cached files and try again."
4848
msgstr "Fichier corrompu. Supprimez le fichier en cache et recommencez."
4949

@@ -78,7 +78,7 @@ msgstr ""
7878
#: dataset-caltech.R:61 dataset-cifar.R:52 dataset-coco.R:97 dataset-coco.R:289
7979
#: dataset-eurosat.R:57 dataset-fer.R:63 dataset-fgvc.R:91 dataset-flickr.R:69
8080
#: dataset-flickr.R:231 dataset-flowers.R:92 dataset-lfw.R:108
81-
#: dataset-lfw.R:242 dataset-mnist.R:86 dataset-mnist.R:256 dataset-mnist.R:426
81+
#: dataset-lfw.R:242 dataset-mnist.R:86 dataset-mnist.R:256
8282
#: dataset-oxfordiiitpet.R:71 dataset-oxfordiiitpet.R:279
8383
#: dataset-oxfordiiitpet.R:343 dataset-pascal.R:136 dataset-pascal.R:296
8484
#: dataset-places365.R:98
@@ -93,7 +93,7 @@ msgstr ""
9393
#: dataset-caltech.R:66 dataset-caltech.R:187 dataset-coco.R:102
9494
#: dataset-coco.R:294 dataset-eurosat.R:64 dataset-fer.R:70
9595
#: dataset-flowers.R:97 dataset-lfw.R:113 dataset-lfw.R:247 dataset-mnist.R:91
96-
#: dataset-mnist.R:261 dataset-mnist.R:431 dataset-oxfordiiitpet.R:76
96+
#: dataset-mnist.R:261 dataset-mnist.R:414 dataset-oxfordiiitpet.R:76
9797
#: dataset-oxfordiiitpet.R:284 dataset-oxfordiiitpet.R:348 dataset-pascal.R:141
9898
#: dataset-pascal.R:301 dataset-places365.R:103 dataset-plankton.R:89
9999
#: dataset-rf100-peixos.R:70
@@ -122,7 +122,7 @@ msgstr "Téléchargement de {.cls {class(self)[[1]]}}..."
122122
#: dataset-caltech.R:125 dataset-cifar.R:111 dataset-coco.R:198
123123
#: dataset-eurosat.R:88 dataset-fer.R:129 dataset-flowers.R:136
124124
#: dataset-lfw.R:155 dataset-lfw.R:168 dataset-mnist.R:121 dataset-mnist.R:286
125-
#: dataset-mnist.R:450 dataset-oxfordiiitpet.R:115 dataset-pascal.R:168
125+
#: dataset-mnist.R:433 dataset-oxfordiiitpet.R:115 dataset-pascal.R:168
126126
#: dataset-places365.R:182 dataset-plankton.R:105 dataset-rf100-peixos.R:82
127127
#: models-facenet.R:128 models-facenet.R:174 models-facenet.R:228
128128
#: models-facenet.R:311 models-vit.R:49
@@ -281,25 +281,34 @@ msgstr ""
281281
msgid "Processing {.cls {class(self)[[1]]}}..."
282282
msgstr "Préparation de {.cls {class(self)[[1]]}} ..."
283283

284-
#: dataset-mnist.R:437
284+
#: dataset-mnist.R:409
285285
msgid ""
286-
"Split {.val {self$split}} of {.cls {class(self)[[1]]}} dataset of kind {.val "
287-
"{self$kind}} processed successfully!"
286+
"{.cls {class(self)[[1]]}} (~{.emph {self$archive_size}}) will be downloaded "
287+
"and processed if not already available."
288288
msgstr ""
289-
"La partition {.val {self$split}} du jeu de données {.cls {class(self)[[1]]}} "
290-
"de type {.val {self$kind}} a été traitée avec succès !"
289+
"Le jeu de données {.cls {class(self)[[1]]}} (de taille ~{.emph "
290+
"{self$archive_size}}) sera téléchargé et traité s'il n'est pas déjà "
291+
"disponible."
292+
293+
#: dataset-mnist.R:420
294+
msgid ""
295+
"Split {.val {self$split}} of dataset {.val {self$dataset}} from {.cls "
296+
"{class(self)[[1]]}} processed successfully!"
297+
msgstr ""
298+
"La partition {.val {self$split}} du jeu de données {.val {self$dataset}} de "
299+
"{.cls {class(self)[[1]]}} a été traitée avec succès !"
291300

292301
#: dataset-places365.R:128
293302
msgid ""
294303
"{.cls {class(self)[[1]]}} Split '{self$split}' loaded with {length(self)} "
295304
"samples."
296305
msgstr ""
297-
"{.cls {class(self)[[1]]}} Le pratage '{self$split}' chargé et contient "
298-
"{length(self)} échantillons."
306+
"{.cls {class(self)[[1]]}} La partition '{self$split}' est chargée et "
307+
"contient {length(self)} échantillons."
299308

300309
#: dataset-places365.R:155
301310
msgid "Invalid split: {self$split}"
302-
msgstr "Partage non valide : {self$split}"
311+
msgstr "Partition non valide : {self$split}"
303312

304313
#: dataset-places365.R:163
305314
msgid "Downloading {.cls {class(self)[[1]]}} split '{self$split}'..."

0 commit comments

Comments
 (0)