Skip to content

Commit c72d6d0

Commit message: Revert "Fix error when batch size is larger than the dataset" (#616)
This reverts commit 4b7c9a4.
1 parent: 559bd70 — commit: c72d6d0

File tree: 4 files changed (+6 lines added, −25 lines deleted)

src/metatrain/experimental/nanopet/trainer.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def train(
149149
num_replicas=world_size,
150150
rank=rank,
151151
shuffle=True,
152-
drop_last=len(train_dataset) > self.hypers["batch_size"],
152+
drop_last=True,
153153
)
154154
for train_dataset in train_datasets
155155
]
@@ -181,9 +181,7 @@ def train(
181181
),
182182
drop_last=(
183183
# the sampler takes care of this (if present)
184-
# check if batch size > train_dataset
185-
len(train_dataset) > self.hypers["batch_size"]
186-
and train_sampler is None
184+
train_sampler is None
187185
),
188186
collate_fn=collate_fn,
189187
)

src/metatrain/pet/trainer.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def train(
167167
num_replicas=world_size,
168168
rank=rank,
169169
shuffle=True,
170-
drop_last=len(train_dataset) > self.hypers["batch_size"],
170+
drop_last=True,
171171
)
172172
for train_dataset in train_datasets
173173
]
@@ -199,9 +199,7 @@ def train(
199199
),
200200
drop_last=(
201201
# the sampler takes care of this (if present)
202-
# check if batch size > train_dataset
203-
len(train_dataset) > self.hypers["batch_size"]
204-
and train_sampler is None
202+
train_sampler is None
205203
),
206204
collate_fn=collate_fn,
207205
)

src/metatrain/soap_bpnn/trainer.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def train(
148148
num_replicas=world_size,
149149
rank=rank,
150150
shuffle=True,
151-
drop_last=len(train_dataset) > self.hypers["batch_size"],
151+
drop_last=True,
152152
)
153153
for train_dataset in train_datasets
154154
]
@@ -180,9 +180,7 @@ def train(
180180
),
181181
drop_last=(
182182
# the sampler takes care of this (if present)
183-
# check if batch size > train_dataset
184-
len(train_dataset) > self.hypers["batch_size"]
185-
and train_sampler is None
183+
train_sampler is None
186184
),
187185
collate_fn=collate_fn,
188186
)

tests/cli/test_train_model.py

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -302,19 +302,6 @@ def test_empty_training_set(monkeypatch, tmp_path, options):
302302
train_model(options)
303303

304304

305-
def test_batch_size_smaller_training_set(monkeypatch, tmp_path, options):
306-
"""Test that training still runs for batch size > train_size."""
307-
monkeypatch.chdir(tmp_path)
308-
309-
shutil.copy(DATASET_PATH_QM9, "qm9_reduced_100.xyz")
310-
311-
options["validation_set"] = 0.55
312-
options["test_set"] = 0.4
313-
options["architecture"]["training"]["batch_size"] = 1000
314-
315-
train_model(options)
316-
317-
318305
@pytest.mark.parametrize("split", [-0.1, 1.1])
319306
def test_wrong_test_split_size(split, monkeypatch, tmp_path, options):
320307
"""Test that an error is raised if the test split has the wrong size"""

Commit comments: 0