Skip to content

Commit 70b7d20

Browse files
committed
Raise a descriptive exception when global batch size is not divisible by the number of workers
1 parent c9e7d10 commit 70b7d20

File tree

2 files changed

+29
-2
lines changed

2 files changed

+29
-2
lines changed

distributed_embeddings/python/layers/dist_model_parallel.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -622,8 +622,20 @@ def get_weights(self, all_ranks=False):
622622

623623
@tf_utils.shape_type_conversion
624624
def build(self, input_shape):
625+
if input_shape is not None:
626+
# Do some checks to detect cases that are not supported
627+
if not isinstance(input_shape, list):
628+
input_shape = [input_shape]
629+
batch_sizes = [shape[0] for shape in input_shape]
630+
batch_sizes = hvd.allgather(batch_sizes).numpy().tolist()
631+
if len(set(batch_sizes)) > 1:
632+
raise ValueError(F"All input need to have same batchsize. got {set(batch_sizes)}.")
633+
if not self.dp_input:
634+
if batch_sizes[0] % self.world_size > 0:
635+
raise ValueError(
636+
F"Global batchsize {batch_sizes[0]} not divisible workers count {self.world_size}.")
625637
for layer in self.local_embedding_layers:
626-
layer.build(input_shape)
638+
layer.build(input_shape[0] if input_shape else None)
627639
self.built = True
628640

629641
def call(self, inputs): # pylint: disable=missing-function-docstring

distributed_embeddings/python/layers/dist_model_parallel_test.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def gen_inputs(self, table_sizes, input_to_table_map=None, mp_input_ids=None):
129129
dp_inputs = [
130130
t[self.hvd_rank * local_batch:(self.hvd_rank + 1) * local_batch] for t in global_inputs
131131
]
132-
mp_inputs = [global_inputs[i] for i in mp_input_ids] if mp_input_ids else None
132+
mp_inputs = [global_inputs[i] for i in mp_input_ids] if mp_input_ids else []
133133

134134
return dp_inputs, mp_inputs
135135

@@ -362,6 +362,21 @@ def test_set_weight_uninitialized(self):
362362
test_model.dist_embeddings.set_weights(ref_weights[:num_tables])
363363
test_model.dense.set_weights(ref_weights[num_tables:])
364364

365+
def test_indivisible_batch(self):
366+
table_sizes = self.gen_table_sizes()
367+
368+
ref_model = EmbeddingListModel(table_sizes, distribute=False)
369+
test_model = EmbeddingListModel(table_sizes, distribute=True, strategy='basic', dp_input=False)
370+
371+
# First generate model-parallel batches whose size is divisible by world_size. We then use (batch_size - 1),
372+
# which will be indivisible by any world_size greater than 1, since consecutive integers are coprime
373+
mp_input_ids = test_model.dist_embeddings.strategy.input_ids_list[self.hvd_rank]
374+
dp_inputs, mp_inputs = self.gen_inputs(table_sizes, mp_input_ids=mp_input_ids)
375+
mp_inputs = [inp[1:] for inp in mp_inputs]
376+
if self.hvd_size > 1:
377+
with self.assertRaisesRegex(ValueError, "not divisible"):
378+
self.run_and_test(ref_model, dp_inputs, test_model, mp_inputs)
379+
365380

366381
if __name__ == "__main__":
367382
test.main()

0 commit comments

Comments
 (0)