Commit 2c0b8d4

fixed bug where distributed training would stall

1 parent 283155a commit 2c0b8d4

4 files changed (+55 -33 lines)

src/experiment/experiment/logger.py
Lines changed: 3 additions & 2 deletions

@@ -286,11 +286,12 @@ def _copy_results_over():
         util_file.create_directory(self._results_dir)
 
         # setup logging (with convenience function)
-        self._stdout_logger = pp_logging.setup_stdout(self._results_dir)
+        stdout_file = os.path.join(self._results_dir, "experiment.log")
+        self._stdout_logger = pp_logging.setup_stdout(stdout_file)
 
         # setup train logger
         self._train_logger = pp_logging.TrainLogger(
-            self._log_dir, self.global_tag, self._class_to_names
+            self._log_dir, stdout_file, self.global_tag, self._class_to_names,
        )
 
         # initialize writer
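
Note on the change above: setup_stdout used to receive the results directory and derive the log-file name internally; the path is now built once here and handed to both setup_stdout and TrainLogger, so every consumer shares the same experiment.log, presumably so the file can be re-opened from spawned worker processes. A minimal sketch of the wiring, with hypothetical stand-ins for the pp_logging calls (illustrative names, not this repository's API):

    import os


    def setup_stdout(log_file):
        """Hypothetical stand-in for pp_logging.setup_stdout."""
        print("mirroring stdout to", log_file)


    def make_train_logger(log_dir, stdout_file):
        """Hypothetical stand-in for pp_logging.TrainLogger."""
        return {"log_dir": log_dir, "stdout_file": stdout_file}


    results_dir = "results"  # assumption: any writable directory
    os.makedirs(results_dir, exist_ok=True)

    # build the path once and hand the same string to both consumers
    stdout_file = os.path.join(results_dir, "experiment.log")
    setup_stdout(stdout_file)
    train_logger = make_train_logger("logs", stdout_file)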

src/provable_pruning/provable_pruning/util/logging/stdout.py
Lines changed: 4 additions & 8 deletions

@@ -1,7 +1,6 @@
 """A module with our customization for stdout to include a file log."""
 import sys
 import datetime
-import os.path
 import re
 
 
@@ -22,7 +21,7 @@ def __init__(self, file_name):
         self._last_msg_len = 0
 
         # this will be the file where we also log
-        self._stdout_file = open(file_name, "w")
+        self._stdout_file = file_name
 
     def write(self, msg, name=None):
         """Write to file and console.
@@ -77,7 +76,8 @@ def write(self, msg, name=None):
 
         # also write to log file
         time_tag = datetime.datetime.utcnow().strftime("%Y-%m-%d, %H:%M:%S.%f")
-        print(f"{time_tag}: {msg}", file=self._stdout_file)
+        with open(self._stdout_file, "a") as logfile:
+            print(f"{time_tag}: {msg}", file=logfile)
 
         # store last_name
         self._last_name = name
@@ -88,14 +88,10 @@
     def flush(self):
         """Flush console and file."""
         self._stdout_original.flush()
-        self._stdout_file.flush()
 
 
-def setup_stdout(results_dir):
+def setup_stdout(log_file):
     """Set up stdout logger with this function."""
-    # log file name
-    log_file = os.path.join(results_dir, "experiment.log")
-
     # get an instance of the stdout logger
     stdout_logger = _StdoutLogger(log_file)
 
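
Note on the change above: the logger now stores only the log-file path and re-opens it in append mode for every message, instead of holding a handle open in "w" mode for the logger's lifetime. A logger re-created inside a spawned worker therefore appends to the existing log rather than truncating it, and no open handle needs to cross a process boundary; flush() accordingly only flushes the console. A minimal self-contained sketch of the pattern (an illustrative class, not this repository's _StdoutLogger):

    import datetime


    class AppendLogger:
        """Log messages by re-opening the file in append mode each time."""

        def __init__(self, file_name):
            # store only the path; no handle is kept open
            self._file_name = file_name

        def write(self, msg):
            time_tag = datetime.datetime.utcnow().strftime("%Y-%m-%d, %H:%M:%S.%f")
            with open(self._file_name, "a") as logfile:
                print(f"{time_tag}: {msg}", file=logfile)


    logger = AppendLogger("experiment.log")
    logger.write("works the same from any process that knows the path")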

src/provable_pruning/provable_pruning/util/logging/train.py
Lines changed: 28 additions & 9 deletions

@@ -4,6 +4,7 @@
 import math
 
 from torch.utils import tensorboard as tb
+from .stdout import setup_stdout
 from .tensorboard import log_scalar
 
 
@@ -16,14 +17,23 @@ class TrainLogger(object):
     multiprocessing context.
     """
 
-    def __init__(self, log_dir=None, global_tag=None, class_to_names=None):
+    def __init__(
+        self,
+        log_dir=None,
+        stdout_file=None,
+        global_tag=None,
+        class_to_names=None,
+    ):
         """Initialize the train logger.
 
         If the optional arguments are not supplied, the logger will print
-        updates about the training progress but won't log it to tensorboard
+        updates about the training progress but won't log it to tensorboard or
+        log it to a file
         """
         self._global_tag = global_tag
         self._logdir = log_dir
+        self._stdout_file = stdout_file
+        self._stdout_init = False
         self._diagnostics_step = 20 if "imagenet" in self._logdir else 50
         self._class_to_names = class_to_names
 
@@ -80,8 +90,9 @@ def initialize(
         s_idx=None,
     ):
         """Initialize the logger for the current (re-)training session."""
-        # reset the writer
+        # reset the writer and logger
         self._writer = None
+        self._stdout_init = False
 
         # setup parameters
         if self._class_to_names is None:
@@ -171,7 +182,7 @@ def train_diagnostics(
         self._t_last_print = time.time()
 
         # print progress
-        print(
+        self._print(
             self._progress_str.format(
                 epoch + 1, step, loss, acc1 * 100.0, acc5 * 100.0, t_elapsed
             )
@@ -225,9 +236,9 @@ def test_diagnostics(self, epoch, loss, acc1, acc5):
         """Finish test statistics computations and store them."""
         # store statistics
         self.test_epoch.append(epoch)
-        self.test_acc1.append(loss)
-        self.test_acc5.append(acc1)
-        self.test_loss.append(acc5)
+        self.test_loss.append(float(loss))
+        self.test_acc1.append(acc1)
+        self.test_acc5.append(acc5)
 
         # get the writer
         writer = self._get_writer()
@@ -268,7 +279,7 @@ def test_diagnostics(self, epoch, loss, acc1, acc5):
         )
 
         # print progress
-        print(
+        self._print(
             self._test_str.format(
                 self.test_epoch[-1] + 1,
                 self.test_loss[-1],
@@ -280,8 +291,16 @@
     def epoch_diagnostics(self, t_total, t_loading, t_optim, t_enforce, t_log):
         """Print diagnostics around the timing of one epoch."""
         t_remaining = t_total - sum([t_loading, t_optim, t_enforce, t_log])
-        print(
+        self._print(
             self._timing_str.format(
                 t_total, t_loading, t_optim, t_enforce, t_log, t_remaining
             )
         )
+
+    def _print(self, value):
+        """Print and ensure we are also printing to file."""
+        if not self._stdout_init and self._stdout_file is not None:
+            stdout = setup_stdout(self._stdout_file)
+            stdout.write(" " * 200, name=self.name)
+            self._stdout_init = True
+        print(value)
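
Note on the new _print helper above: with the "spawn" start method, a sys.stdout replacement installed in the parent process is not inherited by worker processes (each worker is a fresh interpreter), so the logger lazily re-runs setup_stdout on first print inside whichever process ends up calling it, and initialize() resets the flag for each (re-)training session. The test_diagnostics hunk is a separate genuine bugfix: loss, acc1, and acc5 were previously appended to the wrong lists. A minimal sketch of the lazy per-process re-initialization idea (hypothetical names, not the repository's API):

    import sys


    class _Tee:
        """Write to the original stream and mirror each message to a file."""

        def __init__(self, stream, log_file):
            self._stream = stream
            self._log_file = log_file

        def write(self, msg):
            self._stream.write(msg)
            with open(self._log_file, "a") as logfile:
                logfile.write(msg)

        def flush(self):
            self._stream.flush()


    class LazyLogger:
        """Install the stdout tee on first use inside the current process."""

        def __init__(self, log_file=None):
            self._log_file = log_file
            self._initialized = False

        def log(self, value):
            # defer installing the tee until we know which process prints
            if not self._initialized and self._log_file is not None:
                sys.stdout = _Tee(sys.stdout, self._log_file)
                self._initialized = True
            print(value)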

src/provable_pruning/provable_pruning/util/train.py
Lines changed: 20 additions & 14 deletions

@@ -315,7 +315,7 @@ def _train_procedure(
         torch.set_grad_enabled(True)
 
         # setup torch.distributed and spawn processes
-        num_workers = self.train_loader.num_workers // self.num_gpus
+        num_workers = self.train_loader.num_workers // max(self.num_gpus, 1)
 
         # empty gpu cache to make sure everything is ready for retraining
         torch.cuda.empty_cache()
@@ -456,6 +456,9 @@ def train_with_worker(
             file_name_checkpoint, net_handle, optimizer, loc
         )
 
+        # wait for all processes to load the checkpoint
+        dist.barrier()
+
     # this may be non-zero in the case of rewinding ...
     if not found_checkpoint:
         start_epoch = params["startEpoch"]
@@ -476,9 +479,6 @@ def train_with_worker(
     if not is_cpu:
         cudnn.benchmark = True
 
-    # switch to train mode
-    net_parallel.train()
-
     # convenience function for storing check points
     def store_checkpoints(epoch):
         # save checkpoint at the end of every epoch with 0 worker
@@ -518,15 +518,14 @@ def store_checkpoints(epoch):
         )
 
         # test after one epoch
-        if gpu_id == 0 and train_logger is not None:
-            _test_one_epoch(
-                loader=test_loader,
-                criterion=criterion,
-                epoch=epoch,
-                device=worker_device,
-                net=net_parallel,
-                train_logger=train_logger,
-            )
+        _test_one_epoch(
+            loader=test_loader,
+            criterion=criterion,
+            epoch=epoch,
+            device=worker_device,
+            net=net_parallel,
+            train_logger=train_logger if gpu_id == 0 else None,
+        )
 
     # store final checkpoint
     store_checkpoints(params["numEpochs"])
@@ -540,6 +539,7 @@ def store_checkpoints(epoch):
 
     # destroy process group at the end
     if is_distributed:
+        dist.barrier()
         dist.destroy_process_group()
 
 
@@ -561,6 +561,9 @@ def _train_one_epoch(
     t_enforce = 0.0
     t_log = 0.0
 
+    # switch to train mode
+    net_parallel.train()
+
     # go through one epoch and train
     for i, (images, targets) in enumerate(train_loader):
 
@@ -624,6 +627,9 @@ def _test_one_epoch(loader, criterion, epoch, device, net, train_logger=None):
     loss = 0
     num_total = 0
 
+    # switch to eval mode
+    net.eval()
+
     with torch.no_grad():
         for images, targets in loader:
             # move to correct device
@@ -647,7 +653,7 @@ def _test_one_epoch(loader, criterion, epoch, device, net, train_logger=None):
     acc5 /= num_total
     loss /= num_total
 
-    # make sure loss is also a regular float (not torch.Tensor)/s
+    # make sure loss is also a regular float (not torch.Tensor)
     loss = float(loss)
 
     if train_logger is not None:
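
Note on the fix above, the heart of this commit: collective operations in torch.distributed block until every rank of the process group participates, so guarding a code path that contains one behind gpu_id == 0 stalls the whole job: rank 0 waits inside the collective while the other ranks run ahead or exit. Here, running the DDP-wrapped net_parallel forward on rank 0 only can deadlock, since DDP's forward can broadcast buffers across all ranks. The change therefore has every rank enter _test_one_epoch and gates only the logging on rank 0, and the added dist.barrier() calls keep ranks in lockstep after checkpoint loading and before destroy_process_group. Relatedly, net_parallel.train() now runs at the start of every training epoch and net.eval() at the start of every test pass, so each phase executes in the correct mode, and max(self.num_gpus, 1) guards the worker division against num_gpus == 0 on CPU-only runs. A minimal CPU reproduction of the stall pattern and its fix, using the gloo backend (a hypothetical standalone example, not code from this repository):

    import os

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp


    def worker(rank, world_size):
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = "29500"
        dist.init_process_group("gloo", rank=rank, world_size=world_size)

        # BAD: if only rank 0 reached a collective, e.g.
        #     if rank == 0:
        #         dist.all_reduce(result)
        # rank 0 would block forever waiting for the other ranks.

        # GOOD (the pattern of the fix): every rank runs the collective,
        # and only rank 0 does the rank-specific work such as logging.
        result = torch.tensor([float(rank)])
        dist.all_reduce(result)  # all ranks must participate
        if rank == 0:
            print(f"sum over ranks: {result.item()}")

        dist.barrier()  # keep ranks in lockstep before tearing down
        dist.destroy_process_group()


    if __name__ == "__main__":
        world_size = 2
        mp.spawn(worker, args=(world_size,), nprocs=world_size)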
