From dba62863822a767a04548f11810d06779c200b8b Mon Sep 17 00:00:00 2001
From: Gabry
Date: Fri, 28 Jun 2024 13:56:16 +0200
Subject: [PATCH 1/3] add wandb log

---
 .gitignore        |  1 +
 micromind/core.py | 28 +++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 628f31fb..9b402724 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,6 +133,7 @@ results/
 ckp/
 checkpoints/
 *.swp
+wandb/
 
 Dockerfile
 build_dgx.sh
diff --git a/micromind/core.py b/micromind/core.py
index 2f887beb..40e177c2 100644
--- a/micromind/core.py
+++ b/micromind/core.py
@@ -25,11 +25,14 @@
 
 # This is used ONLY if you are not using argparse to get the hparams
 default_cfg = {
+    "project_name": "micromind",
     "output_folder": "results",
     "experiment_name": "micromind_exp",
    "opt": "adam",  # this is ignored if you are overriding the configure_optimizers
    "lr": 0.001,  # this is ignored if you are overriding the configure_optimizers
     "debug": False,
+    "log_wandb": False,
+    "wandb_resume": 'auto' # Resume run if prev crashed, otherwise new run. ["allow", "must", "never", "auto" or None]
 }
 
 
@@ -381,7 +384,8 @@ def compute_macs(self, input_shape: Union[List, Tuple]):
 
     def on_train_start(self):
         """Initializes the optimizer, modules and puts the networks on the right
-        devices. Optionally loads checkpoint if already present.
+        devices. Optionally loads checkpoint if already present. It also start wandb 
+        logger if selected.
 
         This function gets executed at the beginning of every training.
         """
@@ -389,6 +393,17 @@ def on_train_start(self):
         # pass debug status to checkpointer
         self.checkpointer.debug = self.hparams.debug
 
+        if self.hparams.log_wandb:
+            import wandb
+
+            self.wlog = wandb.init(
+                project=self.hparams.project_name, 
+                name=self.hparams.experiment_name,
+                resume=self.hparams.wandb_resume,
+                id=self.hparams.experiment_name,
+                config=self.hparams
+            )
+
         init_opt = self.configure_optimizers()
         if isinstance(init_opt, list) or isinstance(init_opt, tuple):
             self.opt, self.lr_sched = init_opt
@@ -449,6 +464,8 @@ def init_devices(self):
 
     def on_train_end(self):
         """Runs at the end of each training. Cleans up before exiting."""
+        if self.hparams.log_wandb:
+            self.wlog.finish()
         pass
 
     def eval(self):
@@ -531,6 +548,9 @@ def train(
             # ok for cos_lr
             self.lr_sched.step()
 
+            if self.hparams.log_wandb:
+                self.wlog.log({"lr": self.lr_sched.get_last_lr()})
+
             for m in self.metrics:
                 if (
                     self.current_epoch + 1
@@ -560,6 +580,9 @@ def train(
 
             train_metrics.update({"train_loss": loss_epoch / (idx + 1)})
 
+            if self.hparams.log_wandb: # wandb log train loss
+                self.wlog.log(train_metrics) 
+
             if "val" in datasets:
                 val_metrics = self.validate()
                 if (
@@ -574,6 +597,9 @@ def train(
             else:
                 val_metrics = train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
 
+            if self.hparams.log_wandb: # wandb log val loss
+                self.wlog.log(val_metrics)
+
             if e >= 1 and self.debug:
                 break
 

From 95579d0998e29090bdc6bbca3b8bec6eca897216 Mon Sep 17 00:00:00 2001
From: Gabry
Date: Fri, 28 Jun 2024 14:23:18 +0200
Subject: [PATCH 2/3] fix_flake8

---
 micromind/core.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/micromind/core.py b/micromind/core.py
index 40e177c2..53c64f62 100644
--- a/micromind/core.py
+++ b/micromind/core.py
@@ -32,7 +32,7 @@
    "lr": 0.001,  # this is ignored if you are overriding the configure_optimizers
     "debug": False,
     "log_wandb": False,
-    "wandb_resume": 'auto' # Resume run if prev crashed, otherwise new run. ["allow", "must", "never", "auto" or None]
+    "wandb_resume": "auto",  # ["allow", "must", "never", "auto" or None]
 }
 
 
@@ -384,7 +384,7 @@ def compute_macs(self, input_shape: Union[List, Tuple]):
 
     def on_train_start(self):
         """Initializes the optimizer, modules and puts the networks on the right
-        devices. Optionally loads checkpoint if already present. It also start wandb 
+        devices. Optionally loads checkpoint if already present. It also start wandb
         logger if selected.
 
         This function gets executed at the beginning of every training.
@@ -397,11 +397,11 @@ def on_train_start(self):
             import wandb
 
             self.wlog = wandb.init(
-                project=self.hparams.project_name, 
+                project=self.hparams.project_name,
                 name=self.hparams.experiment_name,
                 resume=self.hparams.wandb_resume,
                 id=self.hparams.experiment_name,
-                config=self.hparams
+                config=self.hparams,
             )
 
         init_opt = self.configure_optimizers()
@@ -580,8 +580,8 @@ def train(
 
             train_metrics.update({"train_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb: # wandb log train loss
-                self.wlog.log(train_metrics) 
+            if self.hparams.log_wandb:  # wandb log train loss
+                self.wlog.log(train_metrics)
 
             if "val" in datasets:
                 val_metrics = self.validate()
@@ -597,7 +597,7 @@ def train(
             else:
                 val_metrics = train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb: # wandb log val loss
+            if self.hparams.log_wandb:  # wandb log val loss
                 self.wlog.log(val_metrics)
 
             if e >= 1 and self.debug:

From 1958830c7a06e31d183aff8da57b01da960417cd Mon Sep 17 00:00:00 2001
From: Gabry
Date: Wed, 3 Jul 2024 11:01:51 +0200
Subject: [PATCH 3/3] compact log in one check

---
 micromind/core.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/micromind/core.py b/micromind/core.py
index 53c64f62..9e94aebf 100644
--- a/micromind/core.py
+++ b/micromind/core.py
@@ -580,9 +580,6 @@ def train(
 
             train_metrics.update({"train_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb:  # wandb log train loss
-                self.wlog.log(train_metrics)
-
             if "val" in datasets:
                 val_metrics = self.validate()
                 if (
@@ -597,7 +594,8 @@ def train(
             else:
                 val_metrics = train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb:  # wandb log val loss
+            if self.hparams.log_wandb:  # wandb log
+                self.wlog.log(train_metrics)
                 self.wlog.log(val_metrics)
 
             if e >= 1 and self.debug:
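
Note for reviewers (not part of the patches above): a minimal sketch of the hparams a user might pass to try the new logging path. The dictionary simply mirrors the default_cfg keys introduced by PATCH 1/3; how it is handed to the training class (argparse overrides or the default_cfg fallback) is an assumption here and is not shown in this diff.

    # Hypothetical usage sketch: keys mirror default_cfg in micromind/core.py
    # after PATCH 1/3; project_name, log_wandb and wandb_resume are the new ones.
    hparams = {
        "project_name": "micromind",         # wandb project (new)
        "output_folder": "results",
        "experiment_name": "micromind_exp",  # reused as wandb run name and run id
        "opt": "adam",
        "lr": 0.001,
        "debug": False,
        "log_wandb": True,                   # opt in to wandb logging (new)
        "wandb_resume": "auto",              # "allow", "must", "never", "auto" or None (new)
    }

With log_wandb left at its default of False, the guarded "import wandb" in on_train_start is never executed, so the dependency stays optional.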