From dba62863822a767a04548f11810d06779c200b8b Mon Sep 17 00:00:00 2001
From: Gabry
Date: Fri, 28 Jun 2024 13:56:16 +0200
Subject: [PATCH 1/3] add wandb log

---
 .gitignore        |  1 +
 micromind/core.py | 28 +++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 628f31fb..9b402724 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,6 +133,7 @@ results/
 ckp/
 checkpoints/
 *.swp
+wandb/
 
 Dockerfile
 build_dgx.sh
diff --git a/micromind/core.py b/micromind/core.py
index 2f887beb..40e177c2 100644
--- a/micromind/core.py
+++ b/micromind/core.py
@@ -25,11 +25,14 @@
 
 # This is used ONLY if you are not using argparse to get the hparams
 default_cfg = {
+    "project_name": "micromind",
     "output_folder": "results",
     "experiment_name": "micromind_exp",
    "opt": "adam",  # this is ignored if you are overriding the configure_optimizers
    "lr": 0.001,  # this is ignored if you are overriding the configure_optimizers
     "debug": False,
+    "log_wandb": False,
+    "wandb_resume": 'auto' # Resume run if prev crashed, otherwise new run. ["allow", "must", "never", "auto" or None]
 }
 
 
@@ -381,7 +384,8 @@ def compute_macs(self, input_shape: Union[List, Tuple]):
 
     def on_train_start(self):
         """Initializes the optimizer, modules and puts the networks on the right
-        devices. Optionally loads checkpoint if already present.
+        devices. Optionally loads checkpoint if already present. It also start wandb 
+        logger if selected.
 
         This function gets executed at the beginning of every training.
         """
@@ -389,6 +393,17 @@ def on_train_start(self):
         # pass debug status to checkpointer
         self.checkpointer.debug = self.hparams.debug
 
+        if self.hparams.log_wandb:
+            import wandb
+
+            self.wlog = wandb.init(
+                project=self.hparams.project_name, 
+                name=self.hparams.experiment_name,
+                resume=self.hparams.wandb_resume,
+                id=self.hparams.experiment_name,
+                config=self.hparams
+            )
+
         init_opt = self.configure_optimizers()
         if isinstance(init_opt, list) or isinstance(init_opt, tuple):
             self.opt, self.lr_sched = init_opt
@@ -449,6 +464,8 @@ def init_devices(self):
 
     def on_train_end(self):
         """Runs at the end of each training. Cleans up before exiting."""
+        if self.hparams.log_wandb:
+            self.wlog.finish()
         pass
 
     def eval(self):
@@ -531,6 +548,9 @@ def train(
             # ok for cos_lr
             self.lr_sched.step()
 
+            if self.hparams.log_wandb:
+                self.wlog.log({"lr": self.lr_sched.get_last_lr()})
+
             for m in self.metrics:
                 if (
                     self.current_epoch + 1
@@ -560,6 +580,9 @@ def train(
 
             train_metrics.update({"train_loss": loss_epoch / (idx + 1)})
 
+            if self.hparams.log_wandb: # wandb log train loss
+                self.wlog.log(train_metrics) 
+
             if "val" in datasets:
                 val_metrics = self.validate()
                 if (
@@ -574,6 +597,9 @@ def train(
             else:
                 val_metrics = train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
 
+            if self.hparams.log_wandb: # wandb log val loss
+                self.wlog.log(val_metrics)
+
             if e >= 1 and self.debug:
                 break
 

From 95579d0998e29090bdc6bbca3b8bec6eca897216 Mon Sep 17 00:00:00 2001
From: Gabry
Date: Fri, 28 Jun 2024 14:23:18 +0200
Subject: [PATCH 2/3] fix_flake8

---
 micromind/core.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/micromind/core.py b/micromind/core.py
index 40e177c2..53c64f62 100644
--- a/micromind/core.py
+++ b/micromind/core.py
@@ -32,7 +32,7 @@
    "lr": 0.001,  # this is ignored if you are overriding the configure_optimizers
     "debug": False,
     "log_wandb": False,
-    "wandb_resume": 'auto' # Resume run if prev crashed, otherwise new run. ["allow", "must", "never", "auto" or None]
+    "wandb_resume": "auto",  # ["allow", "must", "never", "auto" or None]
 }
 
 
@@ -384,7 +384,7 @@ def compute_macs(self, input_shape: Union[List, Tuple]):
 
     def on_train_start(self):
         """Initializes the optimizer, modules and puts the networks on the right
-        devices. Optionally loads checkpoint if already present. It also start wandb 
+        devices. Optionally loads checkpoint if already present. It also start wandb
         logger if selected.
 
         This function gets executed at the beginning of every training.
@@ -397,11 +397,11 @@ def on_train_start(self):
             import wandb
 
             self.wlog = wandb.init(
-                project=self.hparams.project_name, 
+                project=self.hparams.project_name,
                 name=self.hparams.experiment_name,
                 resume=self.hparams.wandb_resume,
                 id=self.hparams.experiment_name,
-                config=self.hparams
+                config=self.hparams,
             )
 
         init_opt = self.configure_optimizers()
@@ -580,8 +580,8 @@ def train(
 
             train_metrics.update({"train_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb: # wandb log train loss
-                self.wlog.log(train_metrics) 
+            if self.hparams.log_wandb:  # wandb log train loss
+                self.wlog.log(train_metrics)
 
             if "val" in datasets:
                 val_metrics = self.validate()
@@ -597,7 +597,7 @@ def train(
             else:
                 val_metrics = train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb: # wandb log val loss
+            if self.hparams.log_wandb:  # wandb log val loss
                 self.wlog.log(val_metrics)
 
             if e >= 1 and self.debug:

From 1958830c7a06e31d183aff8da57b01da960417cd Mon Sep 17 00:00:00 2001
From: Gabry
Date: Wed, 3 Jul 2024 11:01:51 +0200
Subject: [PATCH 3/3] compact log in one check

---
 micromind/core.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/micromind/core.py b/micromind/core.py
index 53c64f62..9e94aebf 100644
--- a/micromind/core.py
+++ b/micromind/core.py
@@ -580,9 +580,6 @@ def train(
 
             train_metrics.update({"train_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb:  # wandb log train loss
-                self.wlog.log(train_metrics)
-
             if "val" in datasets:
                 val_metrics = self.validate()
                 if (
@@ -597,7 +594,8 @@ def train(
             else:
                 val_metrics = train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
 
-            if self.hparams.log_wandb:  # wandb log val loss
+            if self.hparams.log_wandb:  # wandb log
+                self.wlog.log(train_metrics)
                 self.wlog.log(val_metrics)
 
             if e >= 1 and self.debug:
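
Note for reviewers (not part of the patches above): a minimal sketch of the hparams a user might pass to try the new logging path. The dictionary simply mirrors the default_cfg keys introduced by PATCH 1/3; how it is handed to the training class (argparse overrides or the default_cfg fallback) is an assumption here and is not shown in this diff.

    # Hypothetical usage sketch: keys mirror default_cfg in micromind/core.py
    # after PATCH 1/3; project_name, log_wandb and wandb_resume are the new ones.
    hparams = {
        "project_name": "micromind",         # wandb project (new)
        "output_folder": "results",
        "experiment_name": "micromind_exp",  # reused as wandb run name and run id
        "opt": "adam",
        "lr": 0.001,
        "debug": False,
        "log_wandb": True,                   # opt in to wandb logging (new)
        "wandb_resume": "auto",              # "allow", "must", "never", "auto" or None (new)
    }

With log_wandb left at its default of False, the guarded "import wandb" in on_train_start is never executed, so the dependency stays optional.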