Commit 910dd3b

Merge pull request #731 from marrlab/list_inner_product
stable loss agg, keep batch structure, dial, List inner product
2 parents 5e5820b + 3845d43 commit 910dd3b

8 files changed: +40 −25 lines changed

domainlab/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -8,7 +8,13 @@
 g_inst_component_loss_agg = torch.sum
 g_tensor_batch_agg = torch.sum
 g_list_loss_agg = sum
-g_list_model_penalized_reg_agg = sum
+
+def g_list_model_penalized_reg_agg(list_penalized_reg):
+    """
+    aggregate along the list, but do not diminish the batch structure of the tensor
+    """
+    return torch.stack(list_penalized_reg, dim=0).sum(dim=0)
+
 g_str_cross_entropy_agg = "none"
 # component loss refers to aggregation of pixel loss, digit of KL divergences loss
 # instance loss currently use torch.sum, which is the same effect as torch.mean, the
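A brief sketch of what the new aggregator does on toy per-sample regularization tensors (batch size and values are illustrative, not from the repository): torch.stack collects the regularizers into shape (num_regularizers, batch_size) and sum(dim=0) collapses only the regularizer axis, so the batch dimension survives.

    import torch

    def g_list_model_penalized_reg_agg(list_penalized_reg):
        # same aggregation as in domainlab/__init__.py: stack to
        # (num_regularizers, batch_size), then sum over the regularizer axis only
        return torch.stack(list_penalized_reg, dim=0).sum(dim=0)

    batch_size = 4  # illustrative
    list_penalized_reg = [torch.ones(batch_size), 2 * torch.ones(batch_size)]
    out = g_list_model_penalized_reg_agg(list_penalized_reg)
    assert out.shape == (batch_size,)                      # batch structure preserved
    assert torch.allclose(out, 3 * torch.ones(batch_size))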

domainlab/algos/trainers/a_trainer.py

Lines changed: 8 additions & 2 deletions
@@ -217,8 +217,14 @@ def cal_reg_loss(self, tensor_x, tensor_y, tensor_d, others=None):
         """
         list_reg_model, list_mu_model = self.decoratee.cal_reg_loss(
             tensor_x, tensor_y, tensor_d, others)
-        list_reg, list_mu = self._cal_reg_loss(tensor_x, tensor_y, tensor_d, others)
-        return list_reg_model + list_reg, list_mu_model + list_mu
+        assert len(list_reg_model) == len(list_mu_model)
+
+        list_reg_trainer, list_mu_trainer = self._cal_reg_loss(tensor_x, tensor_y, tensor_d, others)
+        assert len(list_reg_trainer) == len(list_mu_trainer)
+
+        list_loss = list_reg_model + list_reg_trainer
+        list_mu = list_mu_model + list_mu_trainer
+        return list_loss, list_mu

     def _cal_reg_loss(self, tensor_x, tensor_y, tensor_d, others=None):
         """

domainlab/algos/trainers/train_basic.py

Lines changed: 10 additions & 10 deletions
@@ -57,14 +57,17 @@ def after_epoch(self, epoch):
         assert flag_stop is not None
         return flag_stop

-    def log_r_loss(self, list_b_reg_loss):
+    def log_loss(self, list_b_reg_loss, loss_task, loss):
         """
         just for logging the self.epo_reg_loss_tr
         """
+        self.epo_task_loss_tr += loss_task.sum().detach().item()
+        #
         list_b_reg_loss_sumed = [ele.sum().detach().item()
                                  for ele in list_b_reg_loss]
         self.epo_reg_loss_tr = list(map(add, self.epo_reg_loss_tr,
                                         list_b_reg_loss_sumed))
+        self.epo_loss_tr += loss.detach().item()

     def tr_batch(self, tensor_x, tensor_y, tensor_d, others, ind_batch, epoch):
         """
@@ -78,7 +81,6 @@ def tr_batch(self, tensor_x, tensor_y, tensor_d, others, ind_batch, epoch):
         loss = self.cal_loss(tensor_x, tensor_y, tensor_d, others)
         loss.backward()
         self.optimizer.step()
-        self.epo_loss_tr += loss.detach().item()
         self.after_batch(epoch, ind_batch)
         self.counter_batch += 1

@@ -88,15 +90,13 @@ def cal_loss(self, tensor_x, tensor_y, tensor_d, others):
         """
         loss_task = self.model.cal_task_loss(tensor_x, tensor_y)

-        # only for logging
-        self.epo_task_loss_tr += loss_task.sum().detach().item()
-        #
-        list_reg_tr, list_mu_tr = self.cal_reg_loss(tensor_x, tensor_y,
+        list_reg_tr_batch, list_mu_tr = self.cal_reg_loss(tensor_x, tensor_y,
                                                           tensor_d, others)
-        #
-        self.log_r_loss(list_reg_tr)  # just for logging
-        reg_tr = self.model.inner_product(list_reg_tr, list_mu_tr)
+        tensor_batch_reg_loss_penalized = self.model.list_inner_product(
+            list_reg_tr_batch, list_mu_tr)
+        assert len(tensor_batch_reg_loss_penalized.shape) == 1
         loss_erm_agg = g_tensor_batch_agg(loss_task)
-        loss_reg_agg = g_tensor_batch_agg(reg_tr)
+        loss_reg_agg = g_tensor_batch_agg(tensor_batch_reg_loss_penalized)
         loss = self.model.multiplier4task_loss * loss_erm_agg + loss_reg_agg
+        self.log_loss(list_reg_tr_batch, loss_task, loss)
         return loss
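For orientation, a sketch of the loss composition that cal_loss now performs, assuming the task loss and the penalized regularization tensor are both 1-D over the mini-batch (tensor values and the multiplier below are illustrative):

    import torch

    g_tensor_batch_agg = torch.sum  # as defined in domainlab/__init__.py

    batch_size = 4
    loss_task = torch.rand(batch_size)                        # per-sample task loss
    tensor_batch_reg_loss_penalized = torch.rand(batch_size)  # per-sample penalized regularization
    assert len(tensor_batch_reg_loss_penalized.shape) == 1    # batch structure is still there

    multiplier4task_loss = 1.0  # illustrative value
    loss = multiplier4task_loss * g_tensor_batch_agg(loss_task) \
        + g_tensor_batch_agg(tensor_batch_reg_loss_penalized)
    # only this final aggregation collapses the batch dimension; logging happens afterwards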

domainlab/algos/trainers/train_dial.py

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@ def gen_adversarial(self, device, img_natural, vec_y):
         this is not necessarily constraint optimal due to nonlinearity,
         as the constraint epsilon is only considered ad-hoc
         """
-        # @FIXME: is there better way to initialize adversarial image?
         # ensure adversarial image not in computational graph
         steps_perturb = self.aconf.dial_steps_perturb
         scale = self.aconf.dial_noise_scale

domainlab/algos/trainers/train_matchdg.py

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ def tr_batch(self, epoch, batch_idx, x_e, y_e, d_e, others=None):
         if self.flag_erm:
             # decoratee can be both trainer or model
             list_loss_reg_rand, list_mu_reg = self.decoratee.cal_reg_loss(x_e, y_e, d_e, others)
-            loss_reg = self.model.inner_product(list_loss_reg_rand, list_mu_reg)
+            loss_reg = self.model.list_inner_product(list_loss_reg_rand, list_mu_reg)
             loss_task_rand = self.model.cal_task_loss(x_e, y_e)
             # loss_erm_rnd_loader, *_ = self.model.cal_loss(x_e, y_e, d_e, others)
             loss_erm_rnd_loader = loss_reg + loss_task_rand * self.model.multiplier4task_loss

domainlab/algos/trainers/train_mldg.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def tr_epoch(self, epoch):
             # since mldg's reg loss is on target domain,
             # no other trainer except hyperscheduler could decorate it unless we use state pattern
             # in the future to control source and target domain loader behavior
-            source_reg_tr = self.model.inner_product(list_source_reg_tr, list_source_mu_tr)
+            source_reg_tr = self.model.list_inner_product(list_source_reg_tr, list_source_mu_tr)
             # self.aconf.gamma_reg * loss_look_forward.sum()
             loss = loss_source_task.sum() + source_reg_tr.sum() +\
                 self.aconf.gamma_reg * loss_look_forward.sum()

domainlab/models/a_model.py

Lines changed: 12 additions & 8 deletions
@@ -43,21 +43,25 @@ def cal_loss(self, tensor_x, tensor_y, tensor_d=None, others=None):
         calculate the loss
         """
         list_loss, list_multiplier = self.cal_reg_loss(tensor_x, tensor_y, tensor_d, others)
-        loss_reg = self.inner_product(list_loss, list_multiplier)
+        loss_reg = self.list_inner_product(list_loss, list_multiplier)
         loss_task_alone = self.cal_task_loss(tensor_x, tensor_y)
         loss_task = self.multiplier4task_loss * loss_task_alone
         return loss_task + loss_reg, list_loss, loss_task_alone

-    def inner_product(self, list_loss_scalar, list_multiplier):
+    def list_inner_product(self, list_loss, list_multiplier):
         """
-        compute inner product between list of scalar loss and multiplier
-        - the first dimension of the tensor v_reg_loss is mini-batch
-          the second dimension is the number of regularizers
-        - the vector mmu has dimension the number of regularizers
+        compute inner product between list of regularization loss and multiplier
+        - the length of the list is the number of regularizers
+        - for each element of the list: the first dimension of the tensor is mini-batch
+        return value of list_inner_product should keep the minibatch structure, thus aggregation
+        here only aggregate along the list
         """
-        list_tuple = zip(list_loss_scalar, list_multiplier)
+        list_tuple = zip(list_loss, list_multiplier)
         list_penalized_reg = [mtuple[0]*mtuple[1] for mtuple in list_tuple]
-        return g_list_model_penalized_reg_agg(list_penalized_reg)
+        tensor_batch_penalized_loss = g_list_model_penalized_reg_agg(list_penalized_reg)
+        # return value of list_inner_product should keep the minibatch structure, thus aggregation
+        # here only aggregate along the list
+        return tensor_batch_penalized_loss

     @abc.abstractmethod
     def cal_task_loss(self, tensor_x, tensor_y):
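To make the docstring concrete, a toy run of the same computation as list_inner_product (two hypothetical regularizers; the multiplier values are chosen arbitrarily):

    import torch

    batch_size = 4
    # two regularizers, each yielding one value per sample in the mini-batch
    list_loss = [torch.rand(batch_size), torch.rand(batch_size)]
    list_multiplier = [1.0, 0.1]

    list_penalized_reg = [loss * mu for loss, mu in zip(list_loss, list_multiplier)]
    # same aggregation as g_list_model_penalized_reg_agg: collapse the list,
    # keep the mini-batch dimension
    tensor_batch_penalized_loss = torch.stack(list_penalized_reg, dim=0).sum(dim=0)
    assert tensor_batch_penalized_loss.shape == (batch_size,)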

domainlab/models/a_model_classif.py

Lines changed: 1 addition & 1 deletion
@@ -216,4 +216,4 @@ def _cal_reg_loss(self, tensor_x, tensor_y, tensor_d, others=None):
         """
         device = tensor_x.device
         bsize = tensor_x.shape[0]
-        return [torch.zeros(bsize, 1).to(device)], [0.0]
+        return [torch.zeros(bsize).to(device)], [0.0]
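A short sketch of why the shape change matters (batch size is illustrative): the basic trainer now asserts that the aggregated penalized regularization tensor is 1-D, and a (batch, 1) placeholder would come out of the list aggregation with two dimensions.

    import torch

    bsize = 4

    def agg(list_penalized_reg):
        # list aggregation used by the models, keeping the batch dimension
        return torch.stack(list_penalized_reg, dim=0).sum(dim=0)

    new_zero_reg = torch.zeros(bsize)     # new shape: (batch,)
    old_zero_reg = torch.zeros(bsize, 1)  # previous shape: (batch, 1)

    assert len(agg([0.0 * new_zero_reg]).shape) == 1  # passes the trainer's assert
    assert len(agg([0.0 * old_zero_reg]).shape) == 2  # would have tripped it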
