
Commit 7769f63

Merge pull request #20553 from mvdbeek/live_fix_cry_emoji

[25.0] Propagate cached job output replacement to copies of outputs

2 parents 02179ac + b691b93

3 files changed: +79 −15
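The gist of the fix, as a minimal framework-free sketch (toy `Dataset` and `HDA` classes here, not the Galaxy model): when job caching swaps a previously computed dataset in behind an output HDA, the swap must also reach every HDA that was already copied from that output, e.g. one placed into a collection.

```python
# Toy model of the propagation added to HistoryDatasetAssociation.copy_from.


class Dataset:
    def __init__(self, file_name):
        self.file_name = file_name


class HDA:
    def __init__(self, dataset):
        self.dataset = dataset
        self.copied_to = []  # HDAs that were copied from this one

    def copy(self):
        child = HDA(self.dataset)
        self.copied_to.append(child)
        return child

    def copy_from(self, other):
        self.dataset = other.dataset
        # The fix: recurse into copies so they pick up the replacement too.
        for copied in self.copied_to:
            copied.copy_from(self)


output = HDA(Dataset("placeholder.dat"))
in_collection = output.copy()   # copy made before the cached job resolves
cached = HDA(Dataset("cached_output.dat"))
output.copy_from(cached)        # cached job replaces the output dataset
assert in_collection.dataset.file_name == "cached_output.dat"
```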

lib/galaxy/model/__init__.py

Lines changed: 30 additions & 14 deletions

@@ -127,6 +127,7 @@
     reconstructor,
     registry,
     relationship,
+    remote,
     validates,
 )
 from sqlalchemy.orm.attributes import flag_modified
@@ -5475,6 +5476,7 @@ class HistoryDatasetAssociation(DatasetInstance, HasTags, Dictifiable, UsesAnnot
     dataset_id: Mapped[Optional[int]]
     hidden_beneath_collection_instance: Mapped[Optional["HistoryDatasetCollectionAssociation"]]
     tags: Mapped[List["HistoryDatasetAssociationTagAssociation"]]
+    copied_to_history_dataset_associations: Mapped[List["HistoryDatasetAssociation"]]

     def __init__(
         self,
@@ -5562,6 +5564,9 @@ def copy_from(self, other_hda, new_dataset=None, include_tags=True, include_meta
             self.copy_tags_from(self.user, other_hda)
         self.dataset = new_dataset or other_hda.dataset
         self.copied_from_history_dataset_association_id = other_hda.id
+        for copied_hda in self.copied_to_history_dataset_associations:
+            copied_hda.copy_from(self, include_tags=include_tags, include_metadata=include_metadata)
+
         if old_dataset:
             old_dataset.full_delete()
@@ -11946,15 +11951,34 @@ def __repr__(self):
             lazy="joined",
             back_populates="history_associations",
         ),
+        copied_to_history_dataset_associations=relationship(
+            "HistoryDatasetAssociation",
+            primaryjoin=lambda: and_(
+                HistoryDatasetAssociation.id
+                == remote(HistoryDatasetAssociation.copied_from_history_dataset_association_id),
+                # Include dataset_id, not technically necessary but allows filtering early
+                # and avoid the need for an index on copied_from_history_dataset_association_id
+                HistoryDatasetAssociation.dataset_id == remote(HistoryDatasetAssociation.dataset_id),
+            ),
+            remote_side=lambda: [
+                HistoryDatasetAssociation.copied_from_history_dataset_association_id,
+                HistoryDatasetAssociation.dataset_id,
+            ],
+            back_populates="copied_from_history_dataset_association",
+        ),
         copied_from_history_dataset_association=relationship(
-            HistoryDatasetAssociation,
-            primaryjoin=(
-                HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id
-                == HistoryDatasetAssociation.table.c.id
+            "HistoryDatasetAssociation",
+            primaryjoin=lambda: and_(
+                HistoryDatasetAssociation.copied_from_history_dataset_association_id
+                == remote(HistoryDatasetAssociation.id),
+                HistoryDatasetAssociation.dataset_id == remote(HistoryDatasetAssociation.dataset_id),
             ),
-            remote_side=[HistoryDatasetAssociation.table.c.id],
-            uselist=False,
+            remote_side=lambda: [
+                HistoryDatasetAssociation.id,
+                HistoryDatasetAssociation.dataset_id,
+            ],
             back_populates="copied_to_history_dataset_associations",
+            uselist=False,
         ),
         copied_from_library_dataset_dataset_association=relationship(
             LibraryDatasetDatasetAssociation,
@@ -11964,14 +11988,6 @@ def __repr__(self):
             ),
             back_populates="copied_to_history_dataset_associations",
         ),
-        copied_to_history_dataset_associations=relationship(
-            HistoryDatasetAssociation,
-            primaryjoin=(
-                HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id
-                == HistoryDatasetAssociation.table.c.id
-            ),
-            back_populates="copied_from_history_dataset_association",
-        ),
         copied_to_library_dataset_dataset_associations=relationship(
             LibraryDatasetDatasetAssociation,
             primaryjoin=(

lib/galaxy/model/store/__init__.py

Lines changed: 13 additions & 1 deletion

@@ -531,6 +531,17 @@ def handle_dataset_object_edit(dataset_instance, dataset_attrs):

         model_class = dataset_attrs.get("model_class", "HistoryDatasetAssociation")
         if model_class == "HistoryDatasetAssociation":
+            # Check if this HDA should reuse a dataset from a copied-from HDA
+            reuse_dataset = None
+            copied_from_chain = dataset_attrs.get("copied_from_history_dataset_association_id_chain", [])
+            if copied_from_chain:
+                # Look for the source HDA in the current import set
+                copied_from_key = _copied_from_object_key(copied_from_chain, object_import_tracker.hdas_by_key)
+                if copied_from_key and copied_from_key in object_import_tracker.hdas_by_key:
+                    source_hda = object_import_tracker.hdas_by_key[copied_from_key]
+                    # Reuse the dataset from the source HDA
+                    reuse_dataset = source_hda.dataset
+
             # Create dataset and HDA.
             dataset_instance = model.HistoryDatasetAssociation(
                 name=dataset_attrs["name"],
@@ -545,7 +556,8 @@ def handle_dataset_object_edit(dataset_instance, dataset_attrs):
                 tool_version=metadata.get("tool_version"),
                 metadata_deferred=metadata_deferred,
                 history=history,
-                create_dataset=True,
+                create_dataset=reuse_dataset is None,
+                dataset=reuse_dataset,
                 flush=False,
                 sa_session=self.sa_session,
             )
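On the import side, the change reuses the source HDA's backing dataset instead of creating a fresh one whenever the copied-from chain resolves to an HDA in the same import. A hypothetical miniature with plain dicts (`make_hda`, `copied_from_chain`, and the key scheme are illustrative only, not the Galaxy store API):

```python
# Reuse the backing dataset when the copied-from source is part of the same
# import, mirroring create_dataset=reuse_dataset is None above.


class Dataset:
    pass


def make_hda(attrs, hdas_by_key):
    reuse_dataset = None
    # Walk the copied-from chain looking for a source HDA already imported.
    for key in attrs.get("copied_from_chain", []):
        source = hdas_by_key.get(key)
        if source is not None:
            reuse_dataset = source["dataset"]
            break
    hda = {
        "name": attrs["name"],
        # Only mint a new dataset when nothing could be reused.
        "dataset": reuse_dataset if reuse_dataset is not None else Dataset(),
    }
    hdas_by_key[attrs["key"]] = hda
    return hda


tracker = {}
src = make_hda({"key": "hda1", "name": "a", "copied_from_chain": []}, tracker)
cpy = make_hda({"key": "hda2", "name": "a copy", "copied_from_chain": ["hda1"]}, tracker)
assert cpy["dataset"] is src["dataset"]  # the copy shares the source's dataset
```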

lib/galaxy_test/api/test_tools.py

Lines changed: 36 additions & 0 deletions

@@ -1057,6 +1057,42 @@ def test_run_cat1_use_cached_job(self):
         assert len(filenames) == 3, filenames
         assert len(set(filenames)) <= 2, filenames

+    @skip_without_tool("cat1")
+    @requires_new_history
+    def test_run_cat1_use_cached_job_build_list(self):
+        with self.dataset_populator.test_history_for(self.test_run_cat1_use_cached_job) as history_id:
+            # Run simple non-upload tool with an input data parameter.
+            inputs = self._get_cat1_inputs(history_id)
+            outputs_one = self._run_cat1(history_id, inputs=inputs, assert_ok=True, wait_for_job=True)
+            outputs_two = self._run_cat1(
+                history_id, inputs=inputs, use_cached_job=False, assert_ok=True, wait_for_job=True
+            )
+            # Rename inputs. Job should still be cached since cat1 doesn't look at name attribute
+            self.dataset_populator.rename_dataset(inputs["input1"]["id"])
+            outputs_three = self._run_cat1(
+                history_id, inputs=inputs, use_cached_job=True, assert_ok=False, wait_for_job=False
+            ).json()
+            outputs_four = self._run(
+                "__BUILD_LIST__",
+                history_id=history_id,
+                inputs={"datasets_0|input": {"src": "hda", "id": outputs_three["outputs"][0]["id"]}},
+            ).json()
+            self.dataset_populator.wait_for_job(outputs_three["jobs"][0]["id"])
+            dataset_details = []
+            for output in [outputs_one, outputs_two, outputs_three]:
+                output_id = output["outputs"][0]["id"]
+                dataset_details.append(self._get(f"datasets/{output_id}").json())
+                assert self._get(f"jobs/{output['jobs'][0]['id']}/metrics").json()
+            filenames = [dd["file_name"] for dd in dataset_details]
+            assert len(filenames) == 3, filenames
+            assert len(set(filenames)) <= 2, filenames
+            hdca = self.dataset_populator.get_history_collection_details(
+                history_id, content_id=outputs_four["output_collections"][0]["id"]
+            )
+            assert self.dataset_populator.get_history_dataset_content(
+                history_id, content_id=hdca["elements"][0]["object"]["id"]
+            )
+
     @skip_without_tool("cat_list")
     @skip_without_tool("__SORTLIST__")
     def test_run_cat_list_hdca_sort_order_respecrted_use_cached_job(self):