From 9cf1e7e9d0ba9e3f0835423048a866d672ce2dba Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 24 Jun 2025 10:56:50 -0400 Subject: [PATCH 1/3] Propagate cached job output replacement to copies (made by database operation tools) Should fix the issue we had during Galaxy live, where we had a step in which we picked the first dataset collection element from a mapped over collection whose outputs were not yet replaced by the job cache. The effect of this is that the extracted dataset's dataset was deleted, and all downstream jobs were waiting for this dataset to become terminal. --- lib/galaxy/model/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index 1926212327ea..fe564aa7e3f0 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -5475,6 +5475,7 @@ class HistoryDatasetAssociation(DatasetInstance, HasTags, Dictifiable, UsesAnnot dataset_id: Mapped[Optional[int]] hidden_beneath_collection_instance: Mapped[Optional["HistoryDatasetCollectionAssociation"]] tags: Mapped[List["HistoryDatasetAssociationTagAssociation"]] + copied_to_history_dataset_associations: Mapped[List["HistoryDatasetAssociation"]] def __init__( self, @@ -5562,6 +5563,9 @@ def copy_from(self, other_hda, new_dataset=None, include_tags=True, include_meta self.copy_tags_from(self.user, other_hda) self.dataset = new_dataset or other_hda.dataset self.copied_from_history_dataset_association_id = other_hda.id + for copied_hda in self.copied_to_history_dataset_associations: + copied_hda.copy_from(self, include_tags=include_tags, include_metadata=include_metadata) + if old_dataset: old_dataset.full_delete() From d613247d23562ba1e820d04454f03c8b04425eb8 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Wed, 25 Jun 2025 10:48:37 -0400 Subject: [PATCH 2/3] Speed up copied_to_history_dataset_associations --- lib/galaxy/model/__init__.py | 40 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 14 
deletions(-) diff --git a/lib/galaxy/model/__init__.py b/lib/galaxy/model/__init__.py index fe564aa7e3f0..1195e5d05fc1 100644 --- a/lib/galaxy/model/__init__.py +++ b/lib/galaxy/model/__init__.py @@ -127,6 +127,7 @@ reconstructor, registry, relationship, + remote, validates, ) from sqlalchemy.orm.attributes import flag_modified @@ -11950,15 +11951,34 @@ def __repr__(self): lazy="joined", back_populates="history_associations", ), + copied_to_history_dataset_associations=relationship( + "HistoryDatasetAssociation", + primaryjoin=lambda: and_( + HistoryDatasetAssociation.id + == remote(HistoryDatasetAssociation.copied_from_history_dataset_association_id), + # Include dataset_id, not technically necessary but allows filtering early + # and avoid the need for an index on copied_from_history_dataset_association_id + HistoryDatasetAssociation.dataset_id == remote(HistoryDatasetAssociation.dataset_id), + ), + remote_side=lambda: [ + HistoryDatasetAssociation.copied_from_history_dataset_association_id, + HistoryDatasetAssociation.dataset_id, + ], + back_populates="copied_from_history_dataset_association", + ), copied_from_history_dataset_association=relationship( - HistoryDatasetAssociation, - primaryjoin=( - HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id - == HistoryDatasetAssociation.table.c.id + "HistoryDatasetAssociation", + primaryjoin=lambda: and_( + HistoryDatasetAssociation.copied_from_history_dataset_association_id + == remote(HistoryDatasetAssociation.id), + HistoryDatasetAssociation.dataset_id == remote(HistoryDatasetAssociation.dataset_id), ), - remote_side=[HistoryDatasetAssociation.table.c.id], - uselist=False, + remote_side=lambda: [ + HistoryDatasetAssociation.id, + HistoryDatasetAssociation.dataset_id, + ], back_populates="copied_to_history_dataset_associations", + uselist=False, ), copied_from_library_dataset_dataset_association=relationship( LibraryDatasetDatasetAssociation, @@ -11968,14 +11988,6 @@ def __repr__(self): ), 
back_populates="copied_to_history_dataset_associations", ), - copied_to_history_dataset_associations=relationship( - HistoryDatasetAssociation, - primaryjoin=( - HistoryDatasetAssociation.table.c.copied_from_history_dataset_association_id - == HistoryDatasetAssociation.table.c.id - ), - back_populates="copied_from_history_dataset_association", - ), copied_to_library_dataset_dataset_associations=relationship( LibraryDatasetDatasetAssociation, primaryjoin=( From a72048c35b55bb93255372a36a8565fc5822ddef Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 26 Jun 2025 22:56:09 -0400 Subject: [PATCH 3/3] Reuse dataset on history import --- lib/galaxy/model/store/__init__.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/model/store/__init__.py b/lib/galaxy/model/store/__init__.py index 0b092370afbe..92123494958b 100644 --- a/lib/galaxy/model/store/__init__.py +++ b/lib/galaxy/model/store/__init__.py @@ -531,6 +531,17 @@ def handle_dataset_object_edit(dataset_instance, dataset_attrs): model_class = dataset_attrs.get("model_class", "HistoryDatasetAssociation") if model_class == "HistoryDatasetAssociation": + # Check if this HDA should reuse a dataset from a copied-from HDA + reuse_dataset = None + copied_from_chain = dataset_attrs.get("copied_from_history_dataset_association_id_chain", []) + if copied_from_chain: + # Look for the source HDA in the current import set + copied_from_key = _copied_from_object_key(copied_from_chain, object_import_tracker.hdas_by_key) + if copied_from_key and copied_from_key in object_import_tracker.hdas_by_key: + source_hda = object_import_tracker.hdas_by_key[copied_from_key] + # Reuse the dataset from the source HDA + reuse_dataset = source_hda.dataset + # Create dataset and HDA. 
dataset_instance = model.HistoryDatasetAssociation( name=dataset_attrs["name"], @@ -545,7 +556,8 @@ def handle_dataset_object_edit(dataset_instance, dataset_attrs): tool_version=metadata.get("tool_version"), metadata_deferred=metadata_deferred, history=history, - create_dataset=True, + create_dataset=reuse_dataset is None, + dataset=reuse_dataset, flush=False, sa_session=self.sa_session, )