From 666a00df7845b140a1b06a39a7de7204ebfa3d3f Mon Sep 17 00:00:00 2001 From: nobody <249805557+InlinePizza@users.noreply.github.com> Date: Tue, 16 Dec 2025 15:54:57 -0800 Subject: [PATCH] Deduplicate fetch commands by SHA before parallel processing When running `git clone -vvv`, I noticed that the bundle for main was being downloaded twice in parallel. Git sends duplicate fetch commands for the same SHA when HEAD and a branch ref point to the same commit. These parallel downloads were clobbering each other's file writes. Deduplicate commands by SHA before processing to avoid redundant parallel downloads of the same bundle. --- git_remote_s3/remote.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/git_remote_s3/remote.py b/git_remote_s3/remote.py index fa154e5..1937aaf 100755 --- a/git_remote_s3/remote.py +++ b/git_remote_s3/remote.py @@ -483,17 +483,30 @@ def process_fetch_cmds(self, cmds): if not cmds: return - logger.info(f"Processing {len(cmds)} fetch commands in parallel") + # Deduplicate commands by SHA before processing. Git sends all fetch + # commands in a batch before the empty line delimiter, and may send + # duplicate commands for the same SHA (e.g., when HEAD and a branch + # ref point to the same commit). Deduplicating here avoids redundant + # parallel downloads of the same bundle. 
+ seen_shas = set() + unique_cmds = [] + for cmd in cmds: + sha = cmd.split(" ")[1] + if sha not in seen_shas: + seen_shas.add(sha) + unique_cmds.append(cmd) + + logger.info(f"Processing {len(unique_cmds)} unique fetch commands in parallel") # Use a thread pool to process fetch commands in parallel with concurrent.futures.ThreadPoolExecutor() as executor: # Submit all fetch commands to the thread pool - futures = [executor.submit(self.cmd_fetch, cmd) for cmd in cmds] + futures = [executor.submit(self.cmd_fetch, cmd) for cmd in unique_cmds] # Wait for all fetch commands to complete concurrent.futures.wait(futures) - logger.info(f"Completed processing {len(cmds)} fetch commands in parallel") + logger.info(f"Completed processing {len(unique_cmds)} fetch commands in parallel") def process_cmd(self, cmd: str): # noqa: C901 if cmd.startswith("fetch"):