diff --git a/e2e/cases/py-image-layer-symlinks-567/BUILD.bazel b/e2e/cases/py-image-layer-symlinks-567/BUILD.bazel new file mode 100644 index 00000000..2590ae05 --- /dev/null +++ b/e2e/cases/py-image-layer-symlinks-567/BUILD.bazel @@ -0,0 +1,26 @@ +load("@aspect_rules_py//py:defs.bzl", "py_binary", "py_image_layer") +load("@rules_shell//shell:sh_test.bzl", "sh_test") + +py_binary( + name = "main", + srcs = ["main.py"], +) + +py_image_layer( + name = "layers", + binary = ":main", +) + +# Extract the interpreter tar listing for the test to inspect. +genrule( + name = "interpreter_listing", + srcs = [":layers_interpreter"], + outs = ["interpreter_listing.txt"], + cmd = "tar -tvf $< 2>/dev/null > $@", +) + +sh_test( + name = "test_resolve_symlinks", + srcs = ["test_resolve_symlinks.sh"], + data = [":interpreter_listing"], +) diff --git a/e2e/cases/py-image-layer-symlinks-567/main.py b/e2e/cases/py-image-layer-symlinks-567/main.py new file mode 100644 index 00000000..5906af62 --- /dev/null +++ b/e2e/cases/py-image-layer-symlinks-567/main.py @@ -0,0 +1,2 @@ +"""Minimal py_binary for testing py_image_layer symlink preservation.""" +print("hello") diff --git a/e2e/cases/py-image-layer-symlinks-567/test_resolve_symlinks.sh b/e2e/cases/py-image-layer-symlinks-567/test_resolve_symlinks.sh new file mode 100755 index 00000000..b0f04c05 --- /dev/null +++ b/e2e/cases/py-image-layer-symlinks-567/test_resolve_symlinks.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Integration test: verify py_image_layer preserves symlinks in the interpreter tar. +# Validates the fix for https://github.com/aspect-build/rules_py/issues/567 +# +# Builds a real py_image_layer from a py_binary and checks that the interpreter +# tar contains symlink entries (e.g. python -> python3.x) instead of full copies. + +set -euo pipefail + +PASS=0 +FAIL=0 + +assert() { + local label="$1" + shift + if "$@"; then + PASS=$((PASS + 1)) + else + echo "FAIL: $label" + FAIL=$((FAIL + 1)) + fi +} + +# Find the interpreter listing in runfiles +LISTING="" +for f in $(find "$TEST_SRCDIR" -name 'interpreter_listing.txt' 2>/dev/null); do + LISTING="$f" + break +done + +if [ -z "$LISTING" ]; then + echo "FAIL: could not find interpreter_listing.txt in runfiles" + exit 1 +fi + +echo "Inspecting interpreter layer listing:" +grep '/bin/python' "$LISTING" +echo "" + +# python and python3 should be symlinks (tar -tv shows 'l' prefix for symlinks) +assert "bin/python is a symlink to python3.x" \ + grep -qE '^l.*bin/python -> python3\.' "$LISTING" + +assert "bin/python3 is a symlink to python3.x" \ + grep -qE '^l.*bin/python3 -> python3\.' "$LISTING" + +# The real python3.X binary should be a regular file (not a symlink) +assert "bin/python3.X is a regular file" \ + grep -qE '^-.*bin/python3\.[0-9]+ *$' "$LISTING" + +# No symlink should have an absolute target (would break in containers) +assert "no absolute symlink targets" \ + bash -c '! grep -qE "^l.* -> /." "$1"' _ "$LISTING" + +# The python entry should NOT be a large regular file (would mean no dedup) +assert "bin/python is not a 100MB+ regular file copy" \ + bash -c '! grep -qE "^-.* [0-9]{9,}.*bin/python *$" "$1"' _ "$LISTING" + +# --- Summary --- + +echo "Results: $PASS passed, $FAIL failed" +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/py/private/py_image_layer.bzl b/py/private/py_image_layer.bzl index 8f3d4d59..28645788 100644 --- a/py/private/py_image_layer.bzl +++ b/py/private/py_image_layer.bzl @@ -34,6 +34,54 @@ load("@bazel_lib//lib:transitions.bzl", "platform_transition_filegroup") load("@tar.bzl//tar:mtree.bzl", "mtree_mutate", "mtree_spec") load("@tar.bzl//tar:tar.bzl", "tar") +# Resolve filesystem-level symlinks in mtree specs so that tars preserve them. +# +# mtree_spec records every file as type=file with a distinct content= path, even +# when files are symlinks to each other (e.g. python/python3 -> python3.13 in the +# Python toolchain). Without this, the tar contains multiple full copies. +# +# Detection uses two-step readlink in the Bazel sandbox: +# 1. readlink(sandbox_path) -> absolute cache path (always succeeds in sandbox) +# 2. readlink(cache_path) -> non-empty only if the real file is a symlink +# Only relative targets from step 2 are converted; absolute targets are cross-repo +# ctx.actions.symlink references that would break in containers. +# +# A Starlark rule (not a genrule) is required because genrules cannot access +# runfiles from srcs — only DefaultInfo.default_runfiles gives sandbox access. +# +# See: https://github.com/aspect-build/rules_py/issues/567 + +_RESOLVE_SYMLINKS_SCRIPT = """\ +/type=file/ && /content=/ { + match($0, /content=[^ ]+/); split(substr($0, RSTART, RLENGTH), p, "=") + c = ""; cmd = "readlink \\\"" p[2] "\\\""; cmd | getline c; close(cmd) + if (c != "") { t = ""; cmd2 = "readlink \\\"" c "\\\""; cmd2 | getline t; close(cmd2) + if (t != "" && t !~ /^\\//) { sub(/type=file/, "type=link"); sub(/content=[^ ]+/, "link=" t) } + } +} 1""" + +def _resolve_symlinks_impl(ctx): + out = ctx.outputs.out + ctx.actions.run_shell( + command = "awk '%s' %s > %s" % (_RESOLVE_SYMLINKS_SCRIPT, ctx.file.mtree.path, out.path), + inputs = depset([ctx.file.mtree], transitive = [ + s[DefaultInfo].default_runfiles.files + for s in ctx.attr.srcs + ]), + outputs = [out], + ) + return [DefaultInfo(files = depset([out]))] + +_resolve_symlinks = rule( + doc = "Post-process an mtree spec to convert filesystem symlinks to type=link entries.", + implementation = _resolve_symlinks_impl, + attrs = { + "mtree": attr.label(mandatory = True, allow_single_file = True), + "srcs": attr.label_list(mandatory = True), + "out": attr.output(mandatory = True), + }, +) + default_layer_groups = { # match *only* external repositories containing a Python interpreter, # by matching the interpreter repo naming convention: @@ -158,23 +206,28 @@ def py_image_layer( # Produce the manifest for a tar file of our py_binary, but don't tar it up yet, so we can split # into fine-grained layers for better pull, push and remote cache performance. manifest_name = name + ".manifest" + mtree_spec( + name = manifest_name + ".raw", + srcs = [binary], + **kwargs + ) + _resolve_symlinks( + name = manifest_name + ".symlinks_resolved", + mtree = manifest_name + ".raw", + srcs = [binary], + out = manifest_name + ".symlinks_resolved.mtree", + ) if owner: - mtree_spec( - name = manifest_name + ".preprocessed", - srcs = [binary], - **kwargs - ) mtree_mutate( name = manifest_name, - mtree = name + ".manifest.preprocessed", + mtree = manifest_name + ".symlinks_resolved", owner = owner, group = group, ) else: - mtree_spec( + native.filegroup( name = manifest_name, - srcs = [binary], - **kwargs + srcs = [manifest_name + ".symlinks_resolved"], ) groups = dict(**layer_groups)