diff --git a/.github/scripts/dispatch.py b/.github/scripts/dispatch.py new file mode 100644 index 00000000..aba2308d --- /dev/null +++ b/.github/scripts/dispatch.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +""" +Dispatch build workflows for a kernel. + +Four entrypoints call this script: + 1. The PR-merge dispatch workflow (via CLI) + 2. The PR-open dispatch workflow (via CLI) + 3. The comment bot (via import) + 4. Local CLI invocation +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import tomllib +import urllib.error +import urllib.request +import uuid +from dataclasses import dataclass, field +from pathlib import Path + + +RELEASE_WORKFLOWS = [ + "build.yaml", + "build-mac.yaml", + "build-windows.yaml", +] + +KERNEL_NAME_RE = re.compile(r"^[A-Za-z0-9_-]+$") + + +@dataclass +class ReleaseDispatchResult: + kernel_name: str + dispatched: list[tuple[str, str]] = field(default_factory=list) # (workflow, dispatch_key) + failed: list[tuple[str, int]] = field(default_factory=list) # (workflow, http_code) + skipped: list[str] = field(default_factory=list) # workflow filenames + + +def github_api_request( + url: str, token: str, method: str = "GET", data: dict | None = None +): + body = None + if data is not None: + body = json.dumps(data).encode("utf-8") + + req = urllib.request.Request( + url=url, + data=body, + method=method, + headers={ + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {token}", + "X-GitHub-Api-Version": "2022-11-28", + "Content-Type": "application/json", + }, + ) + with urllib.request.urlopen(req) as resp: + return resp.status, resp.read().decode("utf-8") + + + +def get_token() -> str | None: + """Resolve GitHub token: env var first, then ``gh auth token`` fallback.""" + token = os.environ.get("GITHUB_TOKEN") + if token: + return token + try: + result = subprocess.run( + ["gh", "auth", "token"], + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() or None 
+ except (FileNotFoundError, subprocess.CalledProcessError): + return None + + +def get_repo() -> str | None: + """Resolve repository: GITHUB_REPOSITORY env var, or parse from git remote.""" + repo = os.environ.get("GITHUB_REPOSITORY") + if repo: + return repo + try: + result = subprocess.run( + ["git", "remote", "get-url", "origin"], + capture_output=True, + text=True, + check=True, + ) + url = result.stdout.strip() + match = re.search(r"github\.com[:/](.+?)(?:\.git)?$", url) + if match: + return match.group(1) + except (FileNotFoundError, subprocess.CalledProcessError): + pass + return None + + +BACKEND_TO_WORKFLOWS = { + "cuda": {"build.yaml", "build-windows.yaml"}, + "cpu": {"build.yaml"}, + "rocm": {"build.yaml"}, + "metal": {"build-mac.yaml"}, + "xpu": {"build.yaml", "build-windows.yaml"}, +} + +# Only these kernels are known to build successfully on Windows. +# Add new entries here as Windows support is validated for a kernel. +WINDOWS_KERNELS = { + "relu", + "activation", + "flash-attn2", +} + + +def read_backends(kernel_name: str) -> list[str] | None: + """Read the backends list from a kernel's build.toml. Returns None if not found.""" + build_toml = Path(kernel_name) / "build.toml" + if not build_toml.exists(): + return None + with open(build_toml, "rb") as f: + config = tomllib.load(f) + backends = config.get("general", {}).get("backends") + if backends is None: + backends = config.get("backends") + if isinstance(backends, list): + return backends + return None + + +def select_workflows(kernel_name: str) -> set[str]: + """ + Determine which build workflows to dispatch based on the kernel's + backends declared in build.toml. + + Mapping: + cuda, cpu, rocm -> build.yaml (Linux) + metal -> build-mac.yaml (macOS) + cuda, xpu -> build-windows.yaml (Windows, allowlisted kernels only) + + Falls back to all workflows if build.toml can't be read. 
+ """ + backends = read_backends(kernel_name) + if backends is None: + print(f"Could not read backends for {kernel_name}, dispatching all workflows") + return set(RELEASE_WORKFLOWS) + + workflows = set() + for b in backends: + workflows.update(BACKEND_TO_WORKFLOWS.get(b, set())) + + if not workflows: + print(f"No known backends found for {kernel_name}: {backends}, dispatching all workflows") + return set(RELEASE_WORKFLOWS) + + # Only dispatch Windows builds for kernels known to build there. + if "build-windows.yaml" in workflows and kernel_name not in WINDOWS_KERNELS: + workflows.discard("build-windows.yaml") + print(f"Skipping Windows build for {kernel_name} (not in WINDOWS_KERNELS allowlist)") + + return workflows + + +def dispatch_release( + kernel_name: str, + *, + token: str, + repo: str, + ref: str = "main", + mode: str = "release", + repo_prefix: str = "kernels-community", + dispatch_key_prefix: str = "", + dry_run: bool = False, + skip_build: bool = False, + pr_number: str = "", + target_branch: str = "", + upload: bool = True, +) -> ReleaseDispatchResult: + """ + Dispatch the appropriate build workflows for a kernel. + + Args: + kernel_name: Name of the kernel directory. + token: GitHub API token. + repo: GitHub repository in "owner/repo" format. + ref: Git ref to dispatch against (default "main"). + mode: Build mode - "pr" for CI builds, "release" for full builds. + repo_prefix: Hub org prefix for uploads (default "kernels-community"). + dispatch_key_prefix: Optional prefix for dispatch keys (e.g. "pr42-"). + dry_run: Print what would be dispatched without actually dispatching. + skip_build: Skip build and upload steps. + pr_number: Optional PR number to checkout before building. + target_branch: Target branch for upload. + upload: Whether to upload after build. + + Returns: + ReleaseDispatchResult with dispatched/failed/skipped lists. 
+ """ + if not KERNEL_NAME_RE.match(kernel_name): + print(f"Invalid kernel name: {kernel_name!r}", file=sys.stderr) + result = ReleaseDispatchResult(kernel_name=kernel_name) + for wf in RELEASE_WORKFLOWS: + result.failed.append((wf, 0)) + return result + + result = ReleaseDispatchResult(kernel_name=kernel_name) + + backends = read_backends(kernel_name) or [] + workflows = select_workflows(kernel_name) + + # Invert BACKEND_TO_WORKFLOWS so we can scope backends per workflow. + workflow_to_backends: dict[str, set[str]] = {} + for backend, wfs in BACKEND_TO_WORKFLOWS.items(): + for wf in wfs: + workflow_to_backends.setdefault(wf, set()).add(backend) + + skipped_workflows = set(RELEASE_WORKFLOWS) - workflows + result.skipped = sorted(skipped_workflows) + + api_base = f"https://api.github.com/repos/{repo}" + for workflow in workflows: + # Only pass backends that this workflow can actually build. + scoped = sorted(b for b in backends if b in workflow_to_backends.get(workflow, set())) + backends_csv = ",".join(scoped) + + dispatch_key = ( + f"{dispatch_key_prefix}{kernel_name}-{workflow}-{uuid.uuid4().hex[:12]}" + ) + if dry_run: + inputs = { + "kernel_name": kernel_name, + "dispatch_key": dispatch_key, + "mode": mode, + "backends": backends_csv, + "repo_prefix": repo_prefix, + } + if skip_build: + inputs["skip_build"] = "true" + if pr_number: + inputs["pr_number"] = pr_number + if target_branch: + inputs["target_branch"] = target_branch + if not upload: + inputs["upload"] = "false" + dispatch_body = {"ref": ref, "inputs": inputs} + print(f"\n[dry-run] {workflow}:") + print(json.dumps(dispatch_body, indent=2)) + result.dispatched.append((workflow, dispatch_key)) + continue + dispatch_url = f"{api_base}/actions/workflows/{workflow}/dispatches" + inputs = { + "kernel_name": kernel_name, + "dispatch_key": dispatch_key, + "mode": mode, + "backends": backends_csv, + "repo_prefix": repo_prefix, + } + if skip_build: + inputs["skip_build"] = "true" + if pr_number: + 
inputs["pr_number"] = pr_number + if target_branch: + inputs["target_branch"] = target_branch + if not upload: + inputs["upload"] = "false" + dispatch_body = { + "ref": ref, + "inputs": inputs, + } + try: + print(f"Dispatching {workflow} for kernel `{kernel_name}` on ref `{ref}`") + github_api_request(dispatch_url, token, method="POST", data=dispatch_body) + result.dispatched.append((workflow, dispatch_key)) + except urllib.error.HTTPError as e: + err_text = e.read().decode("utf-8", errors="replace") + print(f"Failed to dispatch {workflow} (HTTP {e.code}): {err_text}", file=sys.stderr) + result.failed.append((workflow, e.code)) + + return result + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Dispatch release workflows for a kernel" + ) + parser.add_argument("kernel_name", help="Kernel directory name") + parser.add_argument( + "--ref", default="main", help="Git ref to dispatch on (default: main)" + ) + parser.add_argument( + "--mode", default="release", choices=["pr", "release"], + help="Build mode: pr (CI only) or release (build + upload) (default: release)", + ) + parser.add_argument( + "--repo", default=None, help="GitHub repo in owner/repo format (default: auto-detect)" + ) + parser.add_argument( + "--skip-build", action="store_true", + help="Skip build and upload steps (for testing workflow plumbing)", + ) + parser.add_argument( + "--pr-number", default="", + help="PR number to checkout before building", + ) + parser.add_argument( + "--target-branch", default="", + help="Target branch for upload", + ) + parser.add_argument( + "--no-upload", action="store_true", + help="Build only, do not upload", + ) + parser.add_argument( + "--dry-run", action="store_true", + help="Print the dispatch payloads without actually dispatching", + ) + parser.add_argument( + "--repo-prefix", default="kernels-community", + help="Hub org prefix for uploads (default: kernels-community)", + ) + args = parser.parse_args() + + common = dict( + mode=args.mode, + 
repo_prefix=args.repo_prefix, + dry_run=args.dry_run, + skip_build=args.skip_build, + pr_number=args.pr_number, + target_branch=args.target_branch, + upload=not args.no_upload, + ) + + if args.dry_run: + result = dispatch_release( + args.kernel_name, + token="", + repo=args.repo or "", + ref=args.ref, + **common, + ) + else: + token = get_token() + if not token: + print( + "Error: No GitHub token found. Set GITHUB_TOKEN or run `gh auth login`.", + file=sys.stderr, + ) + return 1 + + repo = args.repo or get_repo() + if not repo: + print( + "Error: Cannot determine repository. Set GITHUB_REPOSITORY or use --repo.", + file=sys.stderr, + ) + return 1 + + result = dispatch_release( + args.kernel_name, + token=token, + repo=repo, + ref=args.ref, + **common, + ) + + if result.dispatched: + print(f"\nDispatched ({len(result.dispatched)}):") + for wf, dk in result.dispatched: + print(f" - {wf} (key: {dk})") + if result.skipped: + print(f"\nSkipped ({len(result.skipped)}):") + for wf in result.skipped: + print(f" - {wf}") + if result.failed: + print(f"\nFailed ({len(result.failed)}):") + for wf, code in result.failed: + print(f" - {wf} (HTTP {code})") + return 1 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.github/scripts/pr_comment_kernel_bot.py b/.github/scripts/pr_comment_kernel_bot.py index 2f9772cb..daf32280 100644 --- a/.github/scripts/pr_comment_kernel_bot.py +++ b/.github/scripts/pr_comment_kernel_bot.py @@ -10,29 +10,25 @@ import urllib.request import uuid +from dispatch import RELEASE_WORKFLOWS, dispatch_release as do_dispatch_release + KERNEL_RE = re.compile(r"^[A-Za-z0-9_-]+$") BRANCH_RE = re.compile(r"^[A-Za-z0-9._/-]+$") COMMENT_CHARS_RE = re.compile(r"^/kernel-bot[ A-Za-z0-9_./-]*$") COMMAND_PERMISSIONS = { "build": {"admin", "write"}, - "build-and-upload": {"admin"}, + "build-and-stage": {"admin", "write"}, "merge-and-upload": {"admin"}, "release": {"admin"}, } -FORK_BLOCKED_COMMANDS = {"build", "build-and-upload", 
"release"} -RELEASE_WORKFLOWS = [ - "build-release.yaml", - "build-release-mac.yaml", - "build-release-windows.yaml", -] +FORK_BLOCKED_COMMANDS = {"build", "build-and-stage", "release"} MAX_COMMENT_LENGTH = 1024 -DISPATCH_WORKFLOW = "manual-build-upload.yaml" RUN_LOOKUP_ATTEMPTS = 10 RUN_LOOKUP_SLEEP_SECONDS = 2 RUN_LOOKUP_PAGE_SIZE = 100 COMMAND_USAGE = ( - "Invalid command. Use `/kernel-bot " + "Invalid command. Use `/kernel-bot " " [kernel2 ...] [--branch ]`." ) @@ -225,7 +221,7 @@ def resolve_dispatch_run_urls( return if workflows is None: - workflows = [DISPATCH_WORKFLOW] + workflows = RELEASE_WORKFLOWS for attempt in range(RUN_LOOKUP_ATTEMPTS): for workflow in workflows: @@ -298,7 +294,7 @@ def comment_base_lines( ] if pr_head_sha: lines.append(f"PR head SHA: `{pr_head_sha}`") - lines.append(f"Workflow: `{DISPATCH_WORKFLOW}`") + lines.append(f"Workflows: `{', '.join(RELEASE_WORKFLOWS)}`") return lines @@ -379,7 +375,7 @@ def parse_command(comment: str) -> ParsedCommand: if not args: return ParsedCommand( - error="No kernels provided. Use `/kernel-bot [kernel2 ...]`.", + error="No kernels provided. 
Use `/kernel-bot [kernel2 ...]`.", ) kernels = [] @@ -546,31 +542,30 @@ def main(): ) return 0 - dispatch_url = f"{api_base}/actions/workflows/{DISPATCH_WORKFLOW}/dispatches" if command == "build": target_branch = requested_branch or f"pr-{issue_number}" dispatch_pr_number = str(issue_number) - upload_flag = "false" - allow_main_dispatch = "false" - elif command == "build-and-upload": + dispatch_upload = False + dispatch_repo_prefix = "kernels-community" + elif command == "build-and-stage": target_branch = requested_branch or f"pr-{issue_number}" dispatch_pr_number = str(issue_number) - upload_flag = "true" - allow_main_dispatch = "false" + dispatch_upload = True + dispatch_repo_prefix = "kernels-staging" elif command == "release": - target_branch = requested_branch or default_branch + target_branch = requested_branch or "" dispatch_pr_number = "" - upload_flag = "true" - allow_main_dispatch = "true" - else: + dispatch_upload = True + dispatch_repo_prefix = "kernels-community" + else: # merge-and-upload target_branch = requested_branch or "main" dispatch_pr_number = "" - upload_flag = "true" - allow_main_dispatch = "true" + dispatch_upload = True + dispatch_repo_prefix = "kernels-community" mode_text = { "build": "build only", - "build-and-upload": "build and upload", + "build-and-stage": "build and stage", "merge-and-upload": "merge, build and upload", "release": "release (linux + mac + windows)", }[command] @@ -661,56 +656,25 @@ def main(): dispatches = [] failed = [] - if command == "release": - for kernel_name in kernels: - for workflow in RELEASE_WORKFLOWS: - release_dispatch_url = f"{api_base}/actions/workflows/{workflow}/dispatches" - dispatch_key = make_dispatch_key(issue_number, f"{kernel_name}-{workflow}") - dispatch_body = { - "ref": default_branch, - "inputs": { - "kernel_name": kernel_name, - "dispatch_key": dispatch_key, - }, - } - try: - print( - f"Dispatching {workflow} for kernel `{kernel_name}`" - ) - github_api_request(release_dispatch_url, 
token, method="POST", data=dispatch_body) - dispatches.append( - DispatchResult(kernel_name=f"{kernel_name} ({workflow})", dispatch_key=dispatch_key) - ) - except urllib.error.HTTPError as e: - err_text = e.read().decode("utf-8", errors="replace") - print(err_text, file=sys.stderr) - failed.append((f"{kernel_name} ({workflow})", e.code)) - else: - for kernel_name in kernels: - dispatch_key = make_dispatch_key(issue_number, kernel_name) - dispatch_body = { - "ref": default_branch, - "inputs": { - "kernel_name": kernel_name, - "pr_number": dispatch_pr_number, - "target_branch": target_branch, - "upload": upload_flag, - "allow_main_dispatch": allow_main_dispatch, - "dispatch_key": dispatch_key, - }, - } - try: - print( - f"Dispatching workflow for command `{command}`, kernel `{kernel_name}`, branch `{target_branch}`" - ) - github_api_request(dispatch_url, token, method="POST", data=dispatch_body) - dispatches.append( - DispatchResult(kernel_name=kernel_name, dispatch_key=dispatch_key) - ) - except urllib.error.HTTPError as e: - err_text = e.read().decode("utf-8", errors="replace") - print(err_text, file=sys.stderr) - failed.append((kernel_name, e.code)) + for kernel_name in kernels: + release_result = do_dispatch_release( + kernel_name, + token=token, + repo=repository, + ref=default_branch, + mode="release", + repo_prefix=dispatch_repo_prefix, + dispatch_key_prefix=f"pr{issue_number}-", + pr_number=dispatch_pr_number, + target_branch=target_branch, + upload=dispatch_upload, + ) + for wf, dk in release_result.dispatched: + dispatches.append( + DispatchResult(kernel_name=f"{kernel_name} ({wf})", dispatch_key=dk) + ) + for wf, code in release_result.failed: + failed.append((f"{kernel_name} ({wf})", code)) resolve_dispatch_run_urls( api_base, @@ -718,7 +682,7 @@ def main(): repository, default_branch, dispatches, - workflows=RELEASE_WORKFLOWS if command == "release" else None, + workflows=RELEASE_WORKFLOWS, ) comment_written = try_send_issue_comment( diff --git 
name: Build (macOS)
run-name: >-
  Build (macOS) / ${{ inputs.kernel_name || '' }} / mode=${{ inputs.mode || 'release' }} / request=${{ inputs.dispatch_key || '' }}
on:
  workflow_dispatch:
    inputs:
      kernel_name:
        description: "Kernel directory name to build"
        required: true
        type: string
      dispatch_key:
        description: "Unique key for matching this run back to a bot dispatch"
        required: false
        type: string
      mode:
        description: "Build mode: pr (CI only) or release (build + upload)"
        required: false
        type: string
        default: "release"
      # dispatch.py sends `backends` to every build workflow. GitHub rejects
      # a dispatch that carries an undeclared input (HTTP 422), so it must be
      # declared here even though the macOS build does not consume it.
      backends:
        description: "Comma-separated backends scoped to this workflow (informational on macOS)"
        required: false
        type: string
        default: ""
      skip_build:
        description: "Skip build and upload steps (for testing workflow plumbing)"
        required: false
        type: boolean
        default: false
      pr_number:
        description: "Optional PR number to checkout before building"
        required: false
        type: string
        default: ""
      target_branch:
        description: "Target branch for upload (default: repo default)"
        required: false
        type: string
        default: ""
      upload:
        description: "Whether to upload after build"
        required: false
        type: boolean
        default: true
      repo_prefix:
        description: "Hub org prefix for uploads (e.g. kernels-community, kernels-staging)"
        required: false
        type: string
        default: "kernels-community"
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Build the Metal kernel; PR mode builds .#ci, release builds the full bundle and uploads.
  #
  # Security note: workflow inputs are never interpolated into `run:` scripts
  # with ${{ }}. They are passed through `env:` and referenced as quoted shell
  # variables, so a crafted input cannot inject shell code.
  build-kernel:
    runs-on: macos-26
    steps:
      # Guard against injection via pr_number input.
      - name: Validate PR number
        if: inputs.pr_number != ''
        env:
          PR_NUMBER: ${{ inputs.pr_number }}
        run: |
          case "$PR_NUMBER" in
            ''|*[!0-9]*)
              echo "Invalid pr_number input: must be numeric"
              exit 1
              ;;
          esac

      # When building for a PR, check out the PR head; otherwise use default branch.
      - name: Checkout PR branch
        if: inputs.pr_number != ''
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          ref: refs/pull/${{ inputs.pr_number }}/head
          fetch-depth: 0
      - name: Checkout default branch
        if: inputs.pr_number == ''
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # Nix toolchain + binary cache setup.
      - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22
        with:
          extra-conf: |
            max-jobs = 1
            cores = 3
            sandbox = relaxed
      - name: Nix info
        run: nix-shell -p nix-info --run "nix-info -m"
      - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16
        with:
          name: huggingface
        env:
          USER: runner

      # Ensure the kernel name is well-formed and that its directory exists
      # with the required config files.
      - name: Validate kernel directory
        id: validate
        env:
          KERNEL: ${{ inputs.kernel_name }}
        run: |
          # Same character set dispatch.py enforces (KERNEL_NAME_RE); this
          # also rules out path traversal such as "../x".
          case "$KERNEL" in
            ''|*[!A-Za-z0-9_-]*)
              echo "Invalid kernel_name input: only letters, digits, '_' and '-' are allowed"
              exit 1
              ;;
          esac
          if [ -d "$KERNEL" ] && [ -f "$KERNEL/flake.nix" ] && [ -f "$KERNEL/build.toml" ]; then
            echo "kernel=$KERNEL" >> "$GITHUB_OUTPUT"
            echo "skip=false" >> "$GITHUB_OUTPUT"
          else
            echo "skip=true" >> "$GITHUB_OUTPUT"
          fi

      # PR-only: verify the kernel has a Hub repo-id before burning CI time.
      - name: Check that repo-id is present
        if: steps.validate.outputs.skip == 'false' && inputs.mode == 'pr'
        env:
          KERNEL: ${{ steps.validate.outputs.kernel }}
        run: |
          if ! nix run nixpkgs#dasel -- -i toml '$root.general.hub.get("repo-id")' < "$KERNEL/build.toml" &> /dev/null ; then
            echo "Mandatory repo-id is missing in $KERNEL/build.toml"
            exit 1
          fi

      # Metal toolchain is required for macOS GPU kernel compilation.
      - name: Install Metal toolchain
        if: steps.validate.outputs.skip == 'false' && inputs.skip_build != true
        run: xcodebuild -downloadComponent MetalToolchain

      # PR mode builds the CI target (.#ci); release builds the full bundle.
      - name: Build kernel
        if: steps.validate.outputs.skip == 'false' && inputs.skip_build != true
        env:
          KERNEL: ${{ steps.validate.outputs.kernel }}
          BUILD_MODE: ${{ inputs.mode }}
        run: |
          if [ "$BUILD_MODE" = "pr" ]; then
            ( cd "$KERNEL" && nix build -L .#ci && ls -l result/ )
          else
            ( cd "$KERNEL" && nix build -L )
          fi

      # Upload built artifacts to both model and kernel Hub repos.
      - name: Upload kernel to Hub
        if: steps.validate.outputs.skip == 'false' && inputs.skip_build != true && inputs.mode != 'pr' && inputs.upload != false
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KERNEL: ${{ steps.validate.outputs.kernel }}
          TARGET_BRANCH: ${{ inputs.target_branch }}
          REPO_PREFIX: ${{ inputs.repo_prefix }}
        run: |
          cd "$KERNEL"
          # An array keeps the branch name as a single argument even if it
          # contains characters the shell would otherwise split or expand.
          BRANCH_ARGS=()
          if [ -n "$TARGET_BRANCH" ]; then
            BRANCH_ARGS=(--branch "$TARGET_BRANCH")
          fi
          nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "$REPO_PREFIX/$KERNEL" "${BRANCH_ARGS[@]}"
          nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "$REPO_PREFIX/$KERNEL" "${BRANCH_ARGS[@]}"

      # v1 kernels without an explicit branch override also get uploaded to main.
      - name: Upload v1 kernels to main
        if: steps.validate.outputs.skip == 'false' && inputs.skip_build != true && inputs.mode != 'pr' && inputs.upload != false && inputs.target_branch == ''
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KERNEL: ${{ steps.validate.outputs.kernel }}
          REPO_PREFIX: ${{ inputs.repo_prefix }}
        run: |
          cd "$KERNEL"

          if [ -f "build.toml" ]; then
            VERSION=$(grep -E '^\s*version\s*=\s*1\s*$' build.toml || true)
            BRANCH=$(grep -E '^\s*branch\s*=' build.toml || true)
            if [ -n "$VERSION" ] && [ -z "$BRANCH" ]; then
              nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "$REPO_PREFIX/$KERNEL" --branch main
              nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "$REPO_PREFIX/$KERNEL" --branch main
            fi
          fi
+ - name: Validate kernel directory + id: validate + env: + PR_TITLE: ${{ github.event.pull_request.title }} + run: | + if KERNEL=$(python3 .github/workflows/validate-kernel-pr.py "pr"); then + echo "kernel=$KERNEL" >> $GITHUB_OUTPUT + echo "skip=false" >> $GITHUB_OUTPUT + else + echo "skip=true" >> $GITHUB_OUTPUT + fi + + # Fan out to per-platform build workflows (linux, mac, windows) based + # on the backends declared in the kernel's build.toml. + - name: Dispatch build workflows + if: steps.validate.outputs.skip == 'false' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REPOSITORY: ${{ github.repository }} + run: | + KERNEL="${{ steps.validate.outputs.kernel }}" + python3 .github/scripts/dispatch.py "$KERNEL" \ + --mode pr \ + --pr-number "${{ github.event.pull_request.number }}" \ + --no-upload diff --git a/.github/workflows/build-pr-mac.yaml b/.github/workflows/build-pr-mac.yaml deleted file mode 100644 index efbbcb31..00000000 --- a/.github/workflows/build-pr-mac.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Build PR (macOS) -on: - pull_request: - paths-ignore: - - "**/README.md" -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build-kernel: - runs-on: macos-26 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = 1 - cores = 3 - sandbox = relaxed - - name: Nix info - run: nix-shell -p nix-info --run "nix-info -m" - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Validate kernel directory - id: validate - env: - PR_TITLE: ${{ github.event.pull_request.title }} - run: | - if KERNEL=$(python3 .github/workflows/validate-kernel-pr.py "pr"); then - echo "kernel=$KERNEL" >> $GITHUB_OUTPUT - echo "skip=false" >> $GITHUB_OUTPUT - else 
- echo "skip=true" >> $GITHUB_OUTPUT - fi - - name: "Check that repo-id is present" - if: steps.validate.outputs.skip == 'false' - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - if ! cat $KERNEL/build.toml | nix run nixpkgs#dasel -- -i toml '$root.general.hub.get("repo-id")' &> /dev/null ; then - echo "Mandatory repo-id is missing in $KERNEL/build.toml" - exit 1 - fi - - name: Install Metal toolchain - if: steps.validate.outputs.skip == 'false' - run: xcodebuild -downloadComponent MetalToolchain - - name: Build kernel - if: steps.validate.outputs.skip == 'false' - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - ( cd "$KERNEL" && nix build -L .#ci && ls -l result/ ) diff --git a/.github/workflows/build-pr-windows.yaml b/.github/workflows/build-pr-windows.yaml deleted file mode 100644 index 8a56d801..00000000 --- a/.github/workflows/build-pr-windows.yaml +++ /dev/null @@ -1,228 +0,0 @@ -name: Build PR (Windows) -on: - pull_request: - types: [opened, synchronize] - paths-ignore: - - "**/README.md" - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build-kernel: - strategy: - matrix: - os: [windows-2022] - python: [3.12] - platform: [ - # CUDA platforms - # { backend: 'cuda', torch_version: '2.9.1', cuda: '12.6.3', wheel: '126' }, - { - backend: "cuda", - torch_version: "2.9.1", - cuda: "12.8.1", - wheel: "128", - }, - # { backend: 'cuda', torch_version: '2.9.1', cuda: '13.0.1', wheel: '130' }, - # Intel XPU platform - { - backend: "xpu", - torch_version: "2.10.0", - oneapi: "2025.3.1", - oneapi_url: "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/076e961b-2c29-48a8-9203-c96f00e7051b/intel-oneapi-base-toolkit-2025.3.1.35_offline.exe", - }, - ] - - runs-on: windows-2022 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Validate kernel directory - id: validate - shell: pwsh - env: - PR_TITLE: ${{ 
github.event.pull_request.title }} - run: | - $ErrorActionPreference = "Continue" - $KERNEL = python .github/workflows/validate-kernel-pr.py "pr" 2>&1 - if ($LASTEXITCODE -eq 0) { - echo "kernel=$KERNEL" >> $env:GITHUB_OUTPUT - echo "skip=false" >> $env:GITHUB_OUTPUT - } else { - echo "skip=true" >> $env:GITHUB_OUTPUT - } - exit 0 - - - name: Check backend support - id: check-backend - if: steps.validate.outputs.skip == 'false' - shell: pwsh - run: | - $KERNEL = "${{ steps.validate.outputs.kernel }}" - $BACKEND = "${{ matrix.platform.backend }}" - $buildToml = Get-Content "${KERNEL}/build.toml" -Raw - - # Check CUDA minimum version requirement from build.toml [general.cuda] minver - if ($BACKEND -eq "cuda") { - $CUDA_VERSION = "${{ matrix.platform.cuda }}" - if ($buildToml -match 'minver\s*=\s*"([^"]+)"') { - $minver = $matches[1] - # Compare versions: strip patch from CUDA version (e.g. 12.8.1 -> 12.8) - $cudaMajorMinor = ($CUDA_VERSION -split '\.')[0..1] -join '.' - if ([version]$cudaMajorMinor -lt [version]$minver) { - Write-Output "Kernel '$KERNEL' requires CUDA >= $minver but matrix provides $CUDA_VERSION - skipping" - echo "supported=false" >> $env:GITHUB_OUTPUT - exit 0 - } - } - } - - # XPU block list for Windows - these kernels are not compatible with Windows XPU builds - $xpuBlockList = @("megablocks", "flash-attn2") - - # Check if XPU backend and kernel is in block list - if ($BACKEND -eq "xpu" -and $KERNEL -in $xpuBlockList) { - Write-Output "Kernel '$KERNEL' is not compatible with Windows XPU builds - skipping" - Write-Output "Blocked XPU kernels: $($xpuBlockList -join ', ')" - echo "supported=false" >> $env:GITHUB_OUTPUT - exit 0 - } - - # Kernels that require oneAPI setup for XPU builds - $xpuNeedsOneApi = @("relu", "rotary", "rmsnorm") - if ($BACKEND -eq "xpu" -and $KERNEL -in $xpuNeedsOneApi) { - echo "needs_oneapi=true" >> $env:GITHUB_OUTPUT - } else { - echo "needs_oneapi=false" >> $env:GITHUB_OUTPUT - } - - # Check two formats: - # 1. 
[kernel.*] section with: backend = "xpu" - # 2. [general] section with: backends = ["cuda", "xpu", ...] (can be multi-line) - $kernelPattern = "backend\s*=\s*[`"']${BACKEND}[`"']" - $backendsPattern = "(?s)backends\s*=\s*\[.*?[`"']${BACKEND}[`"'].*?\]" - - if (($buildToml -match $kernelPattern) -or ($buildToml -match $backendsPattern)) { - Write-Output "Kernel '$KERNEL' supports backend '$BACKEND'" - echo "supported=true" >> $env:GITHUB_OUTPUT - } else { - Write-Output "Kernel '$KERNEL' does NOT support backend '$BACKEND' - skipping build" - echo "supported=false" >> $env:GITHUB_OUTPUT - } - - - name: Kernel Info - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' - shell: pwsh - run: | - $KERNEL = "${{ steps.validate.outputs.kernel }}" - Write-Output "Building Kernel: $KERNEL" - - - name: Kernel extract required builder version - id: extract-builder-version - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' - shell: pwsh - run: | - $KERNEL = "${{ steps.validate.outputs.kernel }}" - $lock = Get-Content "${KERNEL}/flake.lock" | ConvertFrom-Json - $revision = $lock.nodes."kernel-builder".locked.rev - Write-Output "Building Kernel with revision: $revision" - echo "revision=$revision" >> $env:GITHUB_OUTPUT - - - uses: Jimver/cuda-toolkit@b6fc3a9f3f15256d9d94ffe1254f9c5a2565cde6 # v0.2.30 - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'cuda' - id: setup-cuda-toolkit - with: - cuda: ${{ matrix.platform.cuda }} - - - name: Setup Intel oneAPI - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'xpu' && steps.check-backend.outputs.needs_oneapi == 'true' - shell: pwsh - run: | - & "$env:GITHUB_WORKSPACE\.github\scripts\windows\install-oneapi.ps1" -OneApiVersion "${{ matrix.platform.oneapi }}" -OneApiUrl "${{ matrix.platform.oneapi_url 
}}" - - - name: Setup Python - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: ${{ matrix.python }} - - - name: Install PyTorch (CUDA) - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'cuda' - run: pip install torch --index-url https://download.pytorch.org/whl/cu${{ matrix.platform.wheel }} - - - name: Install PyTorch (XPU) - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'xpu' - run: pip3 install torch==${{ matrix.platform.torch_version }} --index-url https://download.pytorch.org/whl/xpu - - - name: Checkout kernels - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' - id: checkout-kernels - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - repository: huggingface/kernels - ref: "${{ steps.extract-builder-version.outputs.revision }}" - path: kernels - - - name: Cache Rust build - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - path: | - kernels/kernel-builder/target - ~/.cargo/registry - ~/.cargo/git - key: ${{ runner.os }}-rust-debug-${{ hashFiles('kernels/kernel-builder/Cargo.lock') }} - restore-keys: | - ${{ runner.os }}-rust-debug- - - - name: Build kernel-builder - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' - working-directory: kernels\kernel-builder - shell: pwsh - run: cargo build - - - name: Build kernel - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' - shell: pwsh - run: | - $KERNEL = "${{ steps.validate.outputs.kernel }}" - $NEEDS_ONEAPI = "${{ 
steps.check-backend.outputs.needs_oneapi }}" - - # Initialize oneAPI environment for XPU builds that require it - if ("${{ matrix.platform.backend }}" -eq "xpu" -and $NEEDS_ONEAPI -eq "true") { - $setvarsPath = "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" - if (Test-Path $setvarsPath) { - Write-Host "Initializing Intel oneAPI environment for XPU build..." -ForegroundColor Cyan - - # Create a temporary file to capture environment variables - $tempFile = [System.IO.Path]::GetTempFileName() - - # Run setvars.bat and capture all environment variables - cmd.exe /c "`"$setvarsPath`" && set > `"$tempFile`"" - - # Parse and set each environment variable in PowerShell - Get-Content $tempFile | ForEach-Object { - if ($_ -match "^(.*?)=(.*)$") { - $varName = $matches[1] - $varValue = $matches[2] - [System.Environment]::SetEnvironmentVariable($varName, $varValue, [System.EnvironmentVariableTarget]::Process) - } - } - - Remove-Item $tempFile -ErrorAction SilentlyContinue - - # Verify Intel compiler is now in PATH - $icxPath = (Get-Command icx-cl -ErrorAction SilentlyContinue).Path - if ($icxPath) { - Write-Host "Intel oneAPI environment initialized successfully" -ForegroundColor Green - Write-Host "Intel C++ Compiler found at: $icxPath" -ForegroundColor Green - } else { - Write-Error "Intel compiler (icx-cl) still not found in PATH after initialization" - exit 1 - } - } else { - Write-Error "setvars.bat not found at $setvarsPath" - exit 1 - } - } - & "$env:GITHUB_WORKSPACE\kernels\nix-builder\scripts\windows\builder.ps1" -Backend ${{ matrix.platform.backend }} -SourceFolder "$KERNEL" -BuildConfig Release -Build diff --git a/.github/workflows/build-pr.yaml b/.github/workflows/build-pr.yaml deleted file mode 100644 index f85ccc41..00000000 --- a/.github/workflows/build-pr.yaml +++ /dev/null @@ -1,164 +0,0 @@ -name: Build PR -on: - pull_request: - paths-ignore: - - "**/README.md" -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - 
cancel-in-progress: true - -jobs: - setup: - runs-on: ubuntu-latest - outputs: - skip: ${{ steps.validate.outputs.skip }} - kernel: ${{ steps.validate.outputs.kernel }} - matrix: ${{ steps.matrix.outputs.matrix }} - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = 2 - cores = 4 - sandbox-fallback = false - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Validate kernel directory - id: validate - env: - PR_TITLE: ${{ github.event.pull_request.title }} - run: | - if KERNEL=$(python3 .github/workflows/validate-kernel-pr.py "pr"); then - echo "kernel=$KERNEL" >> $GITHUB_OUTPUT - echo "skip=false" >> $GITHUB_OUTPUT - else - echo "skip=true" >> $GITHUB_OUTPUT - fi - - name: Check that repo-id is present - if: steps.validate.outputs.skip == 'false' - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - if ! 
cat $KERNEL/build.toml | nix run nixpkgs#dasel -- -i toml '$root.general.hub.get("repo-id")' &> /dev/null ; then - echo "Mandatory repo-id is missing in $KERNEL/build.toml" - exit 1 - fi - - name: Generate build matrix - if: steps.validate.outputs.skip == 'false' - id: matrix - env: - KERNEL: ${{ steps.validate.outputs.kernel }} - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - X86_BACKENDS=$(cd "$KERNEL" && nix eval .#backendCi --apply builtins.attrNames --json --system x86_64-linux) - ARM_BACKENDS=$(cd "$KERNEL" && nix eval .#backendCi --apply builtins.attrNames --json --system aarch64-linux) - MATRIX=$(python3 .github/workflows/generate-build-matrix.py "$X86_BACKENDS" "$ARM_BACKENDS") - echo "matrix=$MATRIX" >> $GITHUB_OUTPUT - - build-kernel: - needs: setup - if: needs.setup.outputs.skip == 'false' - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.setup.outputs.matrix) }} - runs-on: - group: ${{ matrix.runner }} - timeout-minutes: 600 # 10h - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = ${{ matrix.max_jobs }} - cores = ${{ matrix.cores }} - sandbox-fallback = false - - name: Nix info - run: nix-shell -p nix-info --run "nix-info -m" - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Build kernel - run: | - KERNEL="${{ needs.setup.outputs.kernel }}" - ( cd "$KERNEL" && nix build -L .#backendCi.${{ matrix.backend }} && ls -l result/ ) - - build-ci-test: - needs: setup - if: needs.setup.outputs.skip == 'false' - runs-on: - group: aws-highmemory-32-plus-nix - timeout-minutes: 600 # 10h - outputs: - ci-test-path: ${{ steps.export-closure.outputs.ci-test-path }} - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: 
DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = 2 - cores = 12 - sandbox-fallback = false - - name: Nix info - run: nix-shell -p nix-info --run "nix-info -m" - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Build ci-test - run: | - KERNEL="${{ needs.setup.outputs.kernel }}" - ( cd "$KERNEL" && nix build -L .#ci-test ) - - name: Export ci-test closure - id: export-closure - run: | - KERNEL="${{ needs.setup.outputs.kernel }}" - CI_TEST_PATH=$(readlink -f "$KERNEL/result") - echo "ci-test-path=$CI_TEST_PATH" >> $GITHUB_OUTPUT - nix-store --export $(nix-store -qR "$CI_TEST_PATH") | nix run nixpkgs#zstd -- -T0 > ci-test-closure.nar.zst - - name: Upload ci-test closure - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: ci-test-closure - path: ci-test-closure.nar.zst - retention-days: 1 - - test-kernel-gpu: - needs: [setup, build-kernel, build-ci-test] - if: needs.setup.outputs.skip == 'false' - runs-on: - group: aws-g6-12xlarge-plus - steps: - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = 2 - cores = 12 - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Setup Nix driver location - run: | - sudo mkdir -p /run/opengl-driver/lib - sudo find /usr/lib64 \ - -name 'libcuda.so*' \ - -exec ln -s {} /run/opengl-driver/lib/ \; - find /run/opengl-driver - - name: Download ci-test closure - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - name: ci-test-closure - - name: Import ci-test closure - run: | - nix run nixpkgs#zstd -- -d ci-test-closure.nar.zst -c | nix-store --import - - name: Run GPU tests - run: | - CI_TEST_PATH="${{ 
needs.build-ci-test.outputs.ci-test-path }}" - "$CI_TEST_PATH/bin/ci-test"
diff --git a/.github/workflows/build-release-dispatch.yaml b/.github/workflows/build-release-dispatch.yaml
new file mode 100644
index 00000000..2d16b624
--- /dev/null
+++ b/.github/workflows/build-release-dispatch.yaml
@@ -0,0 +1,80 @@
+name: Build Release (Dispatch)
+run-name: >-
+  Build Release (Dispatch) / ${{ inputs.kernel_name || github.event.pull_request.title || '' }} / request=${{ inputs.dispatch_key || '' }}
+on:
+  pull_request:
+    types: [closed]
+  workflow_dispatch:
+    inputs:
+      kernel_name:
+        description: "Kernel directory name to build"
+        required: true
+        type: string
+      dispatch_key:
+        description: "Unique key for matching this run back to a bot dispatch"
+        required: false
+        type: string
+
+# Declaring any permission sets every unlisted scope to "none", so the read
+# access used by actions/checkout is granted explicitly next to actions: write.
+permissions:
+  actions: write # dispatch.py starts the build workflows through the API
+  contents: read # actions/checkout
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # Resolve the kernel to release (workflow_dispatch input, or the result of
+  # validate-kernel-pr.py for a merged PR) and hand it to dispatch.py.
+  setup:
+    if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true
+    runs-on: ubuntu-latest
+    outputs:
+      skip: ${{ steps.validate.outputs.skip }}
+      kernel: ${{ steps.validate.outputs.kernel }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      # Event data and inputs reach the shell only through env vars; expanding
+      # workflow expressions inside the script text would let a crafted value
+      # run as shell code.
+      - name: Validate kernel directory
+        id: validate
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          KERNEL_INPUT: ${{ inputs.kernel_name }}
+          PR_TITLE: ${{ github.event.pull_request.title }}
+        run: |
+          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
+            KERNEL="$KERNEL_INPUT"
+            if [ -d "$KERNEL" ] && [ -f "$KERNEL/flake.nix" ] && [ -f "$KERNEL/build.toml" ]; then
+              echo "kernel=$KERNEL" >> "$GITHUB_OUTPUT"
+              echo "skip=false" >> "$GITHUB_OUTPUT"
+            else
+              echo "skip=true" >> "$GITHUB_OUTPUT"
+            fi
+          else
+            if KERNEL=$(python3 .github/workflows/validate-kernel-pr.py "release"); then
+              echo "kernel=$KERNEL" >> "$GITHUB_OUTPUT"
+              echo "skip=false" >> "$GITHUB_OUTPUT"
+            else
+              echo "skip=true" >> "$GITHUB_OUTPUT"
+            fi
+          fi
+      - name: Print result
+        env:
+          SKIP: ${{ steps.validate.outputs.skip }}
+          KERNEL: ${{ steps.validate.outputs.kernel }}
+        run: |
+          echo "skip=$SKIP"
+          echo "kernel=$KERNEL"
+      - name: Dispatch release workflows
+        if: steps.validate.outputs.skip == 'false'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          KERNEL: ${{ steps.validate.outputs.kernel }}
+          REF: ${{ github.event.repository.default_branch || 'main' }}
+        run: |
+          python3 .github/scripts/dispatch.py "$KERNEL" --ref "$REF" --mode release
diff --git a/.github/workflows/build-release-mac.yaml b/.github/workflows/build-release-mac.yaml deleted file mode 100644 index 20f9b416..00000000 --- a/.github/workflows/build-release-mac.yaml +++ /dev/null @@ -1,93 +0,0 @@ -name: Build Release (macOS) -run-name: >- - Build Release (macOS) / ${{ inputs.kernel_name || '' }} / request=${{ inputs.dispatch_key || '' }} -on: - pull_request: - types: [closed] - workflow_dispatch: - inputs: - kernel_name: - description: "Kernel directory name to build" - required: true - type: string - dispatch_key: - description: "Unique key for matching this run back to a bot dispatch" - required: false - type: string -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build-kernel: - if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true - runs-on: macos-26 - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = 1 - cores = 3 - sandbox = relaxed - - name: Nix info - run: nix-shell -p nix-info --run "nix-info -m" - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Validate kernel directory - id: validate - env: - PR_TITLE: ${{ github.event.pull_request.title }} - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - KERNEL="${{ inputs.kernel_name }}" -
if [ -d "$KERNEL" ] && [ -f "$KERNEL/flake.nix" ] && [ -f "$KERNEL/build.toml" ]; then - echo "kernel=$KERNEL" >> $GITHUB_OUTPUT - echo "skip=false" >> $GITHUB_OUTPUT - else - echo "skip=true" >> $GITHUB_OUTPUT - fi - else - if KERNEL=$(python3 .github/workflows/validate-kernel-pr.py "release"); then - echo "kernel=$KERNEL" >> $GITHUB_OUTPUT - echo "skip=false" >> $GITHUB_OUTPUT - else - echo "skip=true" >> $GITHUB_OUTPUT - fi - fi - - name: Install Metal toolchain - if: steps.validate.outputs.skip == 'false' - run: xcodebuild -downloadComponent MetalToolchain - - name: Build kernel - if: steps.validate.outputs.skip == 'false' - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - ( cd "$KERNEL" && nix build -L ) - - name: Upload kernel to Hub - if: steps.validate.outputs.skip == 'false' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - cd "$KERNEL" - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "kernels-community/$KERNEL" - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "kernels-community/$KERNEL" - - name: Upload v1 kernels to main - if: steps.validate.outputs.skip == 'false' - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - cd "$KERNEL" - - if [ -f "build.toml" ]; then - VERSION=$(grep -E '^\s*version\s*=\s*1\s*$' build.toml || true) - BRANCH=$(grep -E '^\s*branch\s*=' build.toml || true) - if [ -n "$VERSION" ] && [ -z "$BRANCH" ]; then - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "kernels-community/$KERNEL" --branch main - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "kernels-community/$KERNEL" --branch main - fi - fi diff --git a/.github/workflows/build-release.yaml b/.github/workflows/build-release.yaml deleted file mode 100644 index 3a2864d6..00000000 --- 
a/.github/workflows/build-release.yaml +++ /dev/null @@ -1,126 +0,0 @@ -name: Build Release -run-name: >- - Build Release / ${{ inputs.kernel_name || '' }} / request=${{ inputs.dispatch_key || '' }} -on: - pull_request: - types: [closed] - workflow_dispatch: - inputs: - kernel_name: - description: "Kernel directory name to build" - required: true - type: string - dispatch_key: - description: "Unique key for matching this run back to a bot dispatch" - required: false - type: string -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - setup: - if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true - runs-on: ubuntu-latest - outputs: - skip: ${{ steps.validate.outputs.skip }} - kernel: ${{ steps.validate.outputs.kernel }} - matrix: ${{ steps.matrix.outputs.matrix }} - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = 2 - cores = 4 - sandbox-fallback = false - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Validate kernel directory - id: validate - env: - PR_TITLE: ${{ github.event.pull_request.title }} - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - KERNEL="${{ inputs.kernel_name }}" - if [ -d "$KERNEL" ] && [ -f "$KERNEL/flake.nix" ] && [ -f "$KERNEL/build.toml" ]; then - echo "kernel=$KERNEL" >> $GITHUB_OUTPUT - echo "skip=false" >> $GITHUB_OUTPUT - else - echo "skip=true" >> $GITHUB_OUTPUT - fi - else - if KERNEL=$(python3 .github/workflows/validate-kernel-pr.py "release"); then - echo "kernel=$KERNEL" >> $GITHUB_OUTPUT - echo "skip=false" >> $GITHUB_OUTPUT - else - echo "skip=true" >> $GITHUB_OUTPUT - fi - fi - - name: Generate build matrix - if: steps.validate.outputs.skip == 
'false' - id: matrix - env: - KERNEL: ${{ steps.validate.outputs.kernel }} - run: | - KERNEL="${{ steps.validate.outputs.kernel }}" - X86_BACKENDS=$(cd "$KERNEL" && nix eval .#backendBundle --apply builtins.attrNames --json --system x86_64-linux) - ARM_BACKENDS=$(cd "$KERNEL" && nix eval .#backendBundle --apply builtins.attrNames --json --system aarch64-linux) - MATRIX=$(python3 .github/workflows/generate-build-matrix.py "$X86_BACKENDS" "$ARM_BACKENDS") - echo "matrix=$MATRIX" >> $GITHUB_OUTPUT - - build-kernel: - needs: setup - if: needs.setup.outputs.skip == 'false' - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.setup.outputs.matrix) }} - runs-on: - group: ${{ matrix.runner }} - timeout-minutes: 1200 # 20h - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = ${{ matrix.max_jobs }} - cores = ${{ matrix.cores }} - sandbox-fallback = false - - name: Nix info - run: nix-shell -p nix-info --run "nix-info -m" - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - name: Build kernel - run: | - KERNEL="${{ needs.setup.outputs.kernel }}" - ( cd "$KERNEL" && nix build -L .#backendBundle.${{ matrix.backend }} && ls -l result/ ) - - name: Upload kernel to Hub - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - KERNEL="${{ needs.setup.outputs.kernel }}" - cd "$KERNEL" - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "kernels-community/$KERNEL" - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "kernels-community/$KERNEL" - - name: Upload v1 kernels to main - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: | - KERNEL="${{ needs.setup.outputs.kernel }}" - cd "$KERNEL" - - # Check if build.toml exists, has version = 1, and does not specify 
a branch. - if [ -f "build.toml" ]; then - VERSION=$(grep -E '^\s*version\s*=\s*1\s*$' build.toml || true) - BRANCH=$(grep -E '^\s*branch\s*=' build.toml || true) - if [ -n "$VERSION" ] && [ -z "$BRANCH" ]; then - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "kernels-community/$KERNEL" --branch main - nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "kernels-community/$KERNEL" --branch main - fi - fi diff --git a/.github/workflows/build-release-windows.yaml b/.github/workflows/build-windows.yaml similarity index 58% rename from .github/workflows/build-release-windows.yaml rename to .github/workflows/build-windows.yaml index 0b897eb8..31c7c459 100644 --- a/.github/workflows/build-release-windows.yaml +++ b/.github/workflows/build-windows.yaml @@ -1,11 +1,7 @@ -name: Build Release (Windows) +name: Build (Windows) run-name: >- - Build Release (Windows) / ${{ inputs.kernel_name || '' }} / request=${{ inputs.dispatch_key || '' }} + Build (Windows) / ${{ inputs.kernel_name || '' }} / mode=${{ inputs.mode || 'release' }} / request=${{ inputs.dispatch_key || '' }} on: - pull_request: - types: [closed] - paths-ignore: - - "**/README.md" workflow_dispatch: inputs: kernel_name: @@ -16,14 +12,49 @@ on: description: "Unique key for matching this run back to a bot dispatch" required: false type: string + mode: + description: "Build mode: pr (CI only) or release (build + upload)" + required: false + type: string + default: "release" + skip_build: + description: "Skip build and upload steps (for testing workflow plumbing)" + required: false + type: boolean + default: false + pr_number: + description: "Optional PR number to checkout before building" + required: false + type: string + default: "" + target_branch: + description: "Target branch for upload (default: repo default)" + required: false + type: string + default: "" + upload: + description: "Whether to upload after build" + required: 
false + type: boolean + default: true + backends: + description: "Comma-separated list of backends from build.toml (set by dispatch script)" + required: false + type: string + default: "" + repo_prefix: + description: "Hub org prefix for uploads (e.g. kernels-community, kernels-staging)" + required: false + type: string + default: "kernels-community" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: + # Build the kernel for each CUDA/XPU variant; release mode uploads to the Hub. build-kernel: - if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true strategy: matrix: os: [windows-2022] @@ -49,34 +80,47 @@ jobs: runs-on: windows-2022 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + # Guard against injection via pr_number input. The value is handed to PowerShell as an + # environment variable so it is never expanded into the script text before it is checked. + - name: Validate PR number + if: inputs.pr_number != '' + shell: pwsh + env: + PR_NUMBER: ${{ inputs.pr_number }} + run: | + if ($env:PR_NUMBER -notmatch '^[0-9]+\z') { + Write-Error "Invalid pr_number input: must be numeric" + exit 1 + } + # When building for a PR, check out the PR head; otherwise use default branch. + - name: Checkout PR branch + if: inputs.pr_number != '' + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + ref: refs/pull/${{ inputs.pr_number }}/head + fetch-depth: 0 + - name: Checkout default branch + if: inputs.pr_number == '' + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + # Ensure the kernel directory exists and has the required config files.
- name: Validate kernel directory id: validate shell: pwsh - env: - PR_TITLE: ${{ github.event.pull_request.title }} run: | $ErrorActionPreference = "Continue" - if ("${{ github.event_name }}" -eq "workflow_dispatch") { - $KERNEL = "${{ inputs.kernel_name }}" - if ((Test-Path "$KERNEL") -and (Test-Path "$KERNEL/flake.nix") -and (Test-Path "$KERNEL/build.toml")) { - echo "kernel=$KERNEL" >> $env:GITHUB_OUTPUT - echo "skip=false" >> $env:GITHUB_OUTPUT - } else { - echo "skip=true" >> $env:GITHUB_OUTPUT - } + $KERNEL = "${{ inputs.kernel_name }}" + if ((Test-Path "$KERNEL") -and (Test-Path "$KERNEL/flake.nix") -and (Test-Path "$KERNEL/build.toml")) { + echo "kernel=$KERNEL" >> $env:GITHUB_OUTPUT + echo "skip=false" >> $env:GITHUB_OUTPUT } else { - $KERNEL = python .github/workflows/validate-kernel-pr.py "release" 2>&1 - if ($LASTEXITCODE -eq 0) { - echo "kernel=$KERNEL" >> $env:GITHUB_OUTPUT - echo "skip=false" >> $env:GITHUB_OUTPUT - } else { - echo "skip=true" >> $env:GITHUB_OUTPUT - } + echo "skip=true" >> $env:GITHUB_OUTPUT } exit 0 + # Check if the kernel supports this matrix backend and meets CUDA + # minimum version. Backend list is passed by the dispatch script. 
- name: Check backend support id: check-backend if: steps.validate.outputs.skip == 'false' @@ -84,51 +125,45 @@ jobs: run: | $KERNEL = "${{ steps.validate.outputs.kernel }}" $BACKEND = "${{ matrix.platform.backend }}" - $buildToml = Get-Content "${KERNEL}/build.toml" -Raw - - # XPU block list for Windows - these kernels are not compatible with Windows XPU builds - $xpuBlockList = @("megablocks", "flash-attn2") + $backends = "${{ inputs.backends }}" -split "," - # Check if XPU backend and kernel is in block list - if ($BACKEND -eq "xpu" -and $KERNEL -in $xpuBlockList) { - Write-Output "Kernel '$KERNEL' is not compatible with Windows XPU builds - skipping" - Write-Output "Blocked XPU kernels: $($xpuBlockList -join ', ')" + if ($BACKEND -notin $backends) { + Write-Output "Kernel '$KERNEL' does not support backend '$BACKEND' - skipping" echo "supported=false" >> $env:GITHUB_OUTPUT exit 0 } - # Kernels that require oneAPI setup for XPU builds - $xpuNeedsOneApi = @("relu", "rotary", "rmsnorm") - if ($BACKEND -eq "xpu" -and $KERNEL -in $xpuNeedsOneApi) { - echo "needs_oneapi=true" >> $env:GITHUB_OUTPUT - } else { - echo "needs_oneapi=false" >> $env:GITHUB_OUTPUT + # Check CUDA minimum version requirement from build.toml [general.cuda] minver + if ($BACKEND -eq "cuda") { + $CUDA_VERSION = "${{ matrix.platform.cuda }}" + $buildToml = Get-Content "${KERNEL}/build.toml" -Raw + if ($buildToml -match 'minver\s*=\s*"([^"]+)"') { + $minver = $matches[1] + $cudaMajorMinor = ($CUDA_VERSION -split '\.')[0..1] -join '.' + if ([version]$cudaMajorMinor -lt [version]$minver) { + Write-Output "Kernel '$KERNEL' requires CUDA >= $minver but matrix provides $CUDA_VERSION - skipping" + echo "supported=false" >> $env:GITHUB_OUTPUT + exit 0 + } + } } - # Check two formats: - # 1. [kernel.*] section with: backend = "xpu" - # 2. [general] section with: backends = ["cuda", "xpu", ...] 
(can be multi-line) - $kernelPattern = "backend\s*=\s*[`"']${BACKEND}[`"']" - $backendsPattern = "(?s)backends\s*=\s*\[.*?[`"']${BACKEND}[`"'].*?\]" - - if (($buildToml -match $kernelPattern) -or ($buildToml -match $backendsPattern)) { - Write-Output "Kernel '$KERNEL' supports backend '$BACKEND'" - echo "supported=true" >> $env:GITHUB_OUTPUT - } else { - Write-Output "Kernel '$KERNEL' does NOT support backend '$BACKEND' - skipping build" - echo "supported=false" >> $env:GITHUB_OUTPUT - } + Write-Output "Kernel '$KERNEL' supports backend '$BACKEND'" + echo "supported=true" >> $env:GITHUB_OUTPUT + # Log the kernel being built for easier debugging in CI output. - name: Kernel Info - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true shell: pwsh run: | $KERNEL = "${{ steps.validate.outputs.kernel }}" Write-Output "Building Kernel: $KERNEL" + # Read the pinned kernel-builder revision from flake.lock so we build + # with the exact same tooling the kernel was developed against. - name: Kernel extract required builder version id: extract-builder-version - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true shell: pwsh run: | $KERNEL = "${{ steps.validate.outputs.kernel }}" @@ -137,34 +172,38 @@ jobs: Write-Output "Building Kernel with revision: $revision" echo "revision=$revision" >> $env:GITHUB_OUTPUT + # Install the CUDA toolkit for CUDA backend builds. 
- uses: Jimver/cuda-toolkit@b6fc3a9f3f15256d9d94ffe1254f9c5a2565cde6 # v0.2.30 - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'cuda' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true && matrix.platform.backend == 'cuda' id: setup-cuda-toolkit with: cuda: ${{ matrix.platform.cuda }} + # Install Intel oneAPI for XPU builds (provides the icx-cl compiler). - name: Setup Intel oneAPI - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'xpu' && steps.check-backend.outputs.needs_oneapi == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true && matrix.platform.backend == 'xpu' shell: pwsh run: | & "$env:GITHUB_WORKSPACE\.github\scripts\windows\install-oneapi.ps1" -OneApiVersion "${{ matrix.platform.oneapi }}" -OneApiUrl "${{ matrix.platform.oneapi_url }}" + # Python is needed for PyTorch and the build toolchain. - name: Setup Python - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: ${{ matrix.python }} + # Install backend-specific PyTorch wheel. 
- name: Install PyTorch (CUDA) - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'cuda' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true && matrix.platform.backend == 'cuda' run: pip install torch --index-url https://download.pytorch.org/whl/cu${{ matrix.platform.wheel }} - - name: Install PyTorch (XPU) - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && matrix.platform.backend == 'xpu' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true && matrix.platform.backend == 'xpu' run: pip3 install torch==${{ matrix.platform.torch_version }} --index-url https://download.pytorch.org/whl/xpu + # Check out the kernel-builder repo at the pinned revision for building. - name: Checkout kernels - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true id: checkout-kernels uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: @@ -172,8 +211,9 @@ jobs: ref: "${{ steps.extract-builder-version.outputs.revision }}" path: kernels + # Cache Rust compilation artifacts to speed up kernel-builder builds. - name: Cache Rust build - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: path: | @@ -184,22 +224,24 @@ jobs: restore-keys: | ${{ runner.os }}-rust-debug- + # Build the kernel-builder CLI tool from source. 
- name: Build kernel-builder - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true working-directory: kernels\kernel-builder shell: pwsh run: cargo build + # Compile the kernel using the Windows build script, then run + # cmake local_install to create the directory layout for upload. - name: Build kernel - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true shell: pwsh env: KERNEL_SOURCE: ${{ steps.validate.outputs.kernel }} - NEEDS_ONEAPI: ${{ steps.check-backend.outputs.needs_oneapi }} PLATFORM_BACKEND: ${{ matrix.platform.backend }} run: | - # Initialize oneAPI environment for XPU builds that require it - if ($env:PLATFORM_BACKEND -eq "xpu" -and $env:NEEDS_ONEAPI -eq "true") { + # Initialize oneAPI environment for XPU builds + if ($env:PLATFORM_BACKEND -eq "xpu") { $setvarsPath = "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" if (Test-Path $setvarsPath) { Write-Host "Initializing Intel oneAPI environment for XPU build..." -ForegroundColor Cyan @@ -243,21 +285,29 @@ jobs: cmake --build . --config Release --target local_install Pop-Location + # Upload built artifacts to both model and kernel Hub repos. 
- name: Upload kernel to Hub - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true && inputs.mode != 'pr' && inputs.upload != false shell: pwsh env: HF_TOKEN: ${{ secrets.HF_TOKEN }} KERNEL_SOURCE: ${{ steps.validate.outputs.kernel }} + TARGET_BRANCH: ${{ inputs.target_branch }} run: | $KB = "$env:GITHUB_WORKSPACE\kernels\kernel-builder\target\debug\kernel-builder.exe" + $branchArgs = @() + if ($env:TARGET_BRANCH -ne "") { + $branchArgs = @("--branch", $env:TARGET_BRANCH) + } + # Upload to both model and kernel repo types - & $KB upload "$env:KERNEL_SOURCE\build" --repo-type model --repo-id "kernels-community/$env:KERNEL_SOURCE" - & $KB upload "$env:KERNEL_SOURCE\build" --repo-type kernel --repo-id "kernels-community/$env:KERNEL_SOURCE" + & $KB upload "$env:KERNEL_SOURCE\build" --repo-type model --repo-id "${{ inputs.repo_prefix }}/$env:KERNEL_SOURCE" @branchArgs + & $KB upload "$env:KERNEL_SOURCE\build" --repo-type kernel --repo-id "${{ inputs.repo_prefix }}/$env:KERNEL_SOURCE" @branchArgs + # v1 kernels without an explicit branch override also get uploaded to main. - name: Upload v1 kernels to main - if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' + if: steps.validate.outputs.skip == 'false' && steps.check-backend.outputs.supported == 'true' && inputs.skip_build != true && inputs.mode != 'pr' && inputs.upload != false && inputs.target_branch == '' shell: pwsh env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -271,8 +321,8 @@ jobs: $content = Get-Content $buildTomlPath -Raw if ($content -match '(?m)^\s*version\s*=\s*1\s*(\r)?$' -and $content -notmatch '(?m)^\s*branch\s*=') { Write-Host "Kernel version is 1 and no branch override, uploading to main branch..." 
- & $KB upload "$env:KERNEL_SOURCE\build" --repo-type model --repo-id "kernels-community/$env:KERNEL_SOURCE" --branch main - & $KB upload "$env:KERNEL_SOURCE\build" --repo-type kernel --repo-id "kernels-community/$env:KERNEL_SOURCE" --branch main + & $KB upload "$env:KERNEL_SOURCE\build" --repo-type model --repo-id "${{ inputs.repo_prefix }}/$env:KERNEL_SOURCE" --branch main + & $KB upload "$env:KERNEL_SOURCE\build" --repo-type kernel --repo-id "${{ inputs.repo_prefix }}/$env:KERNEL_SOURCE" --branch main } else { Write-Host "Kernel version is not 1 or branch is overridden, skipping main branch upload" } diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 00000000..1de24749 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,322 @@ +name: Build +run-name: >- + Build / ${{ inputs.kernel_name || '' }} / mode=${{ inputs.mode || 'release' }} / request=${{ inputs.dispatch_key || '' }} +on: + workflow_dispatch: + inputs: + kernel_name: + description: "Kernel directory name to build" + required: true + type: string + dispatch_key: + description: "Unique key for matching this run back to a bot dispatch" + required: false + type: string + mode: + description: "Build mode: pr (CI only) or release (build + upload)" + required: false + type: string + default: "release" + skip_build: + description: "Skip build and upload steps (for testing workflow plumbing)" + required: false + type: boolean + default: false + pr_number: + description: "Optional PR number to checkout before building" + required: false + type: string + default: "" + target_branch: + description: "Target branch for upload (default: repo default)" + required: false + type: string + default: "" + upload: + description: "Whether to upload after build" + required: false + type: boolean + default: true + repo_prefix: + description: "Hub org prefix for uploads (e.g. 
kernels-community, kernels-staging)"
+        required: false
+        type: string
+        default: "kernels-community"
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # Validate inputs, resolve the kernel directory, and generate the build matrix.
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      skip: ${{ steps.validate.outputs.skip }}
+      kernel: ${{ steps.validate.outputs.kernel }}
+      matrix: ${{ steps.matrix.outputs.matrix }}
+    steps:
+      # Guard against injection: pr_number reaches the script via env and must be all digits.
+      - name: Validate PR number
+        if: inputs.pr_number != ''
+        id: validate-pr
+        env:
+          PR_NUMBER: ${{ inputs.pr_number }}
+        run: |
+          if ! [[ "$PR_NUMBER" =~ ^[0-9]+$ ]]; then
+            echo "Invalid pr_number input: must be numeric"
+            exit 1
+          fi
+
+      # When building for a PR, check out the PR head; otherwise use default branch.
+      - name: Checkout PR branch
+        if: inputs.pr_number != ''
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: refs/pull/${{ inputs.pr_number }}/head
+          fetch-depth: 0
+      - name: Checkout default branch
+        if: inputs.pr_number == ''
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      # Nix toolchain + binary cache setup.
+      - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22
+        with:
+          extra-conf: |
+            max-jobs = 2
+            cores = 4
+            sandbox-fallback = false
+      - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16
+        with:
+          name: huggingface
+        env:
+          USER: runner
+
+      # Accept only [A-Za-z0-9_-] kernel names whose directory has the required config files.
+      - name: Validate kernel directory
+        id: validate
+        env:
+          KERNEL: ${{ inputs.kernel_name }}
+        run: |
+          if [[ "$KERNEL" =~ ^[A-Za-z0-9_-]+$ ]] && [ -d "$KERNEL" ] && [ -f "$KERNEL/flake.nix" ] && [ -f "$KERNEL/build.toml" ]; then
+            printf 'kernel=%s\nskip=false\n' "$KERNEL" >> "$GITHUB_OUTPUT"
+          else
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      # PR-only: verify the kernel has a Hub repo-id before burning CI time.
+      - name: Check that repo-id is present
+        if: steps.validate.outputs.skip == 'false' && inputs.mode == 'pr'
+        env:
+          KERNEL: ${{ steps.validate.outputs.kernel }}
+        run: |
+          # build.toml is fed on stdin; the check relies on dasel exiting non-zero
+          # when general.hub.repo-id cannot be resolved.
+          if ! nix run nixpkgs#dasel -- -i toml '$root.general.hub.get("repo-id")' < "$KERNEL/build.toml" &> /dev/null; then
+            echo "Mandatory repo-id is missing in $KERNEL/build.toml"
+            exit 1
+          fi
+
+      # PR mode uses backendCi (lighter, faster); release uses backendBundle (all variants).
+      # The target is chosen by an expression so inputs.mode never reaches the shell.
+      - name: Generate build matrix
+        if: steps.validate.outputs.skip == 'false'
+        id: matrix
+        env:
+          KERNEL: ${{ steps.validate.outputs.kernel }}
+          NIX_TARGET: ${{ inputs.mode == 'pr' && 'backendCi' || 'backendBundle' }}
+        run: |
+          # List backend names per system and hand both lists to the matrix generator.
+          X86_BACKENDS=$(cd "$KERNEL" && nix eval .#${NIX_TARGET} --apply builtins.attrNames --json --system x86_64-linux)
+          ARM_BACKENDS=$(cd "$KERNEL" && nix eval .#${NIX_TARGET} --apply builtins.attrNames --json --system aarch64-linux)
+          MATRIX=$(python3 .github/workflows/generate-build-matrix.py "$X86_BACKENDS" "$ARM_BACKENDS")
+          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  # Compile the kernel for each (backend, arch) pair; release mode also uploads to the Hub.
+  build-kernel:
+    needs: setup
+    if: needs.setup.outputs.skip == 'false' && inputs.skip_build != true
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.setup.outputs.matrix) }}
+    runs-on:
+      group: ${{ matrix.runner }}
+    # PR builds get 10h; release builds get 20h (more variants to compile).
+    timeout-minutes: ${{ inputs.mode == 'pr' && 600 || 1200 }}
+    steps:
+      # When building for a PR, check out the PR head; otherwise use default branch.
+      - name: Checkout PR branch
+        if: inputs.pr_number != ''
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: refs/pull/${{ inputs.pr_number }}/head
+          fetch-depth: 0
+      - name: Checkout default branch
+        if: inputs.pr_number == ''
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      # Nix toolchain + binary cache setup (uses matrix-specific concurrency limits).
+      - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22
+        with:
+          extra-conf: |
+            max-jobs = ${{ matrix.max_jobs }}
+            cores = ${{ matrix.cores }}
+            sandbox-fallback = false
+      - name: Nix info
+        run: nix-shell -p nix-info --run "nix-info -m"
+      - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16
+        with:
+          name: huggingface
+        env:
+          USER: runner
+
+      # PR mode builds backendCi (single representative variant);
+      # release mode builds backendBundle (full set of variants).
+      - name: Build kernel
+        env:
+          KERNEL: ${{ needs.setup.outputs.kernel }}
+          BACKEND: ${{ matrix.backend }}
+          NIX_TARGET: ${{ inputs.mode == 'pr' && 'backendCi' || 'backendBundle' }}
+        run: |
+          # Values arrive via env so free-form inputs are never spliced into the script text.
+          ( cd "$KERNEL" && nix build -L ".#${NIX_TARGET}.${BACKEND}" && ls -l result/ )
+
+      # Upload built artifacts to both model and kernel Hub repos.
+      - name: Upload kernel to Hub
+        if: inputs.mode != 'pr' && inputs.upload != false
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          KERNEL: ${{ needs.setup.outputs.kernel }}
+          REPO_PREFIX: ${{ inputs.repo_prefix }}
+          TARGET_BRANCH: ${{ inputs.target_branch }}
+        run: |
+          cd "$KERNEL"
+          # Keep the optional flag in an array so the branch name stays a single argument.
+          BRANCH_ARGS=()
+          if [ -n "$TARGET_BRANCH" ]; then BRANCH_ARGS=(--branch "$TARGET_BRANCH"); fi
+          nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "$REPO_PREFIX/$KERNEL" "${BRANCH_ARGS[@]}"
+          nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "$REPO_PREFIX/$KERNEL" "${BRANCH_ARGS[@]}"
+
+      # v1 kernels without an explicit branch override also get uploaded to main.
+      - name: Upload v1 kernels to main
+        if: inputs.mode != 'pr' && inputs.upload != false && inputs.target_branch == ''
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          KERNEL: ${{ needs.setup.outputs.kernel }}
+          REPO_PREFIX: ${{ inputs.repo_prefix }}
+        run: |
+          cd "$KERNEL"
+
+          # Upload only if build.toml exists, has version = 1, and does not specify a branch.
+          if [ -f build.toml ] \
+            && grep -Eq '^\s*version\s*=\s*1\s*$' build.toml \
+            && ! grep -Eq '^\s*branch\s*=' build.toml; then
+            nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "$REPO_PREFIX/$KERNEL" --branch main
+            nix run -L github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "$REPO_PREFIX/$KERNEL" --branch main
+          fi
+
+  # Build the ci-test derivation and export its Nix closure as an artifact for the
+  # GPU test job. Skipped together with build-kernel when skip_build is set.
+  build-ci-test:
+    needs: setup
+    if: needs.setup.outputs.skip == 'false' && inputs.mode == 'pr' && inputs.skip_build != true
+    runs-on:
+      group: aws-highmemory-32-plus-nix
+    timeout-minutes: 600 # 10h
+    outputs:
+      ci-test-path: ${{ steps.export-closure.outputs.ci-test-path }}
+    steps:
+      # When building for a PR, check out the PR head; otherwise use default branch.
+      - name: Checkout PR branch
+        if: inputs.pr_number != ''
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: refs/pull/${{ inputs.pr_number }}/head
+          fetch-depth: 0
+      - name: Checkout default branch
+        if: inputs.pr_number == ''
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      # Nix toolchain + binary cache setup.
+      - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22
+        with:
+          extra-conf: |
+            max-jobs = 2
+            cores = 12
+            sandbox-fallback = false
+      - name: Nix info
+        run: nix-shell -p nix-info --run "nix-info -m"
+      - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16
+        with:
+          name: huggingface
+        env:
+          USER: runner
+
+      # Build the flake's ci-test output; the GPU test job later runs its bin/ci-test.
+      - name: Build ci-test
+        run: |
+          KERNEL="${{ needs.setup.outputs.kernel }}"
+          ( cd "$KERNEL" && nix build -L .#ci-test )
+
+      # Serialize the full closure (nix-store -qR) so the GPU runner can import it instead of rebuilding.
+      # NOTE(review): no pipefail is set here, so a failing nix-store --export may be masked by zstd's exit status - confirm.
+      - name: Export ci-test closure
+        id: export-closure
+        run: |
+          KERNEL="${{ needs.setup.outputs.kernel }}"
+          CI_TEST_PATH=$(readlink -f "$KERNEL/result")
+          echo "ci-test-path=$CI_TEST_PATH" >> $GITHUB_OUTPUT
+          nix-store --export $(nix-store -qR "$CI_TEST_PATH") | nix run nixpkgs#zstd -- -T0 > ci-test-closure.nar.zst
+
+      # Upload the closure as a short-lived artifact (1-day retention) for the GPU test job.
+      - name: Upload ci-test closure
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: ci-test-closure
+          path: ci-test-closure.nar.zst
+          retention-days: 1
+
+  # Import the ci-test closure onto a GPU runner and execute the test suite.
+  # Runs after both build-kernel and build-ci-test complete.
+  test-kernel-gpu:
+    needs: [setup, build-kernel, build-ci-test]
+    if: needs.setup.outputs.skip == 'false' && inputs.mode == 'pr'
+    runs-on:
+      group: aws-g6-12xlarge-plus
+    steps:
+      # Nix toolchain + binary cache setup (needed to import the closure).
+      - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22
+        with:
+          extra-conf: |
+            max-jobs = 2
+            cores = 12
+      - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16
+        with:
+          name: huggingface
+        env:
+          USER: runner
+
+      # Symlink the host's libcuda.so* from /usr/lib64 into /run/opengl-driver/lib, then list the result.
+      - name: Setup Nix driver location
+        run: |
+          sudo mkdir -p /run/opengl-driver/lib
+          sudo find /usr/lib64 \
+            -name 'libcuda.so*' \
+            -exec ln -s {} /run/opengl-driver/lib/ \;
+          find /run/opengl-driver
+
+      # Download the ci-test-closure artifact uploaded by the build-ci-test job.
+      - name: Download ci-test closure
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
+        with:
+          name: ci-test-closure
+
+      # Decompress and import the closure into the local Nix store.
+      - name: Import ci-test closure
+        run: |
+          nix run nixpkgs#zstd -- -d ci-test-closure.nar.zst -c | nix-store --import
+
+      # Run the kernel's test suite: execute bin/ci-test from the store path reported by build-ci-test.
+      - name: Run GPU tests
+        run: |
+          CI_TEST_PATH="${{ needs.build-ci-test.outputs.ci-test-path }}"
+          "$CI_TEST_PATH/bin/ci-test"
diff --git a/.github/workflows/manual-build-upload.yaml b/.github/workflows/manual-build-upload.yaml
deleted file mode 100644
index 695183f4..00000000
--- a/.github/workflows/manual-build-upload.yaml
+++ /dev/null
@@ -1,140 +0,0 @@
-name: Manual Kernel Build
-run-name: >-
-  Manual Kernel Build / ${{ inputs.kernel_name }} / target=${{ inputs.target_branch }} / request=${{ inputs.dispatch_key }}
-
-on:
-  workflow_dispatch:
-    inputs:
-      kernel_name:
-        description: "Kernel directory to build and upload (e.g. 
flash-attn3)" - required: true - pr_number: - description: "Optional PR number to checkout before building" - required: false - default: "" - target_branch: - description: "Target branch on kernels-community/ to publish to" - required: true - upload: - description: "Whether to upload after build (internal use by kernel-bot)" - required: false - default: "true" - allow_main_dispatch: - description: "Allow dispatch from default branch without pr_number (internal)" - required: false - default: "false" - dispatch_key: - description: "Unique request token for correlating workflow runs (internal)" - required: false - default: "manual" - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build-and-upload: - runs-on: - group: aws-highmemory-32-plus-nix - steps: - - name: Ensure workflow is not run from main - if: ${{ github.ref == 'refs/heads/main' && inputs.pr_number == '' && inputs.allow_main_dispatch != 'true' }} - run: | - echo "❌ This workflow must be dispatched from a non-main branch." 
- exit 1 - - - name: Checkout selected branch - if: ${{ inputs.pr_number == '' }} - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 0 - - - name: Validate PR number - if: ${{ inputs.pr_number != '' }} - id: validate-pr-number - env: - PR_NUMBER: ${{ inputs.pr_number }} - run: | - set -eu - case "$PR_NUMBER" in - ''|*[!0-9]*) - echo "Invalid pr_number input: must be numeric" - exit 1 - ;; - esac - echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT" - - - name: Checkout PR branch - if: ${{ inputs.pr_number != '' }} - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - ref: refs/pull/${{ steps.validate-pr-number.outputs.pr_number }}/head - fetch-depth: 0 - - - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - with: - extra-conf: | - max-jobs = 2 - cores = 12 - sandbox-fallback = false - - - name: Nix info - run: nix-shell -p nix-info --run "nix-info -m" - - - uses: cachix/cachix-action@3ba601ff5bbb07c7220846facfa2cd81eeee15a1 # v16 - with: - name: huggingface - env: - USER: runner - - - name: Validate kernel directory - id: validate - env: - KERNEL_INPUT: ${{ inputs.kernel_name }} - PR_TITLE: "${{ inputs.kernel_name }}: manual dispatch" - run: | - set -eu - case "$KERNEL_INPUT" in - ''|*[!A-Za-z0-9_-]*) - echo "Invalid kernel_name: must contain only alphanumeric characters, underscores, and hyphens" - exit 1 - ;; - esac - if KERNEL=$(python3 .github/workflows/validate-kernel-pr.py "release"); then - echo "kernel=$KERNEL" >> "$GITHUB_OUTPUT" - else - echo "Kernel validation failed." 
- exit 1 - fi - - - name: Build and copy kernel - run: | - set -eu - KERNEL="${{ steps.validate.outputs.kernel }}" - ( cd "$KERNEL" && nix run -L .#build-and-copy ) - - - name: Validate target branch - if: ${{ inputs.upload == 'true' }} - id: validate-target-branch - env: - TARGET_BRANCH: ${{ inputs.target_branch }} - run: | - set -eu - case "$TARGET_BRANCH" in - ''|*[!A-Za-z0-9/_-]*) - echo "Invalid target_branch: must contain only alphanumeric characters, underscores, hyphens, and slashes" - exit 1 - ;; - esac - echo "target_branch=$TARGET_BRANCH" >> "$GITHUB_OUTPUT" - - - name: Upload kernel - if: ${{ inputs.upload == 'true' }} - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - TARGET_BRANCH: ${{ steps.validate-target-branch.outputs.target_branch }} - run: | - set -eu - KERNEL="${{ steps.validate.outputs.kernel }}" - ( cd "$KERNEL" && nix run github:huggingface/kernels#kernel-builder -- upload --repo-type model --repo-id "kernels-community/$KERNEL" --branch "${TARGET_BRANCH}" . ) - ( cd "$KERNEL" && nix run github:huggingface/kernels#kernel-builder -- upload --repo-type kernel --repo-id "kernels-community/$KERNEL" --branch "${TARGET_BRANCH}" . )