Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
0c752b1
chore: checkpoint github app auth refactor
RedBoardDev May 16, 2026
78b91e8
fix(controller): index loop to avoid implicit loop-var capture (II.4)
RedBoardDev May 16, 2026
2e34cd0
fix(notification/discord): extract rate-limit retry to dedicated help…
RedBoardDev May 16, 2026
e2f4c3c
fix(cli/state): drop unused Groups field from daemonState (II.14)
RedBoardDev May 16, 2026
77f4991
fix(runner/cleanup): log SIGKILL failures on orphan runners (II.15)
RedBoardDev May 16, 2026
3d43316
refactor(runner/cleanup): extract removeStaleDir helper (II.16)
RedBoardDev May 16, 2026
d46020d
refactor(model,runner): narrow PID type to int32 (III.14)
RedBoardDev May 16, 2026
305d53a
feat(config): enforce GitHub label syntax on groups (III.6)
RedBoardDev May 16, 2026
c3b028f
refactor(state): centralize daemon path layout into a Paths package (…
RedBoardDev May 16, 2026
10291b8
feat(logging): remove runner log dir on cleanup (III.9)
RedBoardDev May 16, 2026
06ae09a
refactor(model): drop unused RunnerInstance.ID field (II.13)
RedBoardDev May 16, 2026
5ca921c
refactor(runner): unexport Process.cmd to keep exec.Cmd local (II.11)
RedBoardDev May 16, 2026
0da766f
fix(api): graceful Shutdown and wait for Serve to return (I.10, II.7)
RedBoardDev May 16, 2026
98fa650
fix(auth,runner): bound HTTP client lifetimes (I.5)
RedBoardDev May 16, 2026
ab4483f
fix(auth): truncate PAT validation error bodies and share helpers (II.3)
RedBoardDev May 16, 2026
b97d53a
fix(auth): mask PAT through slog.LogValuer on Credentials (I.13)
RedBoardDev May 16, 2026
4aebf7b
fix(launchd): xml-escape user-controlled fields in plist template (I.7)
RedBoardDev May 16, 2026
6b35301
fix(api): restrict unix socket to 0600 after listen (I.4)
RedBoardDev May 16, 2026
37bc5a3
fix(runner): reject non-local tar symlinks instead of silent fallback…
RedBoardDev May 16, 2026
948d122
fix(runner): refuse non-local symlinks during copyDir (I.9)
RedBoardDev May 16, 2026
b575a60
fix(runner): verify SHA-256 of the runner tarball before trusting it …
RedBoardDev May 16, 2026
37163d9
fix(runner): serialize EnsureBits per version with a .complete marker…
RedBoardDev May 16, 2026
261bda0
fix(runner,config): bound workdir_base and verify orphan pids (I.3)
RedBoardDev May 16, 2026
8734c70
fix(auth): warn when credentials file permissions are too loose (I.6)
RedBoardDev May 16, 2026
032ffc9
fix(launchd): migrate from load/unload to bootstrap/bootout/kickstart…
RedBoardDev May 16, 2026
5e3d44a
fix(config): expose runner_group_id and stop hardcoding 1 (II.5)
RedBoardDev May 16, 2026
ac9ae61
fix(health): dispatch reporters/notifier asynchronously and outside t…
RedBoardDev May 16, 2026
a0e0e64
test(health): add race regression covering Status/runChecks concurren…
RedBoardDev May 16, 2026
c0ca462
fix(cli): wrap daemon actors in panic-recovering safeActor (IV.1)
RedBoardDev May 16, 2026
42f5be8
fix(controller): add ±20%% jitter to group backoff (IV.8)
RedBoardDev May 16, 2026
d427794
fix(runner): bound the post-SIGKILL Wait in Stop (IV.9)
RedBoardDev May 16, 2026
e6f6ac7
fix(notification/discord): retry once on 5xx with a fixed backoff (IV…
RedBoardDev May 16, 2026
8099a19
fix(runner): garbage-collect old binary caches on success (IV.5)
RedBoardDev May 16, 2026
cd19fc1
fix(auth): wrap GitHub API calls in a circuit breaker (IV.2)
RedBoardDev May 16, 2026
9d27a08
feat(cli): add liveness watchdog that exits to let launchd respawn (I…
RedBoardDev May 16, 2026
9816fee
feat(logging): tag runner stdout/stderr with structured JSON envelope…
RedBoardDev May 16, 2026
6714867
docs(readme): sync repository structure with the actual layout (IX.1)
RedBoardDev May 16, 2026
bd24b59
chore: address lint findings introduced by the audit-fix series
RedBoardDev May 16, 2026
15d3a2f
style: gofmt watchdog.go
RedBoardDev May 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@ Thumbs.db
*.log

# Runner workdirs (local dev)
runners/
cache/
state/
/runners/
/cache/
/state/
23 changes: 15 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,22 @@ golangci-lint run # lint (if installed)

```
ghr/
├── cmd/ghr/main.go # Entrypoint
├── cmd/ghr/main.go # Entrypoint
├── internal/
│ ├── cli/ # Cobra commands
│ ├── auth/ # Credentials management
│ ├── config/ # YAML + env config
│ ├── runner/ # Binary download & process lifecycle
│ ├── github/ # Scale set SDK adapter
│ ├── model/ # Shared data structs
│ └── logging/ # Structured logging
│ ├── cli/ # Cobra commands (start/stop/run/status/purge/login/...)
│ ├── auth/ # Credentials, JWT signing, installation tokens, breaker
│ ├── config/ # YAML + env loading, validation, defaults
│ ├── controller/ # Scale-set orchestration and per-group scaler
│ ├── runner/ # Binary download (with SHA-256 verify) and process lifecycle
│ ├── github/ # scaleset SDK adapter
│ ├── health/ # Health monitor and check functions
│ ├── notification/ # Discord + webhook providers with filtering
│ ├── monitoring/ # Uptime Kuma push reporter
│ ├── api/ # Unix-socket JSON API exposing status/health
│ ├── launchd/ # macOS service install/uninstall via bootstrap/bootout
│ ├── state/ # Centralized daemon-state file paths (pid/sock/state)
│ ├── model/ # Shared data structs (no logic)
│ └── logging/ # Structured logging, rotation, tagged runner output
├── go.mod
└── go.sum
```
Expand Down
798 changes: 798 additions & 0 deletions audit.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
github:
url: "https://github.com/my-org"
runner_group: "default"
runner_group_id: 1

runner:
version: "latest"
Expand Down
30 changes: 19 additions & 11 deletions internal/api/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ import (
"net"
"net/http"
"os"
"path/filepath"
"time"

"github.com/RedBoardDev/gh-runners-tool/v2/internal/health"
"github.com/RedBoardDev/gh-runners-tool/v2/internal/model"
"github.com/RedBoardDev/gh-runners-tool/v2/internal/state"
)

type controllerState interface {
Expand All @@ -32,7 +33,7 @@ type Server struct {

func NewServer(stateDir string, controller controllerState, healthProvider healthState, logger *slog.Logger) *Server {
return &Server{
socketPath: filepath.Join(stateDir, "ghr.sock"),
socketPath: state.New(stateDir).Socket(),
controller: controller,
health: healthProvider,
logger: logger,
Expand All @@ -50,6 +51,12 @@ func (s *Server) Run(ctx context.Context) error {
}
s.listener = ln

if chmodErr := os.Chmod(s.socketPath, 0o600); chmodErr != nil {
ln.Close()
_ = os.Remove(s.socketPath)
return fmt.Errorf("chmod socket %s: %w", s.socketPath, chmodErr)
}

srv := &http.Server{
Handler: s.routes(),
}
Expand All @@ -59,22 +66,23 @@ func (s *Server) Run(ctx context.Context) error {
errCh <- srv.Serve(ln)
}()

defer func() {
if cleanupErr := os.Remove(s.socketPath); cleanupErr != nil && !os.IsNotExist(cleanupErr) {
s.logger.Warn("failed to remove socket file", "path", s.socketPath, "error", cleanupErr)
}
}()

select {
case <-ctx.Done():
shutdownErr := srv.Close()
cleanupErr := os.Remove(s.socketPath)
shutdownCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second)
defer cancel()
shutdownErr := srv.Shutdown(shutdownCtx)
<-errCh
if shutdownErr != nil {
return fmt.Errorf("shutdown api server: %w", shutdownErr)
}
if cleanupErr != nil && !os.IsNotExist(cleanupErr) {
s.logger.Warn("failed to remove socket file", "path", s.socketPath, "error", cleanupErr)
}
return nil
case err := <-errCh:
cleanupErr := os.Remove(s.socketPath)
if cleanupErr != nil && !os.IsNotExist(cleanupErr) {
s.logger.Warn("failed to remove socket file", "path", s.socketPath, "error", cleanupErr)
}
if errors.Is(err, http.ErrServerClosed) {
return nil
}
Expand Down
45 changes: 45 additions & 0 deletions internal/api/server_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package api

import (
"context"
"encoding/json"
"log/slog"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"time"

Expand Down Expand Up @@ -191,3 +193,46 @@ func TestHandleStatus_EmptyGroups(t *testing.T) {
t.Fatalf("expected 0 groups, got %d", len(body.Groups))
}
}

func TestServer_Run_SocketPermissions(t *testing.T) {
// Unix domain sockets on macOS cap at ~104 chars, so avoid t.TempDir() which
// returns long /var/folders/... paths. Use a short directory under /tmp.
stateDir, err := os.MkdirTemp("/tmp", "ghr-sock-")
if err != nil {
t.Fatalf("mkdir temp: %v", err)
}
t.Cleanup(func() { os.RemoveAll(stateDir) })

srv := NewServer(stateDir, &mockController{}, &mockHealth{},
slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelError + 1})))

ctx, cancel := context.WithCancel(context.Background())
done := make(chan error, 1)
go func() { done <- srv.Run(ctx) }()

socket := filepath.Join(stateDir, "ghr.sock")
deadline := time.Now().Add(2 * time.Second)
var info os.FileInfo
for time.Now().Before(deadline) {
if info, err = os.Stat(socket); err == nil {
break
}
time.Sleep(20 * time.Millisecond)
}

if err != nil {
cancel()
<-done
t.Fatalf("stat socket: %v", err)
}
mode := info.Mode().Perm()

cancel()
if err := <-done; err != nil {
t.Fatalf("server run: %v", err)
}

if mode != 0o600 {
t.Fatalf("socket perm = %#o, want 0600", mode)
}
}
34 changes: 34 additions & 0 deletions internal/auth/auth_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package auth
import (
"context"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"os"
Expand Down Expand Up @@ -230,6 +231,39 @@ func TestSave_And_Load(t *testing.T) {
}
}

func TestLoad_WarnsOnLoosePermissions(t *testing.T) {
dir := t.TempDir()
credFile := filepath.Join(dir, "credentials.json")
t.Setenv("GHR_CREDENTIALS_FILE", credFile)
t.Setenv("GITHUB_TOKEN", "")

creds := &Credentials{Method: "pat", PAT: "ghp_loose", GitHubURL: "https://github.com/x"}
if err := Save(creds); err != nil {
t.Fatalf("Save: %v", err)
}
if err := os.Chmod(credFile, 0o644); err != nil {
t.Fatalf("chmod: %v", err)
}

r, w, err := os.Pipe()
if err != nil {
t.Fatalf("pipe: %v", err)
}
origStderr := os.Stderr
os.Stderr = w
defer func() { os.Stderr = origStderr }()

if _, _, loadErr := Load(LoadOpts{}); loadErr != nil {
t.Fatalf("Load: %v", loadErr)
}
w.Close()

out, _ := io.ReadAll(r)
if !strings.Contains(string(out), "warning") || !strings.Contains(string(out), "chmod 600") {
t.Errorf("expected loose-perm warning, got: %q", out)
}
}

func TestSave_CreatesDirectory(t *testing.T) {
dir := t.TempDir()
nestedPath := filepath.Join(dir, "nested", "deep", "credentials.json")
Expand Down
81 changes: 81 additions & 0 deletions internal/auth/breaker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package auth

import (
"errors"
"net/http"
"sync"
"time"
)

// ErrCircuitOpen is returned by the circuit breaker while it is tripped.
var ErrCircuitOpen = errors.New("github API circuit open: too many consecutive failures")

const (
breakerFailureThreshold = 5
breakerOpenDuration = 60 * time.Second
)

type circuitBreaker struct {
mu sync.Mutex
consecutiveFails int
openedAt time.Time
clock func() time.Time
}

func newCircuitBreaker() *circuitBreaker {
return &circuitBreaker{clock: time.Now}
}

func (b *circuitBreaker) allow() bool {
b.mu.Lock()
defer b.mu.Unlock()
if b.openedAt.IsZero() {
return true
}
if b.clock().Sub(b.openedAt) >= breakerOpenDuration {
// Half-open: allow a single probe.
b.openedAt = time.Time{}
b.consecutiveFails = breakerFailureThreshold - 1
return true
}
return false
}

func (b *circuitBreaker) recordSuccess() {
b.mu.Lock()
defer b.mu.Unlock()
b.consecutiveFails = 0
b.openedAt = time.Time{}
}

func (b *circuitBreaker) recordFailure() {
b.mu.Lock()
defer b.mu.Unlock()
b.consecutiveFails++
if b.consecutiveFails >= breakerFailureThreshold && b.openedAt.IsZero() {
b.openedAt = b.clock()
}
}

func isBreakable(resp *http.Response, err error) bool {
if err != nil {
return true
}
return resp.StatusCode >= 500
}

var apiBreaker = newCircuitBreaker()

// doGuarded routes the request through the package-level circuit breaker.
func doGuarded(req *http.Request) (*http.Response, error) {
if !apiBreaker.allow() {
return nil, ErrCircuitOpen
}
resp, err := httpClient.Do(req)
if isBreakable(resp, err) {
apiBreaker.recordFailure()
} else {
apiBreaker.recordSuccess()
}
return resp, err
}
71 changes: 71 additions & 0 deletions internal/auth/breaker_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package auth

import (
"errors"
"net/http"
"testing"
"time"
)

func TestCircuitBreaker_OpensAfterThreshold(t *testing.T) {
b := newCircuitBreaker()
for i := 0; i < breakerFailureThreshold; i++ {
if !b.allow() {
t.Fatalf("allow() = false before threshold, iteration %d", i)
}
b.recordFailure()
}
if b.allow() {
t.Error("allow() = true after threshold, want false")
}
}

func TestCircuitBreaker_SuccessResets(t *testing.T) {
b := newCircuitBreaker()
for i := 0; i < breakerFailureThreshold-1; i++ {
b.recordFailure()
}
b.recordSuccess()
if !b.allow() {
t.Error("allow() = false after success reset")
}
for i := 0; i < breakerFailureThreshold-1; i++ {
b.recordFailure()
}
if !b.allow() {
t.Error("allow() = false below threshold")
}
}

func TestCircuitBreaker_HalfOpenAfterTimeout(t *testing.T) {
b := newCircuitBreaker()
now := time.Now()
b.clock = func() time.Time { return now }

for i := 0; i < breakerFailureThreshold; i++ {
b.recordFailure()
}
if b.allow() {
t.Fatal("breaker should be open")
}

now = now.Add(breakerOpenDuration + time.Second)
if !b.allow() {
t.Error("breaker should allow a probe after open duration")
}
}

func TestIsBreakable(t *testing.T) {
if !isBreakable(nil, errors.New("net err")) {
t.Error("network error should be breakable")
}
if !isBreakable(&http.Response{StatusCode: 503}, nil) {
t.Error("5xx should be breakable")
}
if isBreakable(&http.Response{StatusCode: 401}, nil) {
t.Error("4xx should not trip the breaker")
}
if isBreakable(&http.Response{StatusCode: 200}, nil) {
t.Error("2xx should not trip the breaker")
}
}
Loading
Loading