Fair share simulator (#339)

itsomri · web-flow · commit d0d8cc3f6216 · 2025-07-28T11:11:19.000+03:00
* Added fair share simulator
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,7 @@ cover.out
 .DS_Store
 coverage/
 *.test
-launch.json
+launch.json
+
+cmd/fairshare-simulator/fairshare-simulator
+cmd/snapshot-tool/snapshot-tool
diff --git a/Makefile b/Makefile
@@ -16,7 +16,7 @@ KUSTOMIZE ?= $(LOCALBIN)/kustomize
 
 # Space seperated list of services to build by default
 # SERVICE_NAMES := service1 service2 service3
-SERVICE_NAMES := podgrouper scheduler binder webhookmanager resourcereservation snapshot-tool scalingpod nodescaleadjuster podgroupcontroller queuecontroller
+SERVICE_NAMES := podgrouper scheduler binder webhookmanager resourcereservation snapshot-tool scalingpod nodescaleadjuster podgroupcontroller queuecontroller fairshare-simulator
 
 
 lint: fmt-go vet-go lint-go
diff --git a/cmd/fairshare-simulator/README.md b/cmd/fairshare-simulator/README.md
@@ -0,0 +1,87 @@
+# Fairshare Simulator
+
+This is a simple HTTP server that simulates the fair share resource division algorithm used in the KAI Scheduler's proportion plugin.
+
+## Building and Running
+
+Build the simulator from the `cmd/fairshare-simulator` directory:
+
+```bash
+go build .
+```
+
+Run it:
+
+```bash
+./fairshare-simulator -port=8080
+```
+
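+The port is configurable with the `-port` flag and defaults to 8080. Pass `-enable-cors` (off by default) to add CORS headers to responses and to answer `OPTIONS` preflight requests.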
+
+## Usage
+
+Send a POST request to `/simulate` with a JSON body containing the simulation parameters.
+
+### Example Request
+
+```http
+POST /simulate HTTP/1.1
+Content-Type: application/json
+
+{
+    "totalResource": {
+      "GPU": 100,
+      "CPU": 16000,
+      "Memory": 32000000
+    },
+    "queues": [
+      {
+        "uid": "queue1",
+        "name": "test-queue",
+        "priority": 0,
+        "resourceShare": {
+          "gpu": {
+            "deserved": 10,
+            "request": 100,
+            "overQuotaWeight": 3
+          }
+        }
+      },
+      {
+        "uid": "queue2",
+        "name": "test-queue2",
+        "priority": 0,
+        "resourceShare": {
+          "gpu": {
+            "deserved": 10,
+            "request": 100,
+            "overQuotaWeight": 1
+          }
+        }
+      }
+    ]
+}
+```
+
+### Response
+
+The response is a JSON object with fair share values for each queue:
+
+```json
+{
+  "queue1": {
+    "gpu": 70,
+    "cpu": 16000,
+    "memory": 100000
+  },
+  "queue2": {
+    "gpu": 30,
+    "cpu": 16000,
+    "memory": 100000
+  }
+}
+```
+
+(Note: Actual values depend on the input parameters and the simulation logic.)
+
+This simulator uses the `SetResourcesShare` function from the proportion plugin to compute the fair shares. 
diff --git a/cmd/fairshare-simulator/example.http b/cmd/fairshare-simulator/example.http
@@ -0,0 +1,36 @@
+POST http://localhost:8080/simulate HTTP/1.1
+content-type: application/json
+
+{
+    "totalResource": {
+      "GPU": 100,
+      "CPU": 16000,
+      "Memory": 32000000
+    },
+    "queues": [
+      {
+        "uid": "queue1",
+        "name": "test-queue",
+        "priority": 0,
+        "resourceShare": {
+          "gpu": {
+            "deserved": 10,
+            "request": 100,
+            "overQuotaWeight": 3
+          }
+        }
+      },
+      {
+        "uid": "queue2",
+        "name": "test-queue2",
+        "priority": 0,
+        "resourceShare": {
+          "gpu": {
+            "deserved": 10,
+            "request": 100,
+            "overQuotaWeight": 1
+          }
+        }
+      }
+    ]
+}
diff --git a/cmd/fairshare-simulator/main.go b/cmd/fairshare-simulator/main.go
@@ -0,0 +1,103 @@
+// Copyright 2023 NVIDIA CORPORATION
+// SPDX-License-Identifier: Apache-2.0
+
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"log"
+	"net/http"
+	"time"
+
+	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
+	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/resource_division"
+	rs "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/resource_share"
+)
+
+// SimulateRequest is the JSON body decoded by the /simulate handler.
+type SimulateRequest struct {
+	// TotalResource is forwarded to SimulateSetResourcesShare as the total
+	// resource quantities to divide.
+	TotalResource rs.ResourceQuantities `json:"totalResource"`
+	// Queues holds one set of overrides per queue taking part in the simulation.
+	Queues        []rs.QueueOverrides   `json:"queues"`
+}
+
+// QueueFairShare is the per-queue entry of the /simulate response. Each field
+// carries the FairShare value of the corresponding resource.
+type QueueFairShare struct {
+	GPU    float64 `json:"gpu"`
+	CPU    float64 `json:"cpu"`
+	Memory float64 `json:"memory"`
+}
+
+// server holds the configuration shared by the HTTP handlers.
+type server struct {
+	// enableCors makes simulateHandler add CORS headers and answer OPTIONS requests.
+	enableCors bool
+}
+
+// enableCorsHeaders sets the CORS response headers on the writer that w points
+// to: any origin is allowed, the permitted methods are POST and OPTIONS, and
+// the only permitted request header is Content-Type.
+func (s *server) enableCorsHeaders(w *http.ResponseWriter) {
+	header := (*w).Header()
+	header.Set("Access-Control-Allow-Origin", "*")
+	header.Set("Access-Control-Allow-Methods", "POST, OPTIONS")
+	header.Set("Access-Control-Allow-Headers", "Content-Type")
+}
+
+// simulateHandler serves the /simulate endpoint.
+//
+// It decodes a SimulateRequest from the request body, runs
+// SimulateSetResourcesShare on it and writes a JSON object that maps every
+// queue UID to its QueueFairShare.
+//
+// Responses:
+//   - 200 with the JSON result on success, or with an empty body for an
+//     OPTIONS request when CORS is enabled.
+//   - 400 when the body cannot be decoded as JSON.
+//   - 405, together with an Allow header, for any other method.
+//   - 500 when the result cannot be encoded.
+func (s *server) simulateHandler(w http.ResponseWriter, r *http.Request) {
+	allowedMethods := http.MethodPost
+	if s.enableCors {
+		s.enableCorsHeaders(&w)
+		if r.Method == http.MethodOptions {
+			// CORS preflight: the headers set above are the whole answer.
+			w.WriteHeader(http.StatusOK)
+			return
+		}
+		allowedMethods = http.MethodPost + ", " + http.MethodOptions
+	}
+
+	if r.Method != http.MethodPost {
+		// A 405 response must tell the client which methods are supported.
+		w.Header().Set("Allow", allowedMethods)
+		http.Error(w, "Only POST allowed", http.StatusMethodNotAllowed)
+		return
+	}
+
+	var req SimulateRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		http.Error(w, err.Error(), http.StatusBadRequest)
+		return
+	}
+
+	queues := SimulateSetResourcesShare(req.TotalResource, req.Queues)
+
+	resp := make(map[string]QueueFairShare, len(queues))
+	for id, qa := range queues {
+		resp[string(id)] = QueueFairShare{
+			GPU:    qa.GPU.FairShare,
+			CPU:    qa.CPU.FairShare,
+			Memory: qa.Memory.FairShare,
+		}
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	if err := json.NewEncoder(w).Encode(resp); err != nil {
+		http.Error(w, fmt.Sprintf("Failed to encode response: %v", err), http.StatusInternalServerError)
+		return
+	}
+}
+
+func main() {
+	var port = flag.Int("port", 8080, "Port to listen on")
+	var enableCors = flag.Bool("enable-cors", false, "Enable CORS headers for cross-origin requests")
+	flag.Parse()
+
+	s := &server{
+		enableCors: *enableCors,
+	}
+
+	http.HandleFunc("/simulate", s.simulateHandler)
+	log.Printf("Starting server on port %d (CORS enabled: %v)...", *port, *enableCors)
+	err := http.ListenAndServe(fmt.Sprintf(":%d", *port), nil)
+	if err != nil {
+		log.Fatalf("Failed to start server: %v", err)
+	}
+}
+
+// SimulateSetResourcesShare converts every entry of queueOverrides into queue
+// attributes, indexes them by UID, and passes the resulting map together with
+// totalResource to resource_division.SetResourcesShare.
+//
+// The same map is returned afterwards. When several overrides share a UID,
+// the last one replaces the earlier ones.
+func SimulateSetResourcesShare(totalResource rs.ResourceQuantities, queueOverrides []rs.QueueOverrides) map[common_info.QueueID]*rs.QueueAttributes {
+	byUID := make(map[common_info.QueueID]*rs.QueueAttributes, len(queueOverrides))
+	for i := range queueOverrides {
+		attrs := queueOverrides[i].ToQueueAttributes()
+		byUID[attrs.UID] = attrs
+	}
+
+	resource_division.SetResourcesShare(totalResource, byUID)
+	return byUID
+}
diff --git a/pkg/scheduler/plugins/proportion/resource_share/resource_share_defaults.go b/pkg/scheduler/plugins/proportion/resource_share/resource_share_defaults.go
@@ -0,0 +1,130 @@
+// Copyright 2025 NVIDIA CORPORATION
+// SPDX-License-Identifier: Apache-2.0
+
+package resource_share
+
+import (
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	consts "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
+	common_info "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
+	"knative.dev/pkg/ptr"
+)
+
+const (
+	// defaultRequest is the Request value given to a resource share when the
+	// override does not set one.
+	defaultRequest = 100000
+)
+
+// ResourceShareOverrides lists optional values for a single resource of a
+// queue. A nil field means "not set"; ResourceShare then keeps its default
+// for that field.
+//
+// NOTE(review): AbsoluteUsage and VacantAdjustedUsage are decoded from JSON
+// but are not read by ResourceShare — confirm whether that is intentional.
+type ResourceShareOverrides struct {
+	Deserved                *float64 `json:"deserved"`
+	FairShare               *float64 `json:"fairShare"`
+	MaxAllowed              *float64 `json:"maxAllowed"`
+	OverQuotaWeight         *float64 `json:"overQuotaWeight"`
+	Allocated               *float64 `json:"allocated"`
+	AllocatedNotPreemptible *float64 `json:"allocatedNotPreemptible"`
+	Request                 *float64 `json:"request"`
+	AbsoluteUsage           *float64 `json:"absoluteUsage"`
+	VacantAdjustedUsage     *float64 `json:"vacantAdjustedUsage"`
+}
+
+// ResourceShare returns a new ResourceShare in which every field holds the
+// override value when one is set and the default otherwise.
+//
+// Defaults: MaxAllowed is consts.UnlimitedResourceQuantity, OverQuotaWeight
+// is 1, Request is defaultRequest, and Deserved, FairShare, Allocated and
+// AllocatedNotPreemptible are 0.
+//
+// NOTE(review): the AbsoluteUsage and VacantAdjustedUsage overrides are not
+// applied here — confirm whether that is intentional.
+func (r *ResourceShareOverrides) ResourceShare() *ResourceShare {
+	// valueOr picks the override when present and the fallback otherwise.
+	valueOr := func(override *float64, fallback float64) float64 {
+		if override == nil {
+			return fallback
+		}
+		return *override
+	}
+
+	return &ResourceShare{
+		Deserved:                valueOr(r.Deserved, 0),
+		FairShare:               valueOr(r.FairShare, 0),
+		MaxAllowed:              valueOr(r.MaxAllowed, consts.UnlimitedResourceQuantity),
+		OverQuotaWeight:         valueOr(r.OverQuotaWeight, 1),
+		Allocated:               valueOr(r.Allocated, 0),
+		AllocatedNotPreemptible: valueOr(r.AllocatedNotPreemptible, 0),
+		Request:                 valueOr(r.Request, defaultRequest),
+	}
+}
+
+// QueueResourceShareOverrides groups the optional overrides of a queue by
+// resource. A nil entry means no overrides were given for that resource.
+type QueueResourceShareOverrides struct {
+	GPU    *ResourceShareOverrides `json:"gpu"`
+	CPU    *ResourceShareOverrides `json:"cpu"`
+	Memory *ResourceShareOverrides `json:"memory"`
+}
+
+// ResourceShare resolves the overrides of all three resources into a
+// QueueResourceShare.
+//
+// A resource without overrides is resolved from a substitute: an empty set of
+// overrides for GPU, and a set whose Deserved is
+// consts.UnlimitedResourceQuantity for CPU and Memory. The receiver itself is
+// left untouched.
+func (q QueueResourceShareOverrides) ResourceShare() QueueResourceShare {
+	gpu, cpu, memory := q.GPU, q.CPU, q.Memory
+
+	if gpu == nil {
+		gpu = &ResourceShareOverrides{}
+	}
+	if cpu == nil {
+		cpu = &ResourceShareOverrides{Deserved: ptr.Float64(consts.UnlimitedResourceQuantity)}
+	}
+	if memory == nil {
+		memory = &ResourceShareOverrides{Deserved: ptr.Float64(consts.UnlimitedResourceQuantity)}
+	}
+
+	return QueueResourceShare{
+		GPU:    *gpu.ResourceShare(),
+		CPU:    *cpu.ResourceShare(),
+		Memory: *memory.ResourceShare(),
+	}
+}
+
+// QueueOverrides is the JSON description of one queue. ToQueueAttributes
+// turns it into QueueAttributes.
+type QueueOverrides struct {
+	UID               common_info.QueueID         `json:"uid"`
+	Name              string                      `json:"name"`
+	ParentQueue       common_info.QueueID         `json:"parentQueue"`
+	ChildQueues       []common_info.QueueID       `json:"childQueues"`
+	// CreationTimestamp is parsed as RFC 3339 by ToQueueAttributes; when it is
+	// nil or cannot be parsed, the current time is used instead.
+	CreationTimestamp *string                     `json:"creationTimestamp"`
+	// Priority defaults to 0 in ToQueueAttributes when nil.
+	Priority          *int                        `json:"priority"`
+	ResourceShare     QueueResourceShareOverrides `json:"resourceShare"`
+}
+
+// ToQueueAttributes builds the QueueAttributes described by qo.
+//
+// UID, Name, ParentQueue and ChildQueues are copied as they are. Priority is
+// 0 unless set. CreationTimestamp is the parsed RFC 3339 value when one is
+// given and valid, and the current time otherwise; a parse error is not
+// reported. The resource shares come from qo.ResourceShare.ResourceShare().
+func (qo *QueueOverrides) ToQueueAttributes() *QueueAttributes {
+	// Start from the fallback and replace it only by a timestamp that parses.
+	created := metav1.Now()
+	if qo.CreationTimestamp != nil {
+		if parsed, err := time.Parse(time.RFC3339, *qo.CreationTimestamp); err == nil {
+			created = metav1.Time{Time: parsed}
+		}
+	}
+
+	priority := 0
+	if qo.Priority != nil {
+		priority = *qo.Priority
+	}
+
+	return &QueueAttributes{
+		UID:                qo.UID,
+		Name:               qo.Name,
+		ParentQueue:        qo.ParentQueue,
+		ChildQueues:        qo.ChildQueues,
+		Priority:           priority,
+		CreationTimestamp:  created,
+		QueueResourceShare: qo.ResourceShare.ResourceShare(),
+	}
+}