Skip to content

Commit 1ac9df6

Browse files
committed
Merge remote-tracking branch 'lmcafee/lmcafee/dedupe-engine-coordinator' into tde/rl_4_out_of_4
2 parents b0e8bba + f3cf7b5 commit 1ac9df6

File tree

7 files changed

+229
-200
lines changed

7 files changed

+229
-200
lines changed

examples/inference/gpt/gpt_dynamic_inference_12b.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
3333
: ${CUDA_GRAPH_SHARE_IO_BUFFERS=1}
3434

3535
# Miscellaneous.
36+
: ${USE_COORDINATOR=0}
3637
: ${ENGINE=dynamic}
3738
: ${EXTRA_ARGS=""}
3839
# NSIGHT_PREFIX=/path/to/nsight/profile
@@ -85,7 +86,7 @@ ARGS=" \
8586
"
8687

8788
# Cuda graphs.
88-
if [ "${CUDA_GRAPH_IMPL}" = "local" ]; then
89+
if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then
8990
ARGS+=" \
9091
--cuda-graph-impl local \
9192
--inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \
@@ -108,7 +109,12 @@ else
108109
fi
109110

110111
# Command.
111-
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
112+
if [[ "${USE_COORDINATOR}" == "0" ]]; then
113+
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
114+
else
115+
CMD="python -um examples.inference.gpt.gpt_${ENGINE}_inference_with_coordinator ${ARGS}"
116+
fi
117+
112118
if [[ -v NSIGHT_PREFIX ]]; then
113119
CMD="nsys profile -s none -t nvtx,cuda --cudabacktrace=all --cuda-graph-trace=node --python-backtrace=cuda --wait all -o ${NSIGHT_PREFIX} --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop ${CMD}"
114120
fi

examples/inference/gpt/gpt_dynamic_inference_357m.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
3434
: ${CUDA_GRAPH_SHARE_IO_BUFFERS=1}
3535

3636
# Miscellaneous.
37+
: ${USE_COORDINATOR=0}
3738
: ${ENGINE=dynamic}
3839
: ${EXTRA_ARGS=""}
3940
# NSIGHT_PREFIX=/path/to/nsight/profile
@@ -71,7 +72,7 @@ ARGS=" \
7172
"
7273

7374
# Cuda graphs.
74-
if [ "${CUDA_GRAPH_IMPL}" = "local" ]; then
75+
if [ "${NUM_CUDA_GRAPHS}" != "0" ]; then
7576
ARGS+=" \
7677
--cuda-graph-impl local \
7778
--inference-dynamic-batching-num-cuda-graphs ${NUM_CUDA_GRAPHS} \
@@ -94,7 +95,12 @@ else
9495
fi
9596

9697
# Command.
97-
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
98+
if [[ "${USE_COORDINATOR}" == "0" ]]; then
99+
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
100+
else
101+
CMD="python -um examples.inference.gpt.gpt_${ENGINE}_inference_with_coordinator ${ARGS}"
102+
fi
103+
98104
if [[ -v NSIGHT_PREFIX ]]; then
99105
CMD="nsys profile -s none -t nvtx,cuda --cudabacktrace=all --cuda-graph-trace=node --python-backtrace=cuda --wait all -o ${NSIGHT_PREFIX} --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop ${CMD}"
100106
fi

megatron/core/inference/contexts/dynamic_context.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,10 @@ def __init__(
5959
self, request_id: Optional[int], message: Optional[str] = None, *, is_transient: bool = True
6060
):
6161
request_str = '--' if request_id is None else str(request_id)
62-
message = "" if message is None else f" | {message}"
63-
super().__init__(f"request {request_str}{message}")
62+
_message = "" if message is None else f" | {message}"
63+
super().__init__(f"request {request_str}{_message}")
64+
self.request_id = request_id
65+
self.message = message
6466
self.is_transient = is_transient
6567

6668

@@ -102,6 +104,50 @@ def __init__(self, max_request_count, active_request_count):
102104
)
103105

104106

107+
class ContextErrorFactory:
    """Factory class for serializing/deserializing context errors.

    Converts ``ContextOverflowError`` instances to and from plain dicts so
    they can be passed across process boundaries as JSON-friendly data.
    """

    @classmethod
    def serialize(cls, error: ContextOverflowError) -> dict:
        """Serialize error.

        Args:
            error (ContextOverflowError): Error.

        Returns:
            (dict) Serialized error data.
        """
        assert isinstance(error, ContextOverflowError)
        payload = {"type": type(error).__name__}
        for attr in ("request_id", "message", "is_transient"):
            payload[attr] = getattr(error, attr)
        return payload

    @classmethod
    def deserialize(cls, obj: dict) -> ContextOverflowError:
        """Deserialize error.

        Args:
            obj (dict): Serialized error data.

        Returns:
            (ContextOverflowError) Deserialized error.
        """
        registry = {
            error_cls.__name__: error_cls
            for error_cls in (
                ContextOverflowError,
                RequestOverflowError,
                TokenOverflowError,
                MaxSequenceLengthOverflowError,
                BlockOverflowError,
                ActiveRequestCountOverflowError,
            )
        }
        kwargs = dict(obj)
        target_cls = registry[kwargs.pop("type")]
        # Subclasses may define different __init__ signatures (e.g.
        # ActiveRequestCountOverflowError), so build via the base class
        # and then rebrand the instance to the concrete type.
        error = ContextOverflowError(**kwargs)
        error.__class__ = target_cls  # TODO(@lmcafee): better/safer alternative?
        return error
105151
class WarmupEngineMode(Enum):
106152
"""Enumeration for warmup engine modes used during cuda graph capture."""
107153

0 commit comments

Comments (0)