Skip to content

Commit c009bde

Browse files
authored
[CLI] Report KV cache memory usage in mlc_llm compile (#3221)
This PR prints out the memory usage of the KV cache: the size in MB of one token's KV cache, and the total MB for model weights + intermediate buffers + a 4K-long KV cache. If the required fields are not present in `config` and `metadata` (e.g. for an old model), nothing is printed. Sample output in CLI: ``` [2025-05-03 21:44:24] INFO model_metadata.py:94: Total memory usage without KV cache: 2254.16 MB (Parameters: 923.16 MB. Temporary buffer: 1331.00 MB) [2025-05-03 21:44:24] INFO model_metadata.py:128: KV cache size: 0.11 MB per token in the context window [2025-05-03 21:44:24] INFO model_metadata.py:133: Total memory usage with a 4K KV cache: 2702.16 MB ```
1 parent 06bdbc2 commit c009bde

File tree

1 file changed

+38
-1
lines changed

1 file changed

+38
-1
lines changed

python/mlc_llm/cli/model_metadata.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,49 @@ def _report_memory_usage(metadata: Dict[str, Any], config: Union[Dict, ConfigBas
9393
total_size = params_bytes + temp_func_bytes
9494
logger.info(
9595
"%s: %.2f MB (Parameters: %.2f MB. Temporary buffer: %.2f MB)",
96-
green("Total memory usage without KV cache:"),
96+
green("Total memory usage without KV cache"),
9797
total_size / 1024 / 1024,
9898
params_bytes / 1024 / 1024,
9999
temp_func_bytes / 1024 / 1024,
100100
)
101101

102+
# Compute KV cache size per token of context window.
103+
if isinstance(config, ConfigBase):
104+
config = asdict(config)
105+
if (
106+
"head_dim" in config
107+
and "num_hidden_layers" in config
108+
and "num_key_value_heads" in config
109+
and "quantization" in metadata
110+
):
111+
quantization_type = metadata["quantization"]
112+
dtype_bytes = None
113+
if "f32" in quantization_type:
114+
dtype_bytes = 4
115+
elif "bf16" in quantization_type:
116+
dtype_bytes = 2
117+
elif "f16" in quantization_type:
118+
dtype_bytes = 2
119+
# TODO: If support quantized KV in future, need to change this # pylint: disable=fixme
120+
if dtype_bytes is not None:
121+
bytes_per_token = (
122+
config["head_dim"]
123+
* config["num_hidden_layers"]
124+
* config["num_key_value_heads"]
125+
* dtype_bytes
126+
* 2 # 2 for key and value
127+
)
128+
logger.info(
129+
"%s: %.2f MB per token in the context window",
130+
green("KV cache size"),
131+
bytes_per_token / 1024 / 1024,
132+
)
133+
logger.info(
134+
"%s: %.2f MB",
135+
green("Total memory usage with a 4K KV cache"),
136+
(total_size + bytes_per_token * 4096) / 1024 / 1024,
137+
)
138+
102139
logger.info(
103140
"To reduce memory usage, "
104141
"tweak `prefill_chunk_size`, `context_window_size` and `sliding_window_size`"

0 commit comments

Comments (0)