apache · janiussyafiq · Jul 3, 2026
diff --git a/apisix/plugins/ai-cache.lua b/apisix/plugins/ai-cache.lua
@@ -22,6 +22,7 @@ local binding    = require("apisix.plugins.ai-protocols.binding")
 local redis_util = require("apisix.utils.redis")
 local semantic   = require("apisix.plugins.ai-cache.semantic")
 local stream     = require("apisix.plugins.ai-cache.stream")
+local exporter   = require("apisix.plugins.prometheus.exporter")
 
 local ngx        = ngx
 local ngx_null   = ngx.null
@@ -132,6 +133,7 @@ end
 local function serve_hit(conf, ctx, cached, similarity)
     local status = "HIT"
     ctx.ai_cache_status = status
+    ctx.ai_cache_hit_layer = similarity and "semantic" or "exact"
     if conf.cache_headers ~= false then
         core.response.set_header(CACHE_STATUS_HEADER, status)
         local age = ngx.time() - (cached.created_at or ngx.time())
@@ -313,6 +315,12 @@ end
 
 
 function _M.log(conf, ctx)
+    if ctx.ai_cache_status then
+        exporter.inc_ai_cache_status(ctx, ctx.ai_cache_status, ctx.ai_cache_hit_layer)
+    end
+    if ctx.ai_cache_embedding_latency then
+        exporter.observe_ai_cache_embedding_latency(ctx, ctx.ai_cache_embedding_latency)
+    end
     if ctx.ai_cache_status ~= "MISS" or not ctx.ai_cache_fingerprint then
         return
     end

diff --git a/apisix/plugins/ai-cache/semantic.lua b/apisix/plugins/ai-cache/semantic.lua
@@ -26,6 +26,7 @@ local type   = type
 local next   = next
 local concat = table.concat
 local tostring = tostring
+local ngx_now = ngx.now
 
 -- Pre-require both drivers so a misconfigured provider name cannot escape
 -- lookup()'s fail-open boundary via a request-time require() raise.
@@ -262,7 +263,9 @@ function _M.embed_query(conf, ctx, body)
         return nil
     end
 
+    local started = ngx_now()
     local vec, err = embed(conf, text)
+    ctx.ai_cache_embedding_latency = (ngx_now() - started) * 1000
     if not vec then
         core.log.warn("ai-cache: embedding failed, fail-open as MISS: ", err)
         return nil

diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua
@@ -158,6 +158,14 @@ local metric_label_map = {
         "request_type", "request_llm_model", "llm_model"},
     llm_completion_tokens_dist = {"route_id", "service_id", "consumer", "node",
         "request_type", "request_llm_model", "llm_model"},
+    ai_cache_hits_total = {"layer", "route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    ai_cache_misses_total = {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    ai_cache_bypasses_total = {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
+    ai_cache_embedding_latency = {"route_id", "service_id", "consumer", "node",
+        "request_type", "request_llm_model", "llm_model"},
 }
 
 
@@ -282,6 +290,14 @@ function _M.http_init(prometheus_enabled_in_stream)
                                                             "llm_prompt_tokens_dist", "expire")
     local llm_completion_tokens_dist_exptime = core.table.try_read_attr(attr, "metrics",
                                                             "llm_completion_tokens_dist", "expire")
+    local ai_cache_hits_exptime = core.table.try_read_attr(attr, "metrics",
+                                                            "ai_cache_hits_total", "expire")
+    local ai_cache_misses_exptime = core.table.try_read_attr(attr, "metrics",
+                                                            "ai_cache_misses_total", "expire")
+    local ai_cache_bypasses_exptime = core.table.try_read_attr(attr, "metrics",
+                                                            "ai_cache_bypasses_total", "expire")
+    local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics",
+                                                            "ai_cache_embedding_latency", "expire")
 
     prometheus = base_prometheus.init("prometheus-metrics", metric_prefix)
 
@@ -395,6 +411,35 @@ function _M.http_init(prometheus_enabled_in_stream)
         llm_completion_tokens_buckets,
         llm_completion_tokens_dist_exptime)
 
+    metrics.ai_cache_hits_total = prometheus:counter("ai_cache_hits_total",
+            "Total AI cache hits served, per cache layer",
+            append_tables(metric_label_map.ai_cache_hits_total,
+                          extra_labels("ai_cache_hits_total")),
+            ai_cache_hits_exptime)
+
+    metrics.ai_cache_misses_total = prometheus:counter("ai_cache_misses_total",
+            "Total AI cache misses",
+            append_tables(metric_label_map.ai_cache_misses_total,
+                          extra_labels("ai_cache_misses_total")),
+            ai_cache_misses_exptime)
+
+    metrics.ai_cache_bypasses_total = prometheus:counter("ai_cache_bypasses_total",
+            "Total AI cache bypassed requests",
+            append_tables(metric_label_map.ai_cache_bypasses_total,
+                          extra_labels("ai_cache_bypasses_total")),
+            ai_cache_bypasses_exptime)
+
+    local ai_cache_embedding_latency_buckets = DEFAULT_BUCKETS
+    if attr and attr.ai_cache_embedding_latency_buckets then
+        ai_cache_embedding_latency_buckets = attr.ai_cache_embedding_latency_buckets
+    end
+    metrics.ai_cache_embedding_latency = prometheus:histogram("ai_cache_embedding_latency",
+            "Latency of AI cache embedding calls in milliseconds",
+            append_tables(metric_label_map.ai_cache_embedding_latency,
+                          extra_labels("ai_cache_embedding_latency")),
+            ai_cache_embedding_latency_buckets,
+            ai_cache_embedding_latency_exptime)
+
     if prometheus_enabled_in_stream then
         init_stream_metrics()
     end
@@ -974,6 +1019,80 @@ function _M.dec_llm_active_connections(ctx)
     inc_llm_active_connections(ctx, -1)
 end
 
+
+local AI_CACHE_STATUS_METRICS = {  -- ai-cache status -> name of the counter to increment
+    HIT    = "ai_cache_hits_total",      -- the only one whose label list starts with "layer"
+    MISS   = "ai_cache_misses_total",
+    BYPASS = "ai_cache_bypasses_total",
+}
+
+-- Count one ai-cache outcome: increments the counter mapped to `status` (HIT/MISS/BYPASS).
+function _M.inc_ai_cache_status(ctx, status, layer)
+    local name = AI_CACHE_STATUS_METRICS[status]
+    if not name or not metrics or not metrics[name] then
+        return  -- unknown status, or that counter is not available: nothing to count
+    end
+
+    local vars = ctx.var
+
+    local route_id = ""
+    local balancer_ip = ctx.balancer_ip or ""  -- passed in the "node" label position
+    local service_id = ""
+    local consumer_name = ctx.consumer_name or ""
+
+    local matched_route = ctx.matched_route and ctx.matched_route.value
+    if matched_route then
+        route_id = matched_route.id
+        service_id = matched_route.service_id or ""
+    end
+
+    local disabled_label_metric_map = get_disabled_label_metric_map()
+
+    if status == "HIT" then  -- hits pass one extra leading value: `layer` ("exact" if nil)
+        metrics[name]:inc(1,
+            get_enabled_label_values_for_metric(name, disabled_label_metric_map,
+                layer or "exact", route_id, service_id, consumer_name, balancer_ip,
+                vars.request_type, vars.request_llm_model, vars.llm_model,
+                unpack(extra_labels(name, ctx))))
+    else  -- MISS / BYPASS: same values, without `layer`
+        metrics[name]:inc(1,
+            get_enabled_label_values_for_metric(name, disabled_label_metric_map,
+                route_id, service_id, consumer_name, balancer_ip,
+                vars.request_type, vars.request_llm_model, vars.llm_model,
+                unpack(extra_labels(name, ctx))))
+    end
+end
+
+-- Record `latency` (milliseconds) as one sample of the ai_cache_embedding_latency histogram.
+function _M.observe_ai_cache_embedding_latency(ctx, latency)
+    if not metrics or not metrics.ai_cache_embedding_latency then
+        return  -- histogram is not available: nothing to record
+    end
+
+    local vars = ctx.var
+
+    local route_id = ""
+    local balancer_ip = ctx.balancer_ip or ""  -- passed in the "node" label position
+    local service_id = ""
+    local consumer_name = ctx.consumer_name or ""
+
+    local matched_route = ctx.matched_route and ctx.matched_route.value
+    if matched_route then
+        route_id = matched_route.id
+        service_id = matched_route.service_id or ""
+    end
+
+    local disabled_label_metric_map = get_disabled_label_metric_map()
+
+    metrics.ai_cache_embedding_latency:observe(latency,
+        get_enabled_label_values_for_metric("ai_cache_embedding_latency",
+            disabled_label_metric_map,
+            route_id, service_id, consumer_name, balancer_ip,
+            vars.request_type, vars.request_llm_model, vars.llm_model,
+            unpack(extra_labels("ai_cache_embedding_latency", ctx))))
+end
+
+
 function _M.get_prometheus()
     return prometheus
 end

diff --git a/docs/en/latest/plugins/prometheus.md b/docs/en/latest/plugins/prometheus.md
@@ -100,7 +100,7 @@ You can configure the Plugin through its [Plugin Metadata](../terminology/plugin
 
 | Name            | Type   | Required | Description                                                                                                                                                                                                                                                                                |
 | --------------- | ------ | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| disabled_labels | object | False    | Per-metric map of built-in label names whose values are collapsed to an empty string `""` to reduce metric cardinality. Keyed by metric name: `http_status`, `http_latency`, `bandwidth`, `llm_latency`, `llm_prompt_tokens`, `llm_completion_tokens`, `llm_active_connections`, `llm_prompt_tokens_dist`, `llm_completion_tokens_dist`. Structural labels that define a metric's identity (`code` on `http_status`, `type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled. |
+| disabled_labels | object | False    | Per-metric map of built-in label names whose values are collapsed to an empty string `""` to reduce metric cardinality. Keyed by metric name: `http_status`, `http_latency`, `bandwidth`, `llm_latency`, `llm_prompt_tokens`, `llm_completion_tokens`, `llm_active_connections`, `llm_prompt_tokens_dist`, `llm_completion_tokens_dist`, `ai_cache_hits_total`, `ai_cache_misses_total`, `ai_cache_bypasses_total`, `ai_cache_embedding_latency`. Structural labels that define a metric's identity (`code` on `http_status`, `type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled. |
 
 Collapsing a label's value to `""` keeps the label registered in the metric schema, so existing dashboards, `absent()` alerts, and recording rules keep working — only the high-cardinality time series that differ solely by those labels are collapsed into one. This is useful in dynamic environments such as Kubernetes autoscaling, where the upstream node IP (`node` label) churns rapidly and would otherwise overflow the `prometheus-metrics` shared dict.
 
@@ -249,6 +249,63 @@ The `type` label distinguishes the kind of latency, similar to `apisix_http_late
 | request_type       | traditional_http / ai_chat / ai_stream                                                                                          |
 | llm_model       | For non-traditional_http requests, name of the llm_model                                                                                          |
 
+### Labels for `apisix_ai_cache_hits_total`
+
+`apisix_ai_cache_hits_total` counts requests served from the [`ai-cache`](./ai-cache.md) Plugin's cache, per serving layer.
+
+| Name | Description                                                                                                                   |
+| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| layer      | Cache layer that served the hit: `exact` or `semantic`.                                                                                 |
+| route_id      | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route.                         |
+| service_id    | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
+| consumer   | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request.                       |
+| node       | IP address of the upstream node. Empty for requests served from the cache, which never reach the upstream.                                                                                          |
+| request_type       | traditional_http / ai_chat / ai_stream                                                                                          |
+| request_llm_model       | Model name requested by the client.                                                                                          |
+| llm_model       | Model name reported by the LLM response. Empty for requests served from the cache, which never reach the upstream.                                                                                          |
+
+### Labels for `apisix_ai_cache_misses_total`
+
+`apisix_ai_cache_misses_total` counts requests the [`ai-cache`](./ai-cache.md) Plugin looked up but could not serve from the cache.
+
+| Name | Description                                                                                                                   |
+| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| route_id      | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route.                         |
+| service_id    | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
+| consumer   | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request.                       |
+| node       | IP address of the upstream node. Empty if the request never reached an upstream node.                                                                                          |
+| request_type       | traditional_http / ai_chat / ai_stream                                                                                          |
+| request_llm_model       | Model name requested by the client.                                                                                          |
+| llm_model       | Model name reported by the LLM response. Empty if the request never reached the upstream LLM.                                                                                          |
+
+### Labels for `apisix_ai_cache_bypasses_total`
+
+`apisix_ai_cache_bypasses_total` counts requests that bypassed the [`ai-cache`](./ai-cache.md) Plugin's cache lookup entirely.
+
+| Name | Description                                                                                                                   |
+| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| route_id      | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route.                         |
+| service_id    | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
+| consumer   | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request.                       |
+| node       | IP address of the upstream node. Empty if the request never reached an upstream node.                                                                                          |
+| request_type       | traditional_http / ai_chat / ai_stream                                                                                          |
+| request_llm_model       | Model name requested by the client.                                                                                          |
+| llm_model       | Model name reported by the LLM response. Empty if the request never reached the upstream LLM.                                                                                          |
+
+### Labels for `apisix_ai_cache_embedding_latency`
+
+`apisix_ai_cache_embedding_latency` is a histogram of the latency, in milliseconds, of the embedding calls made by the [`ai-cache`](./ai-cache.md) Plugin's semantic layer, measured around the embedding provider round-trip for successful and failed calls alike.
+
+| Name | Description                                                                                                                   |
+| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| route_id      | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route.                         |
+| service_id    | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
+| consumer   | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request.                       |
+| node       | IP address of the upstream node. Empty for requests served from the cache, which never reach the upstream.                                                                                          |
+| request_type       | traditional_http / ai_chat / ai_stream                                                                                          |
+| request_llm_model       | Model name requested by the client.                                                                                          |
+| llm_model       | Model name reported by the LLM response. Empty for requests served from the cache, which never reach the upstream.                                                                                          |
+
 ### Labels for `apisix_http_latency`
 
 The following labels are used to differentiate `apisix_http_latency` metrics.