Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions apisix/plugins/ai-cache.lua
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ local binding = require("apisix.plugins.ai-protocols.binding")
local redis_util = require("apisix.utils.redis")
local semantic = require("apisix.plugins.ai-cache.semantic")
local stream = require("apisix.plugins.ai-cache.stream")
local exporter = require("apisix.plugins.prometheus.exporter")

local ngx = ngx
local ngx_null = ngx.null
Expand Down Expand Up @@ -132,6 +133,7 @@ end
local function serve_hit(conf, ctx, cached, similarity)
local status = "HIT"
ctx.ai_cache_status = status
ctx.ai_cache_hit_layer = similarity and "semantic" or "exact"
if conf.cache_headers ~= false then
core.response.set_header(CACHE_STATUS_HEADER, status)
local age = ngx.time() - (cached.created_at or ngx.time())
Expand Down Expand Up @@ -313,6 +315,12 @@ end


function _M.log(conf, ctx)
if ctx.ai_cache_status then
exporter.inc_ai_cache_status(ctx, ctx.ai_cache_status, ctx.ai_cache_hit_layer)
end
if ctx.ai_cache_embedding_latency then
exporter.observe_ai_cache_embedding_latency(ctx, ctx.ai_cache_embedding_latency)
end
if ctx.ai_cache_status ~= "MISS" or not ctx.ai_cache_fingerprint then
return
end
Expand Down
3 changes: 3 additions & 0 deletions apisix/plugins/ai-cache/semantic.lua
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ local type = type
local next = next
local concat = table.concat
local tostring = tostring
local ngx_now = ngx.now

-- Pre-require both drivers so a misconfigured provider name cannot escape
-- lookup()'s fail-open boundary via a request-time require() raise.
Expand Down Expand Up @@ -262,7 +263,9 @@ function _M.embed_query(conf, ctx, body)
return nil
end

local started = ngx_now()
local vec, err = embed(conf, text)
ctx.ai_cache_embedding_latency = (ngx_now() - started) * 1000
if not vec then
core.log.warn("ai-cache: embedding failed, fail-open as MISS: ", err)
return nil
Expand Down
119 changes: 119 additions & 0 deletions apisix/plugins/prometheus/exporter.lua
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ local metric_label_map = {
"request_type", "request_llm_model", "llm_model"},
llm_completion_tokens_dist = {"route_id", "service_id", "consumer", "node",
"request_type", "request_llm_model", "llm_model"},
ai_cache_hits_total = {"layer", "route_id", "service_id", "consumer", "node",
"request_type", "request_llm_model", "llm_model"},
ai_cache_misses_total = {"route_id", "service_id", "consumer", "node",
"request_type", "request_llm_model", "llm_model"},
ai_cache_bypasses_total = {"route_id", "service_id", "consumer", "node",
"request_type", "request_llm_model", "llm_model"},
ai_cache_embedding_latency = {"route_id", "service_id", "consumer", "node",
"request_type", "request_llm_model", "llm_model"},
}


Expand Down Expand Up @@ -282,6 +290,14 @@ function _M.http_init(prometheus_enabled_in_stream)
"llm_prompt_tokens_dist", "expire")
local llm_completion_tokens_dist_exptime = core.table.try_read_attr(attr, "metrics",
"llm_completion_tokens_dist", "expire")
local ai_cache_hits_exptime = core.table.try_read_attr(attr, "metrics",
"ai_cache_hits_total", "expire")
local ai_cache_misses_exptime = core.table.try_read_attr(attr, "metrics",
"ai_cache_misses_total", "expire")
local ai_cache_bypasses_exptime = core.table.try_read_attr(attr, "metrics",
"ai_cache_bypasses_total", "expire")
local ai_cache_embedding_latency_exptime = core.table.try_read_attr(attr, "metrics",
"ai_cache_embedding_latency", "expire")

prometheus = base_prometheus.init("prometheus-metrics", metric_prefix)

Expand Down Expand Up @@ -395,6 +411,35 @@ function _M.http_init(prometheus_enabled_in_stream)
llm_completion_tokens_buckets,
llm_completion_tokens_dist_exptime)

metrics.ai_cache_hits_total = prometheus:counter("ai_cache_hits_total",
"Total AI cache hits served, per cache layer",
append_tables(metric_label_map.ai_cache_hits_total,
extra_labels("ai_cache_hits_total")),
ai_cache_hits_exptime)

metrics.ai_cache_misses_total = prometheus:counter("ai_cache_misses_total",
"Total AI cache misses",
append_tables(metric_label_map.ai_cache_misses_total,
extra_labels("ai_cache_misses_total")),
ai_cache_misses_exptime)

metrics.ai_cache_bypasses_total = prometheus:counter("ai_cache_bypasses_total",
"Total AI cache bypassed requests",
append_tables(metric_label_map.ai_cache_bypasses_total,
extra_labels("ai_cache_bypasses_total")),
ai_cache_bypasses_exptime)

local ai_cache_embedding_latency_buckets = DEFAULT_BUCKETS
if attr and attr.ai_cache_embedding_latency_buckets then
ai_cache_embedding_latency_buckets = attr.ai_cache_embedding_latency_buckets
end
metrics.ai_cache_embedding_latency = prometheus:histogram("ai_cache_embedding_latency",
"Latency of AI cache embedding calls in milliseconds",
append_tables(metric_label_map.ai_cache_embedding_latency,
extra_labels("ai_cache_embedding_latency")),
ai_cache_embedding_latency_buckets,
ai_cache_embedding_latency_exptime)

if prometheus_enabled_in_stream then
init_stream_metrics()
end
Expand Down Expand Up @@ -974,6 +1019,80 @@ function _M.dec_llm_active_connections(ctx)
inc_llm_active_connections(ctx, -1)
end


-- Lookup from an ai-cache status string to the name of the counter that
-- tracks it. Statuses without an entry here are not counted.
local AI_CACHE_STATUS_METRICS = {
    ["HIT"]    = "ai_cache_hits_total",
    ["MISS"]   = "ai_cache_misses_total",
    ["BYPASS"] = "ai_cache_bypasses_total",
}


--- Bump the ai-cache counter that corresponds to a cache status.
--
-- Returns without doing anything when `status` has no entry in
-- AI_CACHE_STATUS_METRICS, or when the matching counter is absent from
-- `metrics`.
--
-- @param ctx    request context; `var`, `balancer_ip`, `consumer_name` and
--               `matched_route` are read from it
-- @param status cache status string looked up in AI_CACHE_STATUS_METRICS
-- @param layer  value for the "layer" label, used only when status is "HIT";
--               "exact" is substituted when it is nil or false
function _M.inc_ai_cache_status(ctx, status, layer)
    local metric_name = AI_CACHE_STATUS_METRICS[status]
    if not metric_name then
        return
    end
    if not metrics or not metrics[metric_name] then
        return
    end

    local req_vars = ctx.var

    local route_id, service_id = "", ""
    local node_ip = ctx.balancer_ip or ""
    local consumer = ctx.consumer_name or ""

    local route = ctx.matched_route and ctx.matched_route.value
    if route then
        route_id = route.id
        service_id = route.service_id or ""
    end

    local disabled_labels = get_disabled_label_metric_map()

    if status ~= "HIT" then
        -- miss / bypass counters have no "layer" label
        metrics[metric_name]:inc(1,
            get_enabled_label_values_for_metric(metric_name, disabled_labels,
                route_id, service_id, consumer, node_ip,
                req_vars.request_type, req_vars.request_llm_model,
                req_vars.llm_model,
                unpack(extra_labels(metric_name, ctx))))
        return
    end

    -- the hit counter takes the serving layer as its first label value
    metrics[metric_name]:inc(1,
        get_enabled_label_values_for_metric(metric_name, disabled_labels,
            layer or "exact", route_id, service_id, consumer, node_ip,
            req_vars.request_type, req_vars.request_llm_model,
            req_vars.llm_model,
            unpack(extra_labels(metric_name, ctx))))
end


--- Record one observation in the ai_cache_embedding_latency histogram.
--
-- Returns without doing anything when the histogram is absent from `metrics`.
--
-- @param ctx     request context; `var`, `balancer_ip`, `consumer_name` and
--                `matched_route` are read from it
-- @param latency value passed to the histogram's observe() call
--                (the metric's help text describes it as milliseconds)
function _M.observe_ai_cache_embedding_latency(ctx, latency)
    local metric_name = "ai_cache_embedding_latency"
    if not metrics then
        return
    end
    if not metrics[metric_name] then
        return
    end

    local req_vars = ctx.var

    local route_id, service_id = "", ""
    local node_ip = ctx.balancer_ip or ""
    local consumer = ctx.consumer_name or ""

    local route = ctx.matched_route and ctx.matched_route.value
    if route then
        route_id = route.id
        service_id = route.service_id or ""
    end

    local disabled_labels = get_disabled_label_metric_map()

    metrics[metric_name]:observe(latency,
        get_enabled_label_values_for_metric(metric_name, disabled_labels,
            route_id, service_id, consumer, node_ip,
            req_vars.request_type, req_vars.request_llm_model,
            req_vars.llm_model,
            unpack(extra_labels(metric_name, ctx))))
end


--- Return the module-level `prometheus` value.
-- It is assigned from base_prometheus.init() in http_init; what it holds
-- before that call is not visible from this section.
function _M.get_prometheus()
return prometheus
end
Expand Down
59 changes: 58 additions & 1 deletion docs/en/latest/plugins/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ You can configure the Plugin through its [Plugin Metadata](../terminology/plugin

| Name | Type | Required | Description |
| --------------- | ------ | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| disabled_labels | object | False | Per-metric map of built-in label names whose values are collapsed to an empty string `""` to reduce metric cardinality. Keyed by metric name: `http_status`, `http_latency`, `bandwidth`, `llm_latency`, `llm_prompt_tokens`, `llm_completion_tokens`, `llm_active_connections`, `llm_prompt_tokens_dist`, `llm_completion_tokens_dist`. Structural labels that define a metric's identity (`code` on `http_status`, `type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled. |
| disabled_labels | object | False | Per-metric map of built-in label names whose values are collapsed to an empty string `""` to reduce metric cardinality. Keyed by metric name: `http_status`, `http_latency`, `bandwidth`, `llm_latency`, `llm_prompt_tokens`, `llm_completion_tokens`, `llm_active_connections`, `llm_prompt_tokens_dist`, `llm_completion_tokens_dist`, `ai_cache_hits_total`, `ai_cache_misses_total`, `ai_cache_bypasses_total`, `ai_cache_embedding_latency`. Structural labels that define a metric's identity (`code` on `http_status`, `type` on `http_latency`, `bandwidth` and `llm_latency`) cannot be disabled. |

Collapsing a label's value to `""` keeps the label registered in the metric schema, so existing dashboards, `absent()` alerts, and recording rules keep working — only the high-cardinality time series that differ solely by those labels are collapsed into one. This is useful in dynamic environments such as Kubernetes autoscaling, where the upstream node IP (`node` label) churns rapidly and would otherwise overflow the `prometheus-metrics` shared dict.

Expand Down Expand Up @@ -249,6 +249,63 @@ The `type` label distinguishes the kind of latency, similar to `apisix_http_late
| request_type | traditional_http / ai_chat / ai_stream |
| llm_model | For non-traditional_http requests, name of the llm_model |

### Labels for `apisix_ai_cache_hits_total`

`apisix_ai_cache_hits_total` counts requests served from the [`ai-cache`](./ai-cache.md) Plugin's cache, per serving layer.

| Name | Description |
| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
| layer | Cache layer that served the hit: `exact` or `semantic`. |
| route_id | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route. |
| service_id | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
| consumer | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request. |
| node | IP address of the upstream node. Empty for requests served from the cache, which never reach the upstream. |
| request_type | traditional_http / ai_chat / ai_stream |
| request_llm_model | Model name requested by the client. |
| llm_model | Model name reported by the LLM response. Empty for requests served from the cache, which never reach the upstream. |

### Labels for `apisix_ai_cache_misses_total`

`apisix_ai_cache_misses_total` counts requests the [`ai-cache`](./ai-cache.md) Plugin looked up but could not serve from the cache.

| Name | Description |
| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
| route_id | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route. |
| service_id | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
| consumer | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request. |
| node | IP address of the upstream node. Empty if the request did not reach an upstream node. |
| request_type | traditional_http / ai_chat / ai_stream |
| request_llm_model | Model name requested by the client. |
| llm_model | Model name reported by the LLM response. Empty if no model name is available from the response. |

### Labels for `apisix_ai_cache_bypasses_total`

`apisix_ai_cache_bypasses_total` counts requests that bypassed the [`ai-cache`](./ai-cache.md) Plugin's cache lookup entirely.

| Name | Description |
| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
| route_id | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route. |
| service_id | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
| consumer | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request. |
| node | IP address of the upstream node. Empty if the request did not reach an upstream node. |
| request_type | traditional_http / ai_chat / ai_stream |
| request_llm_model | Model name requested by the client. |
| llm_model | Model name reported by the LLM response. Empty if no model name is available from the response. |

### Labels for `apisix_ai_cache_embedding_latency`

`apisix_ai_cache_embedding_latency` is a histogram of the latency, in milliseconds, of the embedding calls made by the [`ai-cache`](./ai-cache.md) Plugin's semantic layer, measured around the embedding provider round-trip for successful and failed calls alike.

| Name | Description |
| ---------- | ----------------------------------------------------------------------------------------------------------------------------- |
| route_id | ID of the Route that the metric corresponds to. Default to an empty string if a request does not match any Route. |
| service_id | ID of the Service that the matched Route belongs to. Default to an empty string if the matched Route does not belong to any Service. |
| consumer | Name of the Consumer associated with a request. Default to an empty string if no Consumer is associated with the request. |
| node | IP address of the upstream node. Empty for requests served from the cache, which never reach the upstream. |
| request_type | traditional_http / ai_chat / ai_stream |
| request_llm_model | Model name requested by the client. |
| llm_model | Model name reported by the LLM response. Empty for requests served from the cache, which never reach the upstream. |

### Labels for `apisix_http_latency`

The following labels are used to differentiate `apisix_http_latency` metrics.
Expand Down
Loading
Loading