Skip to content

Commit e72c5b8

Browse files
Ignore "Instrument splice.trigger.latency.duration.seconds has exceeded the maximum allowed cardinality (1999)" (#1177)
--------- Signed-off-by: Oriol Muñoz <[email protected]>
1 parent 1535532 commit e72c5b8

File tree

2 files changed

+8
-1
lines changed

2 files changed

+8
-1
lines changed

cluster/expected/infra/expected.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1983,7 +1983,7 @@
19831983
"id": "",
19841984
"inputs": {
19851985
"description": "Logs with a severity level of warning or above",
1986-
"filter": "severity>=WARNING\nresource.type=\"k8s_container\"\nresource.labels.cluster_name=\"cn-mocknet\"\n-- Note that we ignore the validator runbook. This is because we reset it periodically, which sometimes produces noise.\nresource.labels.namespace_name=~\"sv|validator1|multi-validator|splitwell\"\n-(resource.labels.container_name=~\"participant\" AND jsonPayload.message=~\"Instrument .* has recorded multiple values for the same attributes.\")\n-- https://github.com/DACH-NY/canton-network-node/issues/10475\n-(resource.labels.container_name=\"cometbft\" AND\n ( jsonPayload.err=~\"\\Aerror adding vote\\z|\\Aalready stopped\\z|use of closed network connection\"\n OR jsonPayload._msg=~\"\\A(Stopping peer for error|Stopped accept routine, as transport is closed|Failed to write PacketMsg|Connection failed @ sendRoutine)\\z\"\n OR jsonPayload.error=\"already stopped\"\n OR textPayload=\"cp: not replacing '/cometbft/data/priv_validator_state.json'\"\n OR (jsonPayload._msg=\"Error stopping connection\" AND jsonPayload.err=\"already stopped\")\n OR jsonPayload._msg=\"Error adding peer to new bucket\"))\n-- execution context overload\n-jsonPayload.message=~\"Task runner canton-env-ec is .* overloaded\"\n-- on startup\n-textPayload=~\"Picked up JAVA_TOOL_OPTIONS:\"\n-- \\A and \\z anchor a search (=~) at beginning/end of string, respectively\n-- regex is significantly faster than OR; gcp docs themselves recommend\n-- regex-based factoring\n-resource.labels.container_name=~\"\\A(ans|wallet|scan|sv|splitwell)-web-ui\\z\"\n-- sequencer down\n-(resource.labels.namespace_name=~\"validator|splitwell\"\n AND resource.labels.container_name=~\"participant\"\n AND jsonPayload.message=~\"SEQUENCER_SUBSCRIPTION_LOST|Request failed for sequencer|Sequencer shutting down|Submission timed out|Response message for request .* timed out |periodic acknowledgement failed|Token refresh failed with Status{code=UNAVAILABLE\")\n-(resource.labels.container_name=\"postgres-exporter\" AND 
jsonPayload.msg=~\"Error loading config|Excluded databases\")\n-jsonPayload.message=~\"UnknownHostException\"\n-(resource.labels.container_name=~\"participant|mediator\" AND jsonPayload.message=~\"Late processing \\(or clock skew\\) of batch\")\n-(resource.labels.container_name=\"sequencer\" AND jsonPayload.stack_trace=~\"UnresolvedAddressException\")\n-(resource.labels.container_name=\"sequencer-pg\" AND\n (\"checkpoints are occurring too frequently\" OR \"Consider increasing the configuration parameter \\\"max_wal_size\\\".\"))\n-(resource.labels.container_name=~\"participant\" AND\n jsonPayload.message=~\"SYNC_SERVICE_ALARM.*Received a request.*where the view.*has (missing|extra) recipients|LOCAL_VERDICT_MALFORMED_PAYLOAD.*Rejected transaction due to malformed payload within views.*WrongRecipients|channel.*shutdown did not complete gracefully in allotted|LOCAL_VERDICT_FAILED_MODEL_CONFORMANCE_CHECK.*: UnvettedPackages\")\n-(resource.labels.container_name=\"mediator\" AND\n jsonPayload.message=~\"MEDIATOR_RECEIVED_MALFORMED_MESSAGE.*(Reason: (Missing root hash message for informee participants|Superfluous root hash message)|Received a (mediator|confirmation) response.*with an invalid root hash)\")\n-(jsonPayload.logger_name=~\"c.d.n.a.AdminAuthExtractor:\" AND jsonPayload.message=~\"Authorization Failed\")\n-(jsonPayload.level=\"error\" AND jsonPayload.msg=~\"/readyz\")\n-- The prometheus export server does not wait for any ongoing requests when shutting down https://github.com/prometheus/client_java/issues/938\n-jsonPayload.message=\"The Prometheus metrics HTTPServer caught an Exception while trying to send the metrics response.\"\n-- istio-proxy is spammy with warnings\n-(resource.labels.container_name=\"istio-proxy\" AND severity<ERROR)\n-resource.labels.container_name=\"postgres\"\n-(resource.labels.container_name=~\"postgres\" AND resource.labels.namespace_name=\"multi-validator\")\n-- TODO(#14570): Remove this once we have improved our sv onboarding 
logic\n-(resource.labels.container_name=\"sv-app\" AND jsonPayload.stack_trace=~\"io.grpc.StatusRuntimeException: FAILED_PRECONDITION: UNHANDLED_EXCEPTION.*SV party has not yet operated a node\")\n-- TODO(#15716): Don't just ignore this - investigate!\n-(resource.labels.container_name=\"splitwell-app\" AND jsonPayload.message=~\"Waiting for domain Domain 'global' to be connected has not completed after\")\n-- TODO(#17636): Our apps can't handle ingesting bursts of transactions after delays due to the record order publisher\n-(jsonPayload.message=~\"signalWhenIngested.* has not completed after .* milliseconds\")\n\n-- TODO(#17025): Stop ignoring these again once we have topology-aware package selection\n-(jsonPayload.\"span-name\"=\"MergeValidatorLicenseContractsTrigger\" AND (severity=WARNING OR \"has not vetted\"))\n-(jsonPayload.\"error-code\"=~\"ACS_COMMITMENT_MISMATCH\" AND jsonPayload.remote=~\"tw-cn-testnet-participant\")\n\n\n-- TODO(#19192): suppressed faulty validator warnings until timestamp\n-(resource.labels.container_name=\"participant\"\n AND resource.labels.namespace_name=\"sv-1\"\n AND jsonPayload.message=~\"ACS_COMMITMENT_MISMATCH\"\n AND jsonPayload.remote=~\"sender = PAR::tw-cn-testnet-participant-1::122051b3a160\"\n AND timestamp <= \"2025-05-14T09:00:00.000Z\")\n",
1986+
"filter": "severity>=WARNING\nresource.type=\"k8s_container\"\nresource.labels.cluster_name=\"cn-mocknet\"\n-- Note that we ignore the validator runbook. This is because we reset it periodically, which sometimes produces noise.\nresource.labels.namespace_name=~\"sv|validator1|multi-validator|splitwell\"\n-(resource.labels.container_name=~\"participant\" AND jsonPayload.message=~\"Instrument .* has recorded multiple values for the same attributes.\")\n-- https://github.com/DACH-NY/canton-network-node/issues/10475\n-(resource.labels.container_name=\"cometbft\" AND\n ( jsonPayload.err=~\"\\Aerror adding vote\\z|\\Aalready stopped\\z|use of closed network connection\"\n OR jsonPayload._msg=~\"\\A(Stopping peer for error|Stopped accept routine, as transport is closed|Failed to write PacketMsg|Connection failed @ sendRoutine)\\z\"\n OR jsonPayload.error=\"already stopped\"\n OR textPayload=\"cp: not replacing '/cometbft/data/priv_validator_state.json'\"\n OR (jsonPayload._msg=\"Error stopping connection\" AND jsonPayload.err=\"already stopped\")\n OR jsonPayload._msg=\"Error adding peer to new bucket\"))\n-- execution context overload\n-jsonPayload.message=~\"Task runner canton-env-ec is .* overloaded\"\n-- on startup\n-textPayload=~\"Picked up JAVA_TOOL_OPTIONS:\"\n-- \\A and \\z anchor a search (=~) at beginning/end of string, respectively\n-- regex is significantly faster than OR; gcp docs themselves recommend\n-- regex-based factoring\n-resource.labels.container_name=~\"\\A(ans|wallet|scan|sv|splitwell)-web-ui\\z\"\n-- sequencer down\n-(resource.labels.namespace_name=~\"validator|splitwell\"\n AND resource.labels.container_name=~\"participant\"\n AND jsonPayload.message=~\"SEQUENCER_SUBSCRIPTION_LOST|Request failed for sequencer|Sequencer shutting down|Submission timed out|Response message for request .* timed out |periodic acknowledgement failed|Token refresh failed with Status{code=UNAVAILABLE\")\n-(resource.labels.container_name=\"postgres-exporter\" AND 
jsonPayload.msg=~\"Error loading config|Excluded databases\")\n-jsonPayload.message=~\"UnknownHostException\"\n-(resource.labels.container_name=~\"participant|mediator\" AND jsonPayload.message=~\"Late processing \\(or clock skew\\) of batch\")\n-(resource.labels.container_name=\"sequencer\" AND jsonPayload.stack_trace=~\"UnresolvedAddressException\")\n-(resource.labels.container_name=\"sequencer-pg\" AND\n (\"checkpoints are occurring too frequently\" OR \"Consider increasing the configuration parameter \\\"max_wal_size\\\".\"))\n-(resource.labels.container_name=~\"participant\" AND\n jsonPayload.message=~\"SYNC_SERVICE_ALARM.*Received a request.*where the view.*has (missing|extra) recipients|LOCAL_VERDICT_MALFORMED_PAYLOAD.*Rejected transaction due to malformed payload within views.*WrongRecipients|channel.*shutdown did not complete gracefully in allotted|LOCAL_VERDICT_FAILED_MODEL_CONFORMANCE_CHECK.*: UnvettedPackages\")\n-(resource.labels.container_name=\"mediator\" AND\n jsonPayload.message=~\"MEDIATOR_RECEIVED_MALFORMED_MESSAGE.*(Reason: (Missing root hash message for informee participants|Superfluous root hash message)|Received a (mediator|confirmation) response.*with an invalid root hash)\")\n-(jsonPayload.logger_name=~\"c.d.n.a.AdminAuthExtractor:\" AND jsonPayload.message=~\"Authorization Failed\")\n-(jsonPayload.level=\"error\" AND jsonPayload.msg=~\"/readyz\")\n-- The prometheus export server does not wait for any ongoing requests when shutting down https://github.com/prometheus/client_java/issues/938\n-jsonPayload.message=\"The Prometheus metrics HTTPServer caught an Exception while trying to send the metrics response.\"\n-- istio-proxy is spammy with warnings\n-(resource.labels.container_name=\"istio-proxy\" AND severity<ERROR)\n-resource.labels.container_name=\"postgres\"\n-(resource.labels.container_name=~\"postgres\" AND resource.labels.namespace_name=\"multi-validator\")\n-- TODO(#14570): Remove this once we have improved our sv onboarding 
logic\n-(resource.labels.container_name=\"sv-app\" AND jsonPayload.stack_trace=~\"io.grpc.StatusRuntimeException: FAILED_PRECONDITION: UNHANDLED_EXCEPTION.*SV party has not yet operated a node\")\n-- TODO(#15716): Don't just ignore this - investigate!\n-(resource.labels.container_name=\"splitwell-app\" AND jsonPayload.message=~\"Waiting for domain Domain 'global' to be connected has not completed after\")\n-- TODO(#17636): Our apps can't handle ingesting bursts of transactions after delays due to the record order publisher\n-(jsonPayload.message=~\"signalWhenIngested.* has not completed after .* milliseconds\")\n\n-- TODO(#17025): Stop ignoring these again once we have topology-aware package selection\n-(jsonPayload.\"span-name\"=\"MergeValidatorLicenseContractsTrigger\" AND (severity=WARNING OR \"has not vetted\"))\n-(jsonPayload.\"error-code\"=~\"ACS_COMMITMENT_MISMATCH\" AND jsonPayload.remote=~\"tw-cn-testnet-participant\")\n\n\n-- TODO(#19192): suppressed faulty validator warnings until timestamp\n-(resource.labels.container_name=\"participant\"\n AND resource.labels.namespace_name=\"sv-1\"\n AND jsonPayload.message=~\"ACS_COMMITMENT_MISMATCH\"\n AND jsonPayload.remote=~\"sender = PAR::tw-cn-testnet-participant-1::122051b3a160\"\n AND timestamp <= \"2025-05-14T09:00:00.000Z\")\n\n",
19871987
"labelExtractors": {
19881988
"cluster": "EXTRACT(resource.labels.cluster_name)",
19891989
"namespace": "EXTRACT(resource.labels.namespace_name)"

cluster/pulumi/infra/src/gcpAlerts.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,13 @@ ${conditionalString(
124124
AND jsonPayload.remote=~"sender = PAR::tw-cn-testnet-participant-1::122051b3a160"
125125
AND timestamp <= "2025-05-14T09:00:00.000Z")
126126
`
127+
)}
128+
${conditionalString(
129+
// Making this condition more complicated causes GCP to fail to parse the query because there are too many filters.
130+
isDevNet,
131+
`-- TODO(hyperledger-labs/splice#447): remove this once configured cardinality is respected
132+
-(jsonPayload.message="Instrument splice.trigger.latency.duration.seconds has exceeded the maximum allowed cardinality (1999).")
133+
`
127134
)}`,
128135
labelExtractors: {
129136
cluster: 'EXTRACT(resource.labels.cluster_name)',

0 commit comments

Comments
 (0)