diff --git a/Cargo.lock b/Cargo.lock index 995f3629..ef9feeee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -61,6 +61,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.6.21" @@ -205,7 +211,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -226,6 +232,16 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "atomic-wait" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a55b94919229f2c42292fd71ffa4b75e83193bffdd77b1e858cd55fd2d0b0ea8" +dependencies = [ + "libc", + "windows-sys 0.42.0", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -442,6 +458,20 @@ name = "bytemuck" version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.116", +] [[package]] name = "byteorder" @@ -564,6 +594,33 @@ dependencies = [ "stacker", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + 
+[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clap" version = "4.5.59" @@ -595,7 +652,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -784,6 +841,64 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -855,7 +970,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn", + "syn 2.0.116", ] [[package]] @@ -866,7 +981,7 @@ checksum = 
"d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ "darling_core", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -875,6 +990,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be1e0bca6c3637f992fc1cc7cbc52a78c1ef6db076dbf1059c4323d6a2048376" +[[package]] +name = "defer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "930c7171c8df9fb1782bdf9b918ed9ed2d33d1d22300abb754f9085bc48bf8e8" + [[package]] name = "der" version = "0.7.10" @@ -904,7 +1025,7 @@ checksum = "2cdc8d50f426189eef89dac62fabfa0abb27d5cc008f25bf4156a0203325becc" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -924,7 +1045,7 @@ checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", "unicode-xid", ] @@ -948,7 +1069,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -975,6 +1096,22 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "dyn-stack" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c4713e43e2886ba72b8271aa66c93d722116acf7a75555cce11dcde84388fe8" +dependencies = [ + "bytemuck", + "dyn-stack-macros", +] + +[[package]] +name = "dyn-stack-macros" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d926b4d407d372f141f93bb444696142c29d32962ccbd3531117cf3aa0bfa9" + [[package]] name = "either" version = "1.15.0" @@ -1012,6 +1149,18 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "enum-as-inner" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.116", +] + [[package]] name = "env_filter" version = "1.0.0" @@ -1032,6 +1181,61 @@ dependencies = [ "log", ] +[[package]] +name = "equator" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c35da53b5a021d2484a7cc49b2ac7f2d840f8236a286f84202369bd338d761ea" +dependencies = [ + "equator-macro 0.2.1", +] + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro 0.4.2", +] + +[[package]] +name = "equator" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02da895aab06bbebefb6b2595f6d637b18c9ff629b4cd840965bb3164e4194b0" +dependencies = [ + "equator-macro 0.6.0", +] + +[[package]] +name = "equator-macro" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.116", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.116", +] + +[[package]] +name = "equator-macro" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b14b339eb76d07f052cdbad76ca7c1310e56173a138095d3bf42a23c06ef5d8" + [[package]] name = "equivalent" version = "1.0.2" @@ -1079,6 +1283,49 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "faer" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"02d2ecfb80b6f8b0c569e36988a052e64b14d8def9d372390b014e8bf79f299a" +dependencies = [ + "bytemuck", + "dyn-stack", + "equator 0.6.0", + "faer-traits", + "gemm", + "generativity", + "libm", + "nano-gemm", + "npyz", + "num-complex", + "num-traits", + "private-gemm-x86", + "pulp", + "rand 0.9.2", + "rand_distr", + "rayon", + "reborrow", + "spindle", +] + +[[package]] +name = "faer-traits" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b87d23ed7ab1f26c0cba0e5b9e061a796fbb7dc170fa8bee6970055a1308bb0f" +dependencies = [ + "bytemuck", + "dyn-stack", + "generativity", + "libm", + "num-complex", + "num-traits", + "pulp", + "qd", + "reborrow", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -1297,7 +1544,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -1329,6 +1576,146 @@ dependencies = [ "slab", ] +[[package]] +name = "gemm" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa0673db364b12263d103b68337a68fbecc541d6f6b61ba72fe438654709eacb" +dependencies = [ + "dyn-stack", + "gemm-c32", + "gemm-c64", + "gemm-common", + "gemm-f16", + "gemm-f32", + "gemm-f64", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-c32" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "086936dbdcb99e37aad81d320f98f670e53c1e55a98bee70573e83f95beb128c" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-c64" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20c8aeeeec425959bda4d9827664029ba1501a90a0d1e6228e48bef741db3a3f" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + 
"raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-common" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88027625910cc9b1085aaaa1c4bc46bb3a36aad323452b33c25b5e4e7c8e2a3e" +dependencies = [ + "bytemuck", + "dyn-stack", + "half", + "libm", + "num-complex", + "num-traits", + "once_cell", + "paste", + "pulp", + "raw-cpuid", + "rayon", + "seq-macro", + "sysctl", +] + +[[package]] +name = "gemm-f16" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3df7a55202e6cd6739d82ae3399c8e0c7e1402859b30e4cb780e61525d9486e" +dependencies = [ + "dyn-stack", + "gemm-common", + "gemm-f32", + "half", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "rayon", + "seq-macro", +] + +[[package]] +name = "gemm-f32" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02e0b8c9da1fbec6e3e3ab2ce6bc259ef18eb5f6f0d3e4edf54b75f9fd41a81c" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "gemm-f64" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "056131e8f2a521bfab322f804ccd652520c79700d81209e9d9275bbdecaadc6a" +dependencies = [ + "dyn-stack", + "gemm-common", + "num-complex", + "num-traits", + "paste", + "raw-cpuid", + "seq-macro", +] + +[[package]] +name = "generativity" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5881e4c3c2433fe4905bb19cfd2b5d49d4248274862b68c27c33d9ba4e13f9ec" + +[[package]] +name = "generator" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + [[package]] name = "generic-array" version 
= "0.14.7" @@ -1423,6 +1810,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "bytemuck", + "cfg-if", + "crunchy", + "num-traits", + "zerocopy 0.8.39", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1480,6 +1880,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" @@ -1822,6 +2228,17 @@ version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb" +[[package]] +name = "interpol" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb58032ba748f4010d15912a1855a8a0b1ba9eaad3395b0c171c09b3b356ae50" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -1838,12 +2255,32 @@ dependencies = [ "serde", ] +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.17" @@ -2034,12 +2471,34 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "lru-slab" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "matchit" version = "0.8.4" @@ -2141,7 +2600,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -2161,6 +2620,76 @@ dependencies = [ "version_check", ] +[[package]] +name = "nano-gemm" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e04345dc84b498ff89fe0d38543d1f170da9e43a2c2bcee73a0f9069f72d081" +dependencies = [ + "equator 0.2.2", + "nano-gemm-c32", + "nano-gemm-c64", + "nano-gemm-codegen", + "nano-gemm-core", + "nano-gemm-f32", + "nano-gemm-f64", + "num-complex", +] + +[[package]] +name = "nano-gemm-c32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0775b1e2520e64deee8fc78b7732e3091fb7585017c0b0f9f4b451757bbbc562" +dependencies = [ + "nano-gemm-codegen", + "nano-gemm-core", + "num-complex", +] + +[[package]] +name = "nano-gemm-c64" +version = "0.2.1" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9af49a20d58816e6b5ee65f64142e50edb5eba152678d4bb7377fcbf63f8437a" +dependencies = [ + "nano-gemm-codegen", + "nano-gemm-core", + "num-complex", +] + +[[package]] +name = "nano-gemm-codegen" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cc8d495c791627779477a2cf5df60049f5b165342610eb0d76bee5ff5c5d74c" + +[[package]] +name = "nano-gemm-core" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d998dfa644de87a0f8660e5ea511d7cb5c33b5a2d9847b7af57a2565105089f0" + +[[package]] +name = "nano-gemm-f32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879d962e79bc8952e4ad21ca4845a21132540ed3f5e01184b2ff7f720e666523" +dependencies = [ + "nano-gemm-codegen", + "nano-gemm-core", +] + +[[package]] +name = "nano-gemm-f64" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9a513473dce7dc00c7e7c318481ca4494034e76997218d8dad51bd9f007a815" +dependencies = [ + "nano-gemm-codegen", + "nano-gemm-core", +] + [[package]] name = "native-tls" version = "0.2.16" @@ -2196,6 +2725,17 @@ dependencies = [ "memchr", ] +[[package]] +name = "npyz" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f0e759e014e630f90af745101b614f761306ddc541681e546649068e25ec1b9" +dependencies = [ + "byteorder", + "num-bigint", + "py_literal", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -2231,6 +2771,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "bytemuck", + "num-traits", + "rand 0.8.5", +] + [[package]] name = "num-conv" version = "0.2.0" @@ -2267,6 +2818,16 @@ dependencies = [ "libm", 
] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "object" version = "0.37.3" @@ -2320,7 +2881,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -2381,6 +2942,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "pbkdf2" version = "0.12.2" @@ -2413,7 +2980,7 @@ dependencies = [ "proc-macro2", "proc-macro2-diagnostics", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -2471,7 +3038,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -2539,6 +3106,34 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + [[package]] name = "png" version = 
"0.17.16" @@ -2615,7 +3210,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.116", +] + +[[package]] +name = "private-gemm-x86" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0af8c3e5087969c323f667ccb4b789fa0954f5aa650550e38e81cf9108be21b5" +dependencies = [ + "crossbeam", + "defer", + "interpol", + "num_cpus", + "raw-cpuid", + "rayon", + "spindle", + "sysctl", ] [[package]] @@ -2635,7 +3246,7 @@ checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", "version_check", "yansi", ] @@ -2650,6 +3261,54 @@ dependencies = [ "cc", ] +[[package]] +name = "pulp" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e205bb30d5b916c55e584c22201771bcf2bad9aabd5d4127f38387140c38632" +dependencies = [ + "bytemuck", + "cfg-if", + "libm", + "num-complex", + "paste", + "pulp-wasm-simd-flag", + "raw-cpuid", + "reborrow", + "version_check", +] + +[[package]] +name = "pulp-wasm-simd-flag" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40e24eee682d89fb193496edf918a7f407d30175b2e785fe057e4392dfd182e0" + +[[package]] +name = "py_literal" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "102df7a3d46db9d3891f178dcc826dc270a6746277a9ae6436f8d29fd490a8e1" +dependencies = [ + "num-bigint", + "num-complex", + "num-traits", + "pest", + "pest_derive", +] + +[[package]] +name = "qd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f1304a5aecdcfe9ee72fbba90aa37b3aa067a69d14cb7f3d9deada0be7c07c" +dependencies = [ + "bytemuck", + "libm", + "num-traits", + "pulp", +] + [[package]] name = "quickcheck" version = "1.1.0" @@ 
-2814,6 +3473,51 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" +[[package]] +name = "rand_distr" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" +dependencies = [ + "num-traits", + "rand 0.9.2", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.11.0", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "reborrow" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2849,7 +3553,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -2974,7 +3678,7 @@ checksum = "d4322a2a4e8cf30771dd9f27f7f37ca9ac8fe812dddd811096a98483080dabe6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -3198,6 +3902,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" @@ -3233,6 +3943,12 @@ version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -3280,7 +3996,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -3365,7 +4081,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -3530,6 +4246,19 @@ dependencies = [ "lock_api", ] +[[package]] +name = "spindle" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aaca3d8aa5387a6eba861fbf984af5348d9df5d940c25c6366b19556fdf64" +dependencies = [ + "atomic-wait", + "crossbeam", + "equator 0.4.2", + "loom", + "rayon", +] + [[package]] name = "spki" version = "0.7.3" @@ -3599,7 +4328,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn", + "syn 2.0.116", ] [[package]] @@ -3622,7 +4351,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn", + "syn 2.0.116", "tokio", "url", ] @@ -3800,6 +4529,17 @@ dependencies = [ "siphasher", ] +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.116" @@ -3828,7 +4568,21 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + 
"syn 2.0.116", +] + +[[package]] +name = "sysctl" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc" +dependencies = [ + "bitflags 2.11.0", + "byteorder", + "enum-as-inner", + "libc", + "thiserror 1.0.69", + "walkdir", ] [[package]] @@ -3948,7 +4702,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -3959,7 +4713,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -4047,6 +4801,16 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.10.0" @@ -4086,7 +4850,7 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -4265,6 +5029,7 @@ dependencies = [ "tokio", "toml 0.9.12+spec-1.1.0", "torrust-index-located-error", + "torrust-sentinel", "tower", "tower-http", "tracing", @@ -4283,6 +5048,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "torrust-sentinel" +version = "3.0.0-develop" +dependencies = [ + "criterion", + "faer", + "rand 0.10.0", + "serde", + "tracing", +] + [[package]] name = "tower" version = "0.5.3" @@ -4355,7 +5131,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -4395,12 +5171,16 @@ version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex-automata", "serde", "serde_json", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", "tracing-serde", @@ -4699,7 +5479,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.116", "wasm-bindgen-shared", ] @@ -4742,7 +5522,7 @@ checksum = "f579cdd0123ac74b94e1a4a72bd963cf30ebac343f2df347da0b8df24cdebed2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -4875,7 +5655,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -4886,7 +5666,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -4924,6 +5704,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-sys" version = "0.45.0" @@ -5266,7 +6061,7 @@ dependencies = [ "heck", "indexmap 2.13.0", "prettyplease", - "syn", + "syn 2.0.116", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -5282,7 +6077,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn", + "syn 2.0.116", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -5376,7 +6171,7 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", "synstructure", ] 
@@ -5407,7 +6202,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -5418,7 +6213,7 @@ checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] @@ -5438,7 +6233,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", "synstructure", ] @@ -5478,7 +6273,7 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.116", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index ea1099e3..502e41be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ opt-level = 3 [dependencies] torrust-index-located-error = { version = "3.0.0-develop", path = "packages/located-error" } +torrust-sentinel = { version = "3.0.0-develop", path = "packages/sentinel" } anyhow = "1" argon2 = "0" @@ -101,13 +102,13 @@ tempfile = "3" which = "6" [package.metadata.cargo-machete] -ignored = ["futures"] +ignored = ["futures", "torrust-sentinel"] [lints] workspace = true [workspace.lints.rust] -warnings = { level = "deny", priority = -1 } +deprecated-safe = { level = "deny", priority = -1 } future-incompatible = { level = "deny", priority = -1 } let-underscore = { level = "deny", priority = -1 } nonstandard-style = { level = "deny", priority = -1 } @@ -115,9 +116,9 @@ rust-2018-compatibility = { level = "deny", priority = -1 } rust-2018-idioms = { level = "deny", priority = -1 } rust-2021-compatibility = { level = "deny", priority = -1 } rust-2024-compatibility = { level = "warn", priority = -1 } -unused = { level = "deny", priority = -2 } -deprecated-safe = { level = "deny", priority = -1 } unsafe-code = "warn" +unused = { level = "deny", priority = -2 } +warnings = { level = "deny", priority = -1 } 
[workspace.lints.clippy] all = { level = "deny", priority = -1 } diff --git a/packages/g-v-dual-tree/concept.md b/packages/g-v-dual-tree/concept.md new file mode 100644 index 00000000..cf3aa4ca --- /dev/null +++ b/packages/g-v-dual-tree/concept.md @@ -0,0 +1,1481 @@ +# Formal Specification: Dual-Tree Value-Stratified Index + +A Kraft–McMillan Geometric-Value Graph. + +--- + +Conceptual Summary. The Geometric-Value (G-V) Tree maintains a φ‑Kraft invariant: along any root‑to‑leaf path in the value tree, intensities decay at least as fast as φ⁻ᵈ, where φ is the golden ratio. This is precisely the condition for optimal prefix codes under a (1,2)‑cost model, and it guarantees that heavy entries rise near the root for efficient proportional sampling — all while the geometric tree independently refines spatial resolution where the value tree authorizes. The max‑uncle constraint (V‑I3) is the local, checkable form of this invariant; the Fibonacci recurrence (§17) proves the equivalence. + +--- + +## 1. Overview + +This structure maintains a spatial index whose logical hierarchy adapts to value distribution. It consists of two trees sharing a common set of G-nodes. + +**The Geometric Tree (G-Tree)** is a binary tree over dyadic intervals of $[0, 2^N)$. Every materialized node stores the accumulated value for its range since creation — by default, the exact count of all observations. The G-Tree answers spatial queries. It grows and shrinks only as the V-Tree directs. + +**The Value Tree (V-Tree)** is a dynamic tree with branching factor 2 or 3, governed by the **max-uncle constraint**: no node may outweigh every one of its uncles. The V-Tree is a tournament bracket: G-nodes sit at the leaves as competitors; internal nodes are pure structural scaffolding. The V-Tree reshapes so that high-intensity entries reside near the root while low-intensity entries are consolidated deeper. + +**The V-Tree governs. 
The G-Tree obeys.** The V-Tree decides what earns spatial resolution and what loses it. The G-Tree executes immediately. Both trees reference the same G-nodes. The G-Tree owns spatial structure and accumulated values. The V-Tree owns attention structure and competitive ranking. + +**Three orthogonal concerns are separated:** + +| Concern | Mechanism | Owner | +| --------------------- | --------------------------------------------------------------------- | ------------ | +| Spatial resolution | Depth-gated catalytic splits | Architecture | +| Value-driven priority | Uncle constraint + promotion/contraction | Architecture | +| Temporal semantics | User-supplied filter on G-Tree values (exact accumulation by default) | **User** | + +The architecture provides scaffolding. The user plugs in temporal meaning. A user who wants "what matters now" applies decay. A user who wants "most significant ever" does nothing. The V-Tree adapts to whatever the numbers say. + +**Key design principles:** + +- **Catalytic creation.** When a G-node splits, it is not consumed. It persists as a V-entry carrying pre-split history, serving as the competitive benchmark its children must exceed. No information is destroyed by spatial refinement. +- **Tournament structure.** The V-Tree is a bracket. G-node entries are competitors at the leaves. Structural internal nodes are freely created and destroyed with no external consequences. +- **One-way governance.** Every structural decision flows from V-Tree to G-Tree. The G-Tree never makes structural decisions independently. +- **Max-uncle stability.** The heaviest uncle acts as a shield. Three siblings of comparable intensity coexist without violations. The V-Tree restructures only when a node outgrows its entire neighborhood. + +--- + +## 2. Domain + +Fix an integer $N \geq 1$. The domain is $[0, 2^N)$. + +The system begins with a single G-node covering the entire domain. It grows only where the V-Tree authorizes spatial refinement. 
+ +--- + +## 3. Node Types + +### 3.1 G-Node + +A G-node $g$ carries: + +| Field | Type | Description | +| -------------------------------- | --------------------- | -------------------------------------------------------- | +| $g.l,\ g.r$ | integers | Dyadic range $[l, r)$, width $r - l = 2^k$ | +| $g.\text{sum}$ | $\mathbb{R}_{\geq 0}$ | Accumulated value for this range since creation | +| $g.\text{left},\ g.\text{right}$ | G-Node or null | Children (null if terminal) | +| $g.\text{geo\_parent}$ | G-Node or null | Parent in G-Tree | +| $g.\text{entry}$ | V-Entry or null | V-Tree membership token | +| $g.\text{removed}$ | bool | Whether this G-node has been removed by eviction cleanup | + +A G-node is either: + +- **Terminal**: both children null. Observations route here and accumulate directly. +- **Internal**: both children non-null. Carries pre-split history plus children's contributions. + +### 3.2 V-Entry (V-Tree Leaf) + +Every V-Tree leaf is a G-node's membership token in the tournament. + +| Field | Type | Description | +| ---------------------- | --------------------- | ------------------------------------------------- | +| $v.\text{int}$ | $\mathbb{R}_{\geq 0}$ | Intensity accumulated since this entry's creation | +| $v.\text{val\_parent}$ | V-Structural or null | Parent in V-Tree | +| $v.\text{gnode}$ | G-Node | Backing G-node | +| $v.\text{children}$ | $\emptyset$ | Always empty — entries are always V-Tree leaves | + +### 3.3 V-Structural (V-Tree Internal) + +Pure scaffolding. No external identity. Freely created and destroyed by rebalancing. + +| Field | Type | Description | +| ---------------------- | --------------------- | ----------------------------- | +| $v.\text{int}$ | $\mathbb{R}_{\geq 0}$ | Sum of children's intensities | +| $v.\text{val\_parent}$ | V-Structural or null | Parent in V-Tree | +| $v.\text{children}$ | List of V-nodes | Length 2 or 3 | + +The V-Tree is **blind to G-Tree topology**. 
It does not distinguish internal G-nodes from terminal G-nodes — both appear as V-Tree leaf entries with intensities. + +--- + +## 4. The Geometric Tree + +### 4.1 Structure + +The G-Tree is a binary tree of variable depth over $[0, 2^N)$. + +- The root covers $[0, 2^N)$ and always exists. +- Each internal node $[l, r)$ has children $[l, m)$ and $[m, r)$ where $m = (l+r)/2$. +- Internal nodes always have exactly two materialized children. +- Terminal nodes exist at variable depths, from 0 (root is terminal) to $N$ (unit cell $[x, x{+}1)$). +- The tree grows by **catalytic splitting** (§9) and shrinks by **eviction-triggered collapse** (§11). + +**The G-Tree only adds or removes resolution. It never moves boundaries.** + +### 4.2 Accounting + +For any G-node $g$, define the **pre-split own value**: + +$$g.\text{own}\ =\ g.\text{sum} - g.\text{left}.\text{sum} - g.\text{right}.\text{sum}$$ + +For terminal nodes, $g.\text{own} = g.\text{sum}$. When $g$ splits, its children start with $\text{sum} = 0$, so $g.\text{own}$ captures the full pre-split accumulation. As observations route to children, children's sums grow, $g.\text{sum}$ grows (via propagation), but $g.\text{own}$ remains fixed — it is the historical record of what accumulated before refinement. + +### 4.3 Invariants + +$$\textbf{G-I1 (Summation):}\quad g.\text{sum} = g.\text{own} + g.\text{left}.\text{sum} + g.\text{right}.\text{sum}$$ + +At terminal nodes, $g.\text{sum} = g.\text{own}$ (children are null with implicit sum 0). + +$$\textbf{G-I2 (Binary):}\quad \text{Every internal G-node has exactly two materialized children.}$$ + +$$\textbf{G-I3 (Dyadic):}\quad \text{Every G-node covers a dyadic interval } [l, r) \text{ where } r - l = 2^k \text{ for some } k \geq 0.$$ + +### 4.4 Queries + +**Point query** for coordinate $x$: traverse from root, branching left if $x < m$, right otherwise. Terminates at the shallowest terminal node containing $x$. Cost: $O(\text{depth})$, at most $O(N)$. 
+ +``` +function point_query(g, x) → terminal G-node: + if g is terminal: return g + m ← (g.l + g.r) / 2 + if x < m: return point_query(g.left, x) + else: return point_query(g.right, x) +``` + +**Range sum** over $[a, b)$: segment-tree decomposition. Pre-split observations pro-rated at partial overlaps. + +``` +function range_sum(g, a, b) → ℝ: + if g = null: return 0 + if b ≤ g.l or g.r ≤ a: return 0 + if a ≤ g.l and g.r ≤ b: return g.sum + if g is terminal: + overlap ← min(g.r, b) − max(g.l, a) + return g.sum × overlap / (g.r − g.l) + own ← g.sum − g.left.sum − g.right.sum + overlap ← min(g.r, b) − max(g.l, a) + own_share ← own × overlap / (g.r − g.l) + return own_share + range_sum(g.left, a, b) + range_sum(g.right, a, b) +``` + +Cost: $O(N)$. Fully-contained queries aligned with node boundaries are exact. Partial overlaps of pre-split accumulation are pro-rated under a uniform-within-cell assumption. + +### 4.5 Path Compression (Implementation Note) + +Long chains of internal nodes where only one branch carries significant intensity can be compressed into single edges with explicit range annotations, reducing materialized node count from $O(N \cdot L)$ to $O(L)$. The invariants and algorithms are unaffected; only the in-memory representation changes. + +--- + +## 5. The Value Tree + +### 5.1 Structure + +The V-Tree is a dynamic tree where: + +- **Leaves** are V-Entries (G-node membership tokens). +- **Internal nodes** are V-Structural (pure scaffolding with 2 or 3 children). +- The root may be either an entry (when only one entry exists) or structural. + +The V-Tree knows nothing about spatial coordinates. It ranks entries purely by intensity. 
+ +### 5.2 Invariants + +$$\textbf{V-I1 (Summation):}\quad v.\text{int} = \sum_{c\, \in\, v.\text{children}} c.\text{int} \qquad \text{for every structural V-node, at query time}$$ + +$$\textbf{V-I2 (Branching):}\quad |\,v.\text{children}\,| \in \{2,\, 3\} \qquad \text{for every structural V-node}$$ + +**V-I3 (Max-Uncle Constraint):** For every V-node $c$ with parent $p$ and grandparent $g$: + +$$c.\text{int}\ \leq\ \max\!\bigl\{u.\text{int} : u \in \text{siblings}(p,\, g)\bigr\}$$ + +A node is in violation only when it is strictly heavier than **every** uncle. In a 2-node grandparent (one uncle), the node must beat that single uncle. In a 3-node grandparent (two uncles), a violation requires beating both. + +$$\textbf{V-I4 (Unique Backing):}\quad \text{Every V-Entry corresponds to exactly one G-node.}$$ +$$\text{Every G-node has at most one V-Entry.}$$ + +$$\textbf{V-I5 (Entry-Leaf):}\quad \text{Every V-Entry is a leaf of the V-Tree.}$$ +$$\text{Every V-Tree internal node is structural.}$$ + +V-I5 implies entries never become structural; structural nodes never become entries. The two types are permanently distinct. Rebalancing creates and destroys only structural nodes. Entries are rearranged but never transmuted. + +V-I1 is a correctness condition that must hold when values are used — at sampling time, at rebalancing time. How and when propagation maintains this between operations is an implementation concern. + +### 5.3 Uncle Constraint Semantics + +The max-uncle formulation encodes competitive dominance. When a G-node splits and its V-entry becomes uncle to its children's entries, the constraint says: **a child cannot outweigh its parent's entry without triggering promotion.** The child starts at zero, accumulates its own observations, and must earn its way up. The moment it exceeds every uncle, V-I3 fires and rebalancing promotes the child — which is exactly the moment the child region has proven more significant than the entire context it emerged from. 
+ +Three siblings of comparable intensity sit without violations indefinitely. Only when one dramatically outgrows all others does rebalancing fire. The V-Tree restructures when the competitive landscape genuinely shifts, not on every minor intensity fluctuation. + +### 5.4 Definitions + +**Uncle.** Given node $c$ with parent $p$ and grandparent $g$, any sibling of $p$ in $g$ is an uncle of $c$. + +**Violation.** Node $c$ is **in violation** if $c.\text{int} > \max\{u.\text{int} : u \in \text{siblings}(p,\, g)\}$ — it outweighs every uncle. + +**Sibling.** $\text{siblings}(c, p) = \{x \in p.\text{children} : x \neq c\}$. + +**3-node / 2-node.** A structural node $p$ is a 3-node if $|p.\text{children}| = 3$, a 2-node if $|p.\text{children}| = 2$. + +### 5.5 Proportional Sampling + +The V-Tree enables sampling an entry with probability proportional to intensity: + +``` +function sample(v) → V-Entry: + if v is entry: + if is_ghost(v): + cleanup_ghost(v) + return sample(v.val_parent) // re-route after cleanup + return v + choose child c with probability c.int / v.int + return sample(c) +``` + +The expected sampling cost is: + +$$E[\text{cost}] = \sum_{i} \frac{v_i.\text{int}}{I_{\text{total}}} \cdot d(v_i)$$ + +where $d(v_i)$ is the depth of entry $v_i$ in the V-Tree. Since V-I3 pushes high-intensity entries to shallow depth, the expected cost scales with the **entropy of the intensity distribution**, not with $\log L$. For concentrated distributions — a few dominant hotspots — sampling approaches $O(1)$. + +A V-Tree sample returns a G-node at whatever scale the system has found significant. Sampling preferentially directs attention toward deeply-refined regions that have proven sustained significance through repeated competitive promotion. + +--- + +## 6. 
Depth Gates + +$$\textbf{D-I1:}\quad \text{split}(g) \implies \text{depth}_V(g.\text{entry}) \leq D_{\text{create}}$$ + +$$\textbf{D-I2:}\quad \text{evict}(v) \iff \text{depth}_V(v) > D_{\text{evict}}$$ + +$$\textbf{D-I3:}\quad D_{\text{create}} < D_{\text{evict}} \qquad \text{(buffer zone prevents oscillation)}$$ + +The buffer zone $(D_{\text{create}},\, D_{\text{evict}}]$ is where entries live on borrowed time — too deep to create children, not yet deep enough to be evicted. They can still receive observations and promote upward. The buffer gives them time to prove themselves. + +> _Design note._ The depth gates replace a single intensity threshold as the governing mechanism for spatial resolution. A pure intensity gate is a local condition — a leaf may exceed $\theta$ yet be globally insignificant relative to the rest of the tree. The V-Tree depth is a global condition: a shallow position means the entry has outcompeted its neighborhood. The combination of a basic intensity eligibility ($g.\text{sum} > \theta$) with the depth gate ($\text{depth}_V \leq D_{\text{create}}$) ensures that only locally intense AND globally significant nodes earn spatial refinement. + +--- + +## 7. Observation Flow + +**Input:** Coordinate $x \in [0, 2^N)$, intensity increment $\Delta > 0$.
+ +``` +function observe(x, Δ): + + // Step 1: Route to terminal leaf through G-Tree + ℓ ← point_query(G_root, x) + + // Step 2: Accumulate in G-Tree (permanent) + g ← ℓ + while g ≠ null: + g.sum ← g.sum + Δ + g ← g.geo_parent + + // Step 3: Ensure terminal leaf has a V-entry + if ℓ.entry = null: + vtree_insert(ℓ) + + // Step 4: Update V-entries for all G-ancestors with entries + g ← ℓ + while g ≠ null: + if g.entry ≠ null: + g.entry.int ← g.entry.int + Δ + propagate_v_sums(g.entry) + g ← g.geo_parent + + // Step 5: Attempt split + attempt_split(ℓ) + + // Step 6: Restore V-I3 + rebalance() +``` + +**Step 3 before Step 4** ensures that a newly inserted entry (at intensity 0) receives the triggering observation's $\Delta$ during Step 4. No observation is lost. + +**Multi-level V-entry updates.** Each observation increments V-entries for every G-ancestor that holds a V-entry. A region with $k$ G-ancestors holding V-entries receives $k \times \Delta$ total V-Tree intensity per observation. This is intentional: regions with deeper spatial refinement carry proportionally more V-Tree weight, biasing attention-weighted sampling toward regions where resolution exists to exploit. + +The V-Tree's total intensity is therefore not equal to the G-Tree root's sum — it is larger by the amount of multi-level counting. This is by design: the V-Tree ranks by competitive significance, not by raw observation count. + +``` +function propagate_v_sums(v): + p ← v.val_parent + while p ≠ null: + p.int ← sum of p.children's .int + p ← p.val_parent +``` + +After Steps 2–4, G-I1 and V-I1 hold. V-I3 may be violated by the intensity injection. Step 5 may involve preprocessing that creates further local violations (§9.1). The final `rebalance()` in Step 6 restores V-I3. + +> _Eviction timing._ Eviction (D-I2) is checked after rebalancing, since rebalancing changes V-Tree depths. Any entry whose depth exceeds $D_{\text{evict}}$ after the final `rebalance()` is evicted per §11.1. + +--- + +## 8. 
V-Tree Entry Management + +### 8.1 Zero-Intensity Insertion + +**Purpose.** Add a G-node as a new V-Tree entry with intensity 0. Used when a terminal G-node receives its first observation after eviction or initialization. + +``` +function vtree_insert(g): + e ← new V-Entry(int = 0, gnode = g) + g.entry ← e + + // Case 1: empty V-Tree + if V_root = null: + V_root ← e + e.val_parent ← null + return + + // Case 2: V-root is a single entry + if V_root is entry: + s ← new V-Structural() + s.int ← V_root.int + s.children ← [V_root, e] + V_root.val_parent ← s + e.val_parent ← s + s.val_parent ← null + V_root ← s + return + + // Case 3: general — descend to lightest entry, buddy-insert + node ← V_root + while node is structural: + node ← argmin { c.int : c ∈ node.children } + buddy ← node + p ← buddy.val_parent + + s ← new V-Structural() + s.int ← buddy.int // + 0 + s.children ← [buddy, e] + buddy.val_parent ← s + e.val_parent ← s + s.val_parent ← p + + replace buddy with s in p.children +``` + +**Invariant preservation.** + +- **V-I1:** $s.\text{int} = \text{buddy}.\text{int} + 0$. Parent's sum unchanged. $\checkmark$ +- **V-I2:** $s$ has 2 children. Parent's child count unchanged. $\checkmark$ +- **V-I3:** $e.\text{int} = 0 \leq$ anything. Buddy's new uncles are its former siblings — and it was the lightest child. $\checkmark$ +- **V-I5:** $e$ is an entry (V-leaf). $s$ is structural (V-internal). $\checkmark$ + +**No rebalancing is required.** + +### 8.2 Entry Removal + +**Purpose.** Remove a V-Entry from the V-Tree. Used by eviction (§11.1) and ghost cleanup (§11.3). 
+ +``` +function vtree_remove_leaf(v): + p ← v.val_parent + + if p = null: + // v was V-root (sole entry) + V_root ← null + v.gnode.entry ← null + return + + p.children ← p.children \ {v} + v.gnode.entry ← null + + if |p.children| ≥ 2: + // p remains valid (was 3-node, now 2-node) + recompute p.int + propagate intensity decrease upward from p + return + + // p has 1 child — collapse p into its sole child + sole ← p.children[0] + g ← p.val_parent + sole.val_parent ← g + + if g = null: + V_root ← sole + else: + replace p with sole in g.children + recompute g.int + propagate intensity decrease upward from g + + destroy(p) +``` + +**Invariant notes.** + +- **V-I2:** If $p$ was a 3-node, it becomes a 2-node — valid. If $p$ was a 2-node, the collapse replaces $p$ with its sole child in the grandparent. This may cascade upward: if the grandparent was also a 2-node, it now has one child and must itself collapse. +- **V-I3:** Uncle relationships change. The caller checks for new violations and rebalances. +- **V-I5:** Only entries are removed. Structural nodes may be destroyed by collapse. No type transmutation. $\checkmark$ + +--- + +## 9. Splitting + +### 9.1 Attempt Split + +Splitting requires preprocessing to ensure the V-parent is a 2-node. The preprocessing contraction is mandatory and stands regardless of whether the split ultimately proceeds. 
+ +``` +function attempt_split(g): + // Basic eligibility + if g is not terminal in G-Tree: return + if g.r − g.l ≤ 1: return // unit cell — cannot subdivide + if g.sum ≤ θ: return // insufficient local intensity + if g.entry = null: return // no V-Tree membership + + // Bootstrap case: entry is V-root with no parent + if g.entry.val_parent = null: + bootstrap_split(g) + return + + p ← g.entry.val_parent + + // Preprocessing: ensure 2-node parent (always isolate heaviest) + if |p.children| = 3: + contract(p, isolate = heaviest_child(p)) + p ← g.entry.val_parent // refresh — entry may have moved + + // Depth gate (checked against post-contraction position) + if depth_V(g.entry) > D_create: return + + // Proceed with catalytic split + catalytic_split(g) +``` + +**Why always isolate the heaviest.** Contraction isolates the child with the highest intensity and merges the other two. If the splitting node is heaviest, it stays as a direct child. If the splitting node is not heaviest, contraction pushes it one level deeper, which may cause the depth gate to deny the split. This is correct: the node wasn't significant enough relative to its neighborhood to justify spatial refinement at this time. + +### 9.2 Catalytic Split + +**Precondition.** $g$ is a terminal G-node. $g.\text{entry}.\text{val\_parent}$ is a 2-node. $\text{depth}_V(g.\text{entry}) \leq D_{\text{create}}$. 
+ +``` +function catalytic_split(g): + m ← (g.l + g.r) / 2 + + // G-Tree: create children + left ← new G-Node([g.l, m), sum = 0, removed = false) + right ← new G-Node([m, g.r), sum = 0, removed = false) + g.left ← left + g.right ← right + left.geo_parent ← g + right.geo_parent ← g + + // V-Tree: create entries for children + le ← new V-Entry(int = 0, gnode = left) + re ← new V-Entry(int = 0, gnode = right) + left.entry ← le + right.entry ← re + + // V-Tree: structural node grouping the children + s ← new V-Structural() + s.int ← 0 + s.children ← [le, re] + le.val_parent ← s + re.val_parent ← s + + // Add s as third child of g.entry's parent + p ← g.entry.val_parent // guaranteed 2-node + p.children ← p.children ∪ {s} // 2-node → 3-node + s.val_parent ← p + // p.int unchanged (added 0) +``` + +**Result.** The G-parent's V-entry becomes uncle to its own G-children's entries: + +``` +p (3-node) +├── g.entry (int = I) ← uncle to its G-children +├── existing_sibling (int = E) ← also uncle +└── s (structural, int = 0) + ├── left.entry (int = 0) + └── right.entry (int = 0) +``` + +**Invariant preservation.** + +- **G-I1:** $g.\text{sum}$ unchanged. Children start at 0. $g.\text{own} = g.\text{sum}$. $\checkmark$ +- **V-I1:** $p.\text{int}$ unchanged ($+ 0$). $s.\text{int} = 0 + 0 = 0$. $\checkmark$ +- **V-I2:** $p$ is now a 3-node. $s$ is a 2-node. $\checkmark$ +- **V-I5:** New entries are V-leaves. New structural node is V-internal. $\checkmark$ + +**V-I3 is preserved without rebalancing.** See §9.4. + +### 9.3 Bootstrap Split + +When the V-Tree contains a single entry (the G-root at initialization), that entry is the V-root and has no V-parent. The first split is handled specially. 
+ +``` +function bootstrap_split(g): + m ← (g.l + g.r) / 2 + + // G-Tree: create children + left ← new G-Node([g.l, m), sum = 0, removed = false) + right ← new G-Node([m, g.r), sum = 0, removed = false) + g.left ← left; g.right ← right + left.geo_parent ← g; right.geo_parent ← g + + // V-Tree: create entries for children + le ← new V-Entry(int = 0, gnode = left); left.entry ← le + re ← new V-Entry(int = 0, gnode = right); right.entry ← re + + // V-Tree: structural node for children + child_s ← new V-Structural() + child_s.int ← 0 + child_s.children ← [le, re] + le.val_parent ← child_s + re.val_parent ← child_s + + // V-Tree: structural root holding g.entry and child_s + root_s ← new V-Structural() + root_s.int ← g.entry.int + root_s.children ← [g.entry, child_s] + g.entry.val_parent ← root_s + child_s.val_parent ← root_s + root_s.val_parent ← null + + V_root ← root_s +``` + +**Result:** + +``` +root_s (structural, 2-node, int = I) +├── g.entry (int = I) +└── child_s (structural, 2-node, int = 0) + ├── le (int = 0) + └── re (int = 0) +``` + +Uncle check: $\text{le}(0)$, uncle $g.\text{entry}(I)$. $0 \leq I$. $\checkmark$ No violations. + +### 9.4 Why Catalytic Splits Create No Violations + +The max-uncle formulation is the key. After a catalytic split, the new structural node $s(0)$ becomes a sibling of $g.\text{entry}(I)$ and the existing sibling $(E)$ under the now-3-node parent $p$. + +**New children:** intensity 0, uncles $\{g.\text{entry}(I), \text{existing}(E)\}$. Max uncle $\geq I > 0$. Trivially satisfied. + +**Existing grandchildren through the existing sibling:** they previously had uncle $g.\text{entry}(I)$ (when $p$ was a 2-node). Now they have uncles $\{g.\text{entry}(I),\; s(0)\}$. Their max uncle is still $I$ — unchanged. **No change to their constraint.** + +**Existing grandchildren through $g.\text{entry}$:** none — $g.\text{entry}$ is an entry (V-leaf, by V-I5), so it has no V-children. + +No node's max uncle decreased. No new violations. 
**Catalytic splits are violation-free.** Adding a lightweight newcomer as a sibling cannot weaken any node's shield. + +--- + +## 10. V-Tree Rebalancing + +### 10.1 Violation Detection + +``` +function is_violated(c) → bool: + p ← c.val_parent + if p = null: return false + g ← p.val_parent + if g = null: return false + max_uncle ← max { u.int : u ∈ siblings(p, g) } + return c.int > max_uncle +``` + +### 10.2 When Violations Occur + +Under the max-uncle constraint, violations are rare and meaningful: + +- **2-node grandparent (one uncle):** $c > u$. A node has outgrown its single uncle. +- **3-node grandparent (two uncles):** $c > u_1$ AND $c > u_2$. The node has outgrown both alternative branches — a genuinely dominant region. + +Violations arise from exactly one source: observations that increase an entry's intensity past its max uncle. Catalytic splits (§9.4) and contraction at 3-nodes that isolate the heaviest child do not create violations _at the contracted node's grandchild level_. However, contraction may create violations one level deeper — at the grandchild level of the newly created merged node (§10.9). + +### 10.3 Main Loop + +``` +function rebalance(): + loop: + c ← find_deepest_violated_node() + if c = null: return // all invariants hold + resolve(c) +``` + +**Priority rule.** Resolve the deepest violation first. This prevents higher-level restructurings from disturbing lower-level corrections. In practice, a work queue seeded with affected nodes and expanded as restructurings create new uncle relationships. 
+ +### 10.4 Resolve + +``` +function resolve(c): + p ← c.val_parent + g ← p.val_parent + + // Phase 1: Ensure p is a 2-node + if |p.children| = 3: + contract(p, isolate = heaviest_child(p)) + if not is_violated(c): return // contraction may resolve outright + + // Phase 2: Promote c + if c is structural and |c.children| = 2: + standard_promote(c) + else: + // c is an entry (0 children) or a 3-child structural node + if |g.children| = 3: + contract(g, isolate = heaviest_child(g)) + if not is_violated(c): return + skip_promote(c) +``` + +### 10.5 Contraction (3-Node → 2-Node) + +**Purpose.** Reduce a 3-node to a 2-node by merging two of its children under a new structural node. + +**Rule.** Always isolate the child with the **highest intensity**. Merge the other two. + +``` +function contract(p, isolate): + {a, b} ← p.children \ {isolate} + + s ← new V-Structural() + s.int ← a.int + b.int + s.children ← [a, b] + a.val_parent ← s + b.val_parent ← s + s.val_parent ← p + + p.children ← [isolate, s] +``` + +**V-I1:** $p.\text{int} = \text{isolate}.\text{int} + a.\text{int} + b.\text{int} = \text{isolate}.\text{int} + s.\text{int}$. $\checkmark$ + +**V-I3 analysis.** Contraction changes uncle relationships at two levels: + +_Grandchildren of $p$ through $s$ (i.e., $a$ and $b$ themselves):_ Their uncle is now $\text{isolate}$ (the heaviest child). Since $\text{isolate}.\text{int} \geq a.\text{int}$ and $\geq b.\text{int}$, no violations here. $\checkmark$ + +_Grandchildren of $s$ (i.e., children of $a$ and children of $b$):_ Before contraction, children of $a$ were grandchildren of $p$, with uncles $\{\text{isolate}, b\}$ and max uncle $= \text{isolate}.\text{int}$. After contraction, children of $a$ are grandchildren of $s$, with uncle $= b$ only. The max uncle **decreased** from $\text{isolate}.\text{int}$ to $b.\text{int}$. Symmetrically for children of $b$, whose max uncle changed from $\text{isolate}.\text{int}$ to $a.\text{int}$. This may create violations. 
+ +**Contraction may create violations at the grandchild-of-merged-node level.** These are bounded in number (at most 6 nodes: 2 children of $s$, each with at most 3 children) and are caught by the side-effect check (§10.9). + +**Key property.** When contraction is the first phase of `resolve`, the merged node's aggregate intensity may exceed $c.\text{int}$, resolving the original violation outright. `resolve` re-checks after each contraction. + +### 10.6 Standard Promote (Explode a 2-Child Structural Node) + +**Precondition.** $c$ is structural with exactly 2 children $\{c_1, c_2\}$. Parent $p$ is a 2-node. + +**Effect.** Destroy $c$. Install $c_1$ and $c_2$ as direct children of $p$. $p$ becomes a 3-node. + +``` +function standard_promote(c): + p ← c.val_parent + s ← sibling(c, p) + {c₁, c₂} ← c.children + + p.children ← [c₁, c₂, s] + c₁.val_parent ← p + c₂.val_parent ← p + + destroy(c) // structural only — V-I5 preserved +``` + +**V-I1:** $p.\text{int} = c.\text{int} + s.\text{int} = c_1.\text{int} + c_2.\text{int} + s.\text{int}$. $\checkmark$ + +**Why this helps.** $c$ was heavy ($c.\text{int} > \text{max uncle}$). After explosion, $c_1$ and $c_2$ are individually lighter. The heavy mass is separated: one child carries most of the intensity and may still violate, but the other is lighter and settles. This is **value diffusion**: each promotion separates heavy from light. + +### 10.7 Skip Promote (Elevate an Indivisible Node) + +**Precondition.** $c$ is an entry (0 children) or a 3-child structural node. Parent $p$ is a 2-node. Grandparent $g$ is a 2-node. + +**Effect.** Destroy $p$. Install $c$ and its sibling as direct children of $g$. $g$ becomes a 3-node. 
+ +``` +function skip_promote(c): + p ← c.val_parent + g ← p.val_parent + s ← sibling(c, p) + u ← sibling(p, g) + + g.children ← [c, s, u] + c.val_parent ← g + s.val_parent ← g + + destroy(p) // structural only — V-I5 preserved +``` + +**V-I1:** $g.\text{int} = p.\text{int} + u.\text{int} = c.\text{int} + s.\text{int} + u.\text{int}$. $\checkmark$ + +**Why this resolves the violation.** The violation was $c.\text{int} > u.\text{int}$ where $u$ was $c$'s uncle. After promotion, $c$ and $u$ are siblings under $g$. Siblings face no constraint relative to each other — the uncle relationship is dissolved. + +**Side effects.** Children of $u$ previously had uncle $p$ with $p.\text{int} = c.\text{int} + s.\text{int}$. After promotion, their uncles are $\{c, s\}$ with max uncle $= \max(c.\text{int}, s.\text{int}) \leq p.\text{int}$. The shield weakened. These nodes are checked by §10.9. + +### 10.8 Decision Table + +| Condition on $c$ | Operation | Rationale | +| --------------------------------- | ---------------- | ---------------------------------------------------- | +| $c$ is structural with 2 children | Standard Promote | Splits heavy from light; $p$ → 3-node | +| $c$ is an entry (0 children) | Skip Promote | Cannot split; rises bodily | +| $c$ is structural with 3 children | Skip Promote | Exploding would give $p$ four children; rises bodily | + +### 10.9 Side-Effect Violations + +Restructuring changes uncle relationships. After any restructuring operation: + +**After contraction at $p$ creating merged node $s$:** + +1. Check each grandchild of $p$ for new violations (children of isolate and children of $s$ — the latter being $a$ and $b$ themselves). At most 6 nodes. +2. Check each grandchild of $s$ for new violations (children of $a$ and children of $b$). At most 6 nodes. +3. Add any new violators to the work queue. + +**After promotion at $g$ (the node whose children changed):** + +1. Check each grandchild of $g$ for new violations. At most 9 nodes. +2. 
Add any new violators to the work queue. + +Total per-restructuring cost: $O(1)$. + +### 10.10 Termination + +**Claim.** The `rebalance()` loop terminates after finitely many iterations. + +**Argument.** Define the weighted depth $\Phi = \sum_{\ell} \ell.\text{int} \cdot d(\ell)$ over all entries $\ell$, where $d(\ell)$ is the entry's V-Tree depth. Each complete resolution cycle (possible contraction + promotion) moves heavy intensity closer to the root: + +- **Skip Promotion** destroys one structural node, moving a heavy node one level closer to the root. $\Phi$ decreases by at least $c.\text{int}$. +- **Standard Promotion** destroys one structural node, replacing it with its two lighter children at the same depth. The heavy aggregate is dispersed. +- **Contraction** creates one structural node, potentially increasing $\Phi$ locally. However, contraction either resolves the triggering violation outright or enables a subsequent promotion that decreases $\Phi$ by more. + +Since $\Phi$ is bounded below by 0, intensities are fixed during rebalancing, there are finitely many entries, and each resolution cycle makes net progress, the loop terminates. A rigorous proof uses a composite potential combining $\Phi$ with the violation count. + +--- + +## 11. Eviction and Ghosts + +### 11.1 Eviction + +When a V-Tree entry sinks past $D_{\text{evict}}$: + +``` +function evict(v): + g ← v.gnode + + // Remove entry from V-Tree + vtree_remove_leaf(v) + g.entry ← null + + // Collapse G-node: remove all G-children + if g.left ≠ null: + remove_g_subtree(g.left) + remove_g_subtree(g.right) + g.left ← null + g.right ← null + // g.sum unchanged — retains full accumulated history + + // g is now terminal in G-Tree with its accumulated value, no V-entry +``` + +**`remove_g_subtree`** marks all G-nodes in the subtree as removed and detaches them from the G-Tree. V-entries backing removed G-nodes become ghosts — discovered lazily on access. 
+ +``` +function remove_g_subtree(g): + g.removed ← true + g.geo_parent ← null + if g.left ≠ null: + remove_g_subtree(g.left) + remove_g_subtree(g.right) +``` + +### 11.2 Ghost Detection + +A V-Tree entry becomes a ghost when its backing G-node is marked `removed` by a previous eviction's cleanup. + +``` +function is_ghost(v) → bool: + return v.gnode.removed +``` + +One field check. Ghost detection is leaf-only (V-I5 guarantees only entries can be ghosts). + +### 11.3 Ghost Cascade + +When a ghost is discovered during sampling, observation routing, or rebalancing: + +``` +function cleanup_ghost(v): + g ← v.gnode + + // Remove this entry from V-Tree + vtree_remove_leaf(v) + + // Remove its G-node's children from G-Tree (if any) + if g.left ≠ null: + remove_g_subtree(g.left) + remove_g_subtree(g.right) + g.left ← null + g.right ← null + // May create more ghost entries — discovered lazily +``` + +The cascade proceeds entry-to-entry through the G-Tree. Each ghost cleanup is a V-Tree leaf removal plus standard 2-3 underflow handling. The cascade is bounded by the size of the removed G-subtree. + +### 11.4 The Three Conditions for V-Entry Removal + +| Condition | Type | Trigger | +| ---------------------------------------------- | --------- | ------------------------------------------------ | +| Too deep ($\text{depth}_V > D_{\text{evict}}$) | Active | V-Tree's structural rules | +| Ghost ($\text{gnode.removed}$) | Lazy | Next access by sampling, routing, or rebalancing | +| G-subtree removed by ancestor's cleanup | Immediate | Ancestor entry's eviction or ghost cleanup | + +Condition 1 is the **trigger**. Condition 2 is **lazy discovery**. Condition 3 creates new ghost entries for subsequent lazy discovery. The G-Tree obeys immediately at every step. + +The V-Tree's skewed access pattern makes lazy discovery efficient. Hot nodes are visited frequently — their neighborhoods are always clean. Cold ghosts sit harmlessly until accessed. 
The system does cleanup work exactly where and when it matters. + +--- + +## 12. Governance: One-Way, Lazy Discovery + +The V-Tree is the sole decision-maker about what deserves to exist. The G-Tree structurally follows. + +``` +V evicts entry (too deep) [V decides] + → G-node collapsed, G-children removed [G obeys, immediate] + → V-entries for removed children are ghosts + → ghost entry discovered on access [lazy — whenever] + → entry removed from V-Tree [immediate, leaf removal] + → structural parent may underflow [standard 2-3 housekeeping] + → ghost's G-children removed [G obeys, immediate] + → more ghost entries… [lazy discovery continues] +``` + +Every ghost is a V-Tree leaf. Every ghost cleanup is a leaf removal. Every G-Tree modification is immediate. The only lazy step is discovering that a leaf entry's backing is gone — a single field check. + +--- + +## 13. Temporal Semantics (User Concern) + +The architecture stores exact accumulated values by default. The user may apply any temporal filter: + +**Exponential decay.** Multiply all `entry.int` by $\lambda \in (0, 1)$ periodically. V-I3 is preserved by uniform scaling: if $c.\text{int} \leq \max\{u.\text{int}\}$ before decay, then $\lambda\, c.\text{int} \leq \lambda\, \max\{u.\text{int}\}$ after. V-Tree shape unchanged. User controls when and how often. + +**Sliding window.** Maintain per-entry timestamps or counters. Subtract expired observations. May create V-I3 violations (non-uniform changes); call `rebalance()` after adjustment. + +**No filter.** Use raw accumulation. Entries reflect lifetime significance. The V-Tree ranks by all-time importance. + +The architecture has no opinion. It adapts to whatever the numbers say. + +> _Design note._ Under exponential decay, entries whose traffic has subsided will decay in V-intensity, be pushed deeper by newly hot entries via the uncle constraint, eventually cross $D_{\text{evict}}$, and be evicted. Their G-subtrees collapse. 
The tree breathes: expands under load, contracts when traffic subsides. The triad of split/decay/eviction is the mechanism; the architecture provides split and eviction, the user provides decay (or any alternative temporal filter). + +--- + +## 14. Initialization + +``` +function initialize(): + root ← new G-Node(l = 0, r = 2^N, sum = 0) + root.geo_parent ← null + root.removed ← false + + e ← new V-Entry(int = 0, gnode = root) + root.entry ← e + e.val_parent ← null + + G_root ← root + V_root ← e +``` + +Both trees start as a single shared reference. The first observation that pushes the root's sum above $\theta$ triggers the bootstrap split. + +> _Design note._ The G-root entry is permanent under typical operation. Every observation increments the G-root's entry (as the ultimate G-ancestor), so `root.entry` accumulates total intensity and is always the heaviest V-entry. It never violates V-I3, sits at V-depth 1 (after the first split), and is never evicted. + +--- + +## 15. Worked Example + +**Setup.** $N = 3$, domain $[0, 8)$, split threshold $\theta = 5$, $D_{\text{create}} = 3$, $D_{\text{evict}} = 6$. + +### Initialization + +Single G-node covering $[0, 8)$, entry with intensity 0, serving as V-root. + +``` +G-Tree: [0,8) sum=0 V-Tree: root.entry(0) +``` + +--- + +### Step 1: observe(3, 8) + +**Route:** G-root is terminal. $\ell = [0, 8)$. + +**G-propagation:** $[0,8).\text{sum} = 8$. + +**V-insert:** $\ell.\text{entry}$ already exists. + +**V-entry update:** $\text{root.entry.int} = 8$. + +**Split check:** $8 > \theta = 5$, range $8 > 1$, entry exists. Entry is V-root. **Bootstrap split.** + +``` +G-Tree: V-Tree: + [0,8) sum=8 SR (structural, 2-node, int=8) + / \ ├── root.entry (int=8) +[0,4) s=0 [4,8) s=0 └── cs (structural, 2-node, int=0) + ├── L.entry (int=0) [0,4) + └── R.entry (int=0) [4,8) +``` + +**Rebalance:** $L(0)$ uncle $\text{root}(8)$: $0 \leq 8$. $\checkmark$. $R(0)$ uncle $\text{root}(8)$: $0 \leq 8$. $\checkmark$. +No violations. 
+ +--- + +### Step 2: observe(3, 10) + +**Route:** $[0,8) \to [0,4) = L$ (terminal). + +**G-propagation:** $[0,4).\text{sum} = 10$. $[0,8).\text{sum} = 18$. + +**V-entry updates:** $L.\text{entry.int} \mathrel{+}= 10 = 10$. $\text{root.entry.int} \mathrel{+}= 10 = 18$. + +**V-sums:** $\text{cs.int} = 10 + 0 = 10$. $SR.\text{int} = 18 + 10 = 28$. + +**Split check for $L$:** $10 > 5$, range $4 > 1$. Parent cs is 2-node. $\text{depth}_V(L.\text{entry}) = 2 \leq D_{\text{create}} = 3$. $\checkmark$. **Catalytic split.** + +Create $\text{LL} = [0, 2)$ sum=0, $\text{LR} = [2, 4)$ sum=0. Add structural $s_1$ to cs. + +``` +G-Tree: V-Tree: + [0,8) sum=18 SR (2-node, int=28) + / \ ├── root.entry (int=18) + [0,4) s=10 [4,8) s=0 └── cs (3-node, int=10) + / \ ├── L.entry (int=10) +[0,2) s=0 [2,4) s=0 ├── R.entry (int=0) + └── s₁ (2-node, int=0) + ├── LL.entry (int=0) + └── LR.entry (int=0) +``` + +**Rebalance:** $LL(0)$ uncles $\{L(10), R(0)\}$: max $= 10$. $0 \leq 10$. $\checkmark$. $LR(0)$: same. $\checkmark$. All other relationships hold. **No violations.** + +--- + +### Step 3: observe(3, 12) + +**Route:** $[0,8) \to [0,4) \to [2,4) = LR$ (terminal). + +**G-propagation:** $[2,4).\text{sum} = 12$. $[0,4).\text{sum} = 22$. $[0,8).\text{sum} = 30$. + +**V-entry updates:** $LR.\text{entry.int} \mathrel{+}= 12 = 12$. $L.\text{entry.int} \mathrel{+}= 12 = 22$. $\text{root.entry.int} \mathrel{+}= 12 = 30$. + +**V-sums:** $s_1.\text{int} = 0 + 12 = 12$. $\text{cs.int} = 22 + 0 + 12 = 34$. $SR.\text{int} = 30 + 34 = 64$. + +``` +V-Tree: +SR (2-node, int=64) +├── root.entry (int=30) +└── cs (3-node, int=34) + ├── L.entry (int=22) + ├── R.entry (int=0) + └── s₁ (2-node, int=12) + ├── LL.entry (int=0) + └── LR.entry (int=12) +``` + +**Rebalance:** $LR(12)$ has parent $s_1$, grandparent cs. Uncles $= \{L(22), R(0)\}$. Max uncle $= 22$. $12 \leq 22$. $\checkmark$. **No violation.** + +The max-uncle shield in action: $L(22)$ shields $LR(12)$ even though $R(0)$ is tiny. 
+ +**Split check for $LR$:** $12 > 5$, range $[2,4)$ width $= 2 > 1$. Parent $s_1$ is 2-node. $\text{depth}_V(LR) = 3 \leq D_{\text{create}} = 3$. $\checkmark$. **Catalytic split.** + +``` +G-Tree: V-Tree: + [0,8) sum=30 SR (2-node, int=64) + / \ ├── root.entry (int=30) + [0,4) s=22 [4,8) s=0 └── cs (3-node, int=34) + / \ ├── L.entry (int=22) +[0,2) s=0 [2,4) s=12 ├── R.entry (int=0) + / \ └── s₁ (3-node, int=12) + [2,3) s=0 [3,4) s=0 ├── LR.entry (int=12) + ├── LL.entry (int=0) + └── s₂ (2-node, int=0) + ├── LRL.entry (int=0) + └── LRR.entry (int=0) +``` + +**Rebalance:** $LRL(0)$ uncles $\{LR(12), LL(0)\}$: max $= 12$. $\checkmark$. $LRR(0)$: same. $\checkmark$. **No violations.** + +--- + +### Step 4: observe(5, 40) + +**Route:** $[0,8) \to [4,8) = R$ (terminal). + +**G-propagation:** $[4,8).\text{sum} = 40$. $[0,8).\text{sum} = 70$. + +**V-entry updates:** $R.\text{entry.int} \mathrel{+}= 40 = 40$. $\text{root.entry.int} \mathrel{+}= 40 = 70$. + +**V-sums:** $\text{cs.int} = 22 + 40 + 12 = 74$. $SR.\text{int} = 70 + 74 = 144$. + +``` +V-Tree: +SR (2-node, int=144) +├── root.entry (int=70) +└── cs (3-node, int=74) + ├── L.entry (int=22) + ├── R.entry (int=40) + └── s₁ (3-node, int=12) + ├── LR.entry (int=12) + ├── LL.entry (int=0) + └── s₂ (2-node, int=0) + ├── LRL.entry (int=0) + └── LRR.entry (int=0) +``` + +**Rebalance:** $R(40)$ uncle $\text{root}(70)$: $40 \leq 70$. $\checkmark$. All deeper entries: max uncles unchanged or increased. **No violations.** A massive injection creates no rebalancing because $R(40)$ is shielded by uncle $\text{root}(70)$. + +**Split check for $R$:** $40 > 5$, range $[4,8)$ width $= 4 > 1$. Parent cs is 3-node. **Preprocess:** contract cs, isolate heaviest. + +Heaviest child of cs: $R(40)$. 
Merge $L(22)$ and $s_1(12)$ into merged$(34)$: + +``` +cs (2-node, int=74) +├── R.entry (int=40) +└── merged (2-node, int=34) + ├── L.entry (int=22) + └── s₁ (3-node, int=12) + ├── LR.entry (int=12) + ├── LL.entry (int=0) + └── s₂ (2-node, int=0) + ├── LRL.entry (int=0) + └── LRR.entry (int=0) +``` + +**Contraction side-effect check (§10.9).** + +_Grandchildren of cs_ (children of R and children of merged): R is an entry (no children). Children of merged $= \{L(22), s_1(12)\}$. Their uncle $= R(40)$. $22 \leq 40$. $\checkmark$. $12 \leq 40$. $\checkmark$. + +_Grandchildren of merged_ (children of L and children of $s_1$): L is an entry (no children). Children of $s_1 = \{LR(12), LL(0), s_2(0)\}$. Their uncle $= L(22)$. $12 \leq 22$. $\checkmark$. $0 \leq 22$. $\checkmark$. $0 \leq 22$. $\checkmark$. + +**No violations.** The heavy isolate $R(40)$ shields at the cs level, and $L(22)$ shields at the merged level. + +$\text{depth}_V(R.\text{entry}) = 2 \leq D_{\text{create}} = 3$. $\checkmark$. **Catalytic split.** + +Create $[4,6)$ sum=0, $[6,8)$ sum=0. Add structural $s_3$ to cs. + +``` +G-Tree: V-Tree: + [0,8) sum=70 SR (2-node, int=144) + / \ ├── root.entry (int=70) + [0,4) s=22 [4,8) s=40 └── cs (3-node, int=74) + / \ / \ ├── R.entry (int=40) +[0,2) [2,4) [4,6) [6,8) ├── merged (2-node, int=34) + s=0 s=12 s=0 s=0 │ ├── L.entry (int=22) + / \ │ └── s₁ (3-node, int=12) + [2,3) [3,4) │ ├── LR.entry (int=12) + s=0 s=0 │ ├── LL.entry (int=0) + │ └── s₂ (2-node, int=0) + │ ├── LRL.entry (int=0) + │ └── LRR.entry (int=0) + └── s₃ (2-node, int=0) + ├── RL.entry (int=0) [4,6) + └── RR.entry (int=0) [6,8) +``` + +**Rebalance:** New entries RL(0), RR(0) have uncles $\{R(40), \text{merged}(34)\}$, max $= 40$. Trivially satisfied. All other uncle relationships unchanged by the catalytic split (§9.4). **No violations.** + +--- + +### Step 5: observe(5, 50) + +**Route:** $[0,8) \to [4,8) \to [4,6) = RL$ (terminal). + +**G-propagation:** $[4,6).\text{sum} = 50$. 
$[4,8).\text{sum} = 90$. $[0,8).\text{sum} = 120$. + +**V-entry updates:** $RL.\text{entry.int} \mathrel{+}= 50 = 50$. $R.\text{entry.int} \mathrel{+}= 50 = 90$. $\text{root.entry.int} \mathrel{+}= 50 = 120$. + +**V-sums:** $s_3.\text{int} = 50 + 0 = 50$. $\text{merged.int} = 22 + 12 = 34$ (unchanged). $\text{cs.int} = 90 + 34 + 50 = 174$. $SR.\text{int} = 120 + 174 = 294$. + +``` +V-Tree: +SR (2-node, int=294) +├── root.entry (int=120) +└── cs (3-node, int=174) + ├── R.entry (int=90) + ├── merged (2-node, int=34) + │ ├── L.entry (int=22) + │ └── s₁ (3-node, int=12) + │ ├── LR.entry (int=12) + │ ├── LL.entry (int=0) + │ └── s₂ (2-node, int=0) + │ ├── LRL.entry (int=0) + │ └── LRR.entry (int=0) + └── s₃ (2-node, int=50) + ├── RL.entry (int=50) + └── RR.entry (int=0) +``` + +**Split check for $RL$:** $50 > 5$, range $[4,6)$ width $= 2 > 1$. Parent $s_3$ is 2-node. $\text{depth}_V(RL) = 3 \leq D_{\text{create}} = 3$. $\checkmark$. **Catalytic split.** + +Create $[4,5)$ sum=0, $[5,6)$ sum=0. Add structural $s_4$ to $s_3$. + +``` +G-Tree: V-Tree: + [0,8) sum=120 SR (2-node, int=294) + / \ ├── root.entry (int=120) + [0,4) s=22 [4,8) s=90 └── cs (3-node, int=174) + / \ / \ ├── R.entry (int=90) +[0,2) [2,4) [4,6) [6,8) ├── merged (2-node, int=34) + s=0 s=12 s=50 s=0 │ ├── L.entry (int=22) + / \ / \ │ └── s₁ (3-node, int=12) + [2,3) [3,4) [4,5) [5,6) │ ├── LR.entry (int=12) + s=0 s=0 s=0 s=0 │ ├── LL.entry (int=0) + │ └── s₂ (2-node, int=0) + │ ├── LRL.entry (int=0) + │ └── LRR.entry (int=0) + └── s₃ (3-node, int=50) + ├── RL.entry (int=50) [4,6) + ├── RR.entry (int=0) [6,8) + └── s₄ (2-node, int=0) + ├── RLL.entry (int=0) [4,5) + └── RLR.entry (int=0) [5,6) +``` + +**Rebalance:** $RL(50)$ parent $s_3$, grandparent cs. Uncles $= \{R(90), \text{merged}(34)\}$. Max uncle $= 90$. $50 \leq 90$. $\checkmark$. New entries RLL(0), RLR(0) have uncles $\{RL(50), RR(0)\}$, max $= 50$. Trivially satisfied. All other uncle relationships unchanged (§9.4). 
**No violations.** + +--- + +### Final State Summary + +``` +G-Tree (spatial structure): + + [0,8) sum=120 + / \ + [0,4) s=22 [4,8) s=90 + / \ / \ +[0,2) [2,4) [4,6) s=50 [6,8) + s=0 s=12 / \ s=0 + / \ [4,5) [5,6) + [2,3) [3,4) s=0 s=0 + s=0 s=0 +``` + +``` +V-Tree (value structure): + +SR (2-node, int=294) +├── root.entry (int=120) depth 1 — domain overview +└── cs (3-node, int=174) + ├── R.entry (int=90) depth 2 — hot region parent + ├── merged (2-node, int=34) + │ ├── L.entry (int=22) depth 3 — warm region + │ └── s₁ (3-node, int=12) + │ ├── LR.entry (int=12) depth 4 — warm subregion + │ ├── LL.entry (int=0) depth 4 — cold + │ └── s₂ (2-node, int=0) + │ ├── LRL.entry (int=0) depth 5 — cold + │ └── LRR.entry (int=0) depth 5 — cold + └── s₃ (3-node, int=50) + ├── RL.entry (int=50) depth 3 — hot subregion + ├── RR.entry (int=0) depth 3 — cold (new) + └── s₄ (2-node, int=0) + ├── RLL.entry (int=0) depth 4 — cold (new) + └── RLR.entry (int=0) depth 4 — cold (new) +``` + +**Key observations:** + +1. **Heavy entries near the root.** root(120) at depth 1, R(90) at depth 2, RL(50) at depth 3. Sampling finds the hot region in 2–3 steps. + +2. **Cold entries deep.** Zero-intensity children at depths 4–5, within the buffer zone $[D_{\text{create}}, D_{\text{evict}}) = [3, 6)$. They can accumulate observations and promote if traffic sustains. + +3. **The G-Tree has full spatial resolution where it matters.** $[4, 8)$ is refined to $[4, 5)$ and $[5, 6)$. $[0, 4)$ is refined to $[2, 3)$ and $[3, 4)$. $[6, 8)$ and $[0, 2)$ remain coarse. + +4. **No information lost.** The G-root carries $\text{sum} = 120$ (all observations). Pre-split history lives in $g.\text{own}$ at each level. + +5. **Zero rebalancing operations occurred.** Across five steps with three catalytic splits and one bootstrap split, every injection and every split was absorbed by the max-uncle shield without triggering a single violation. 
The heaviest entries at each level — root(120), R(90), L(22) — act as shields that stabilise their entire neighborhoods. + +**Proportional sampling from the final V-Tree:** + +At $SR(294)$: choose root$(120)$ with $p = 120/294 \approx 0.41$, cs$(174)$ with $p \approx 0.59$. + +If cs (3-node): choose $R(90)$ with $p = 90/174 \approx 0.52$, $s_3(50)$ with $p \approx 0.29$, merged$(34)$ with $p \approx 0.20$. + +Expected sampling cost for hottest leaf ($RL$): 3 steps, reached with probability $50/294 \approx 0.17$. The combined top-3 entries (root + R + RL, total intensity 260/294 $\approx 0.88$) are reached in $\leq 3$ steps. + +--- + +## 16. Properties + +### 16.1 Invariant Preservation Summary + +| Operation | V-I3 (Uncle) | V-I5 (Entry-Leaf) | G-I1 (Sums) | +| ------------------------------ | -------------------------------------------------- | ----------------- | ------------ | +| Observation (intensity update) | May violate | $\checkmark$ | $\checkmark$ | +| V-Tree insertion | Preserved | $\checkmark$ | — | +| V-Tree leaf removal | Caller checks | $\checkmark$ | — | +| Bootstrap split | Preserved | $\checkmark$ | $\checkmark$ | +| Catalytic split | **Preserved** | $\checkmark$ | $\checkmark$ | +| Standard promote | May create side-effects | $\checkmark$ | — | +| Skip promote | Resolves trigger; may create side-effects | $\checkmark$ | — | +| Contraction | May create violations at merged-node grandchildren | $\checkmark$ | — | +| Eviction | Caller checks | $\checkmark$ | $\checkmark$ | +| Ghost cleanup | Caller checks | $\checkmark$ | $\checkmark$ | + +**Catalytic splits are violation-free** under the max-uncle constraint. Only observations (which increase an entry's intensity above its max uncle) are the root cause of violations. Rebalancing resolves the trigger and handles side-effects locally. + +### 16.2 V-Tree Height + +The uncle constraint permits a degenerate chain: + +``` + R + / \ + e₁ N₁ + / \ + e₂ N₂ + ... 
+``` + +This satisfies V-I3 only when the intensities decay geometrically fast: each entry must outweigh the aggregate intensity two or more levels below it, $e_k.\text{int} \geq \sum_{j \geq k+2} e_j.\text{int}$ — monotonicity alone is not sufficient, since the internal chain node $N_{k+1}$ is a grandchild of $N_{k-1}$ whose uncle is $e_k$. The extremal equality case is the Fibonacci chain of §17.3. The height may reach $O(L)$. + +**This is not a bug.** It is the correct shape for a distribution whose intensities fall off geometrically — depth reflects rank. Under concentrated distributions (a few hotspots dominating), $h = O(\log L)$. Under uniform distributions the chain shape is ruled out by the uncle constraint: the theorem of §17.1 caps every entry at depth $O(\log L)$, so $h = O(\log L)$ and proportional sampling costs $\Theta(\log_2 L) = \Theta(H)$ per sample (matching the uniform row of §17.6). + +--- + +## 17. Complexity Analysis + +### 17.1 The Main Theorem: Fibonacci Depth Bound + +**Theorem.** In any V-Tree satisfying V-I3 and V-I2, a node $v_i$ with weight fraction $w_i = v_i.\text{int}\,/\,I_{\text{total}}$ has depth + +$$d_i \;\leq\; \log_\phi\!\left(\frac{1}{w_i}\right) + c$$ + +where $\phi = \frac{1+\sqrt{5}}{2} \approx 1.618$ is the golden ratio and $c = \log_\phi\!\sqrt{5} \approx 1.672$. + +**Proof.** Let $v$ be a node at depth $d$. Denote the path from root to $v$ as $v_0, v_1, \ldots, v_d = v$ where $v_0$ is the V-Tree root and $v_{k+1}$ is a child of $v_k$. Write $I_k = v_k.\text{int}$. + +**Step 1: The Fibonacci recurrence.** For any $k \geq 2$, the triple $(v_k,\, v_{k-1},\, v_{k-2})$ is child–parent–grandparent. V-I3 states $v_k.\text{int} \leq \max\{u.\text{int} : u \in \text{siblings}(v_{k-1},\, v_{k-2})\}$. If $v_k$ is not in violation, some uncle $u$ satisfies $u.\text{int} \geq I_k$. Since $v_{k-2}$ has at least 2 children (V-I2), $v_{k-1}$ has at least one sibling $u$ under $v_{k-2}$: + +$$I_{k-2} \;=\; I_{k-1} \;+\; \sum_{\text{siblings}} u.\text{int} \;\geq\; I_{k-1} + I_k$$ + +This gives $I_{k-2} \geq I_{k-1} + I_k$ for all $k \geq 2$ — **the Fibonacci recurrence** running backward from the leaf. + +> _Note on the max-uncle formulation._ The proof requires only that at least one uncle satisfies $u.\text{int} \geq I_k$. The max-uncle constraint guarantees this: if $v_k$ is not in violation, $\max\{u.\text{int}\} \geq I_k$, so some uncle satisfies the bound. 
The recurrence is identical whether the constraint is against every uncle or against the heaviest uncle. + +**Step 2: Unwinding the recurrence.** Define $\alpha_k = I_k / I_d$ (normalised so $\alpha_d = 1$). Then: + +- $\alpha_d = 1$ +- $\alpha_{d-1} \geq 1$ (parent intensity $\geq$ child intensity) +- $\alpha_{k-2} \geq \alpha_{k-1} + \alpha_k$ for all $k \geq 2$ + +By induction, $\alpha_{d-j} \geq F_{j+1}$ where $F_n$ is the $n$-th Fibonacci number ($F_1 = F_2 = 1$). + +At the root: $\alpha_0 = I_0/I_d = I_{\text{total}}/v.\text{int} = 1/w_i$, so: + +$$\frac{1}{w_i} \;\geq\; F_{d+1}$$ + +**Step 3: Inverting the Fibonacci bound.** Using $F_n \geq \phi^{n-1}/\sqrt{5}$ (valid for $n \geq 1$: by Binet's formula $F_n = (\phi^n - \psi^n)/\sqrt{5}$ with $\psi = -1/\phi$, and $\phi^{n-2} \geq \psi^n$ for all $n \geq 1$): + +$$\frac{1}{w_i} \;\geq\; \frac{\phi^{d}}{\sqrt{5}}$$ + +$$\boxed{\;d_i \;\leq\; \log_\phi\!\left(\frac{1}{w_i}\right) + \log_\phi\!\sqrt{5}\;}$$ + +where $\log_\phi\!\sqrt{5} \approx 1.672$. $\blacksquare$ + +### 17.2 Expected Sampling Cost + +**Corollary.** + +$$E[\text{sampling}] \;=\; \sum_i w_i\, d_i \;\leq\; \frac{H}{\log_2 \phi} \;+\; \log_\phi\!\sqrt{5}$$ + +where $H = \sum_i w_i \log_2(1/w_i)$ is the Shannon entropy of the intensity distribution and $1/\log_2\phi \approx 1.4404$. + +_Proof._ Sum the per-entry bound weighted by $w_i$: + +$$\sum_i w_i\, d_i \;\leq\; \sum_i w_i\!\left(\frac{\log_2(1/w_i)}{\log_2\phi} + \log_\phi\!\sqrt{5}\right) \;=\; \frac{H}{\log_2\phi} + \log_\phi\!\sqrt{5}$$ + +### 17.3 Tightness: The Constant Is Exact + +The constant $1/\log_2\phi \approx 1.4404$ is achieved, not an artifact of loose analysis. + +**Fibonacci chain construction.** Take the degenerate chain from §16.2 with $L$ entries, setting intensities to satisfy $w_i = w_{i+1} + w_{i+2}$ at equality. The solution is $w_i \propto \phi^{-i}$. This chain satisfies all V-Tree invariants and every entry sits at maximum depth permitted by the uncle constraint. 
+ +For a V-Tree with one dominant entry (weight $1 - \epsilon$) at depth 1 and $L - 1$ light entries forming a Fibonacci sub-chain: as $L \to \infty$, the light entries dominate both $E$ and $H$, and the ratio $E/H$ converges to $1/\log_2\phi$ from below. No V-Tree satisfying V-I3 can exceed this ratio for sufficiently large $H$. + +### 17.4 The AVL Parallel + +The parallel with AVL trees is exact: + +| | AVL Tree | V-Tree | +| ----------------------- | ----------------------------------------------------- | ------------------------------------------------- | +| **Balance constraint** | Height-balance: children's heights differ by $\leq 1$ | Max-uncle: no grandchild outweighs heaviest uncle | +| **Nature** | Local, structural | Local, value-based | +| **Produces recurrence** | $N_h \geq N_{h-1} + N_{h-2} + 1$ | $I_{k-2} \geq I_{k-1} + I_k$ | +| **Growth rate** | $\phi^h$ | $\phi^d$ | +| **Bound** | Height $\leq \log_\phi n + O(1)$ | Depth $\leq \log_\phi(1/w_i) + O(1)$ | +| **Overhead** | $1/\log_2\phi \approx 1.44$ | $1/\log_2\phi \approx 1.44$ | + +Both sacrifice $\approx 44\%$ overhead for the same reason: the local constraint's tightest extremal configuration follows Fibonacci, and $\log_2\phi \approx 0.694$ is the Fibonacci sequence's per-step entropy. + +The conceptual inversion: **AVL** enforces a local height constraint → worst-case $O(1.44 \log_2 n)$ depth for all operations. **V-Tree** enforces a local weight constraint → worst-case $O(1.44\, H)$ expected sampling → heavy entries pay less, light entries pay more. + +### 17.5 Information-Theoretic Framing + +**Lower bound.** Each step descends to one of 2 or 3 children. Maximum information per step: $\log_2 3 \approx 1.585$ bits. 
To identify an entry from a distribution with entropy $H$: + +$$E[\text{sampling}] \;\geq\; \frac{H}{\log_2 3} \;\approx\; 0.631\, H$$ + +**Upper bound (Theorem).** + +$$E[\text{sampling}] \;\leq\; \frac{H}{\log_2 \phi} + O(1) \;\approx\; 1.440\, H + O(1)$$ + +**Gap.** + +$$\frac{\text{V-Tree worst-case}}{\text{information-theoretic optimum}} \;=\; \frac{\log_2 3}{\log_2 \phi} \;=\; \log_\phi 3 \;\approx\; 2.28$$ + +The V-Tree is within a factor of $\approx 2.28$ of any tree-based proportional sampler, including an oracle with advance knowledge of the distribution. + +**Comparison with known schemes:** + +| Scheme | Expected sampling cost | Update cost | Dynamic? | +| ------------------------- | ----------------------- | --------------------- | -------- | +| Balanced segment tree | $\Theta(\log L)$ always | $O(\log L)$ | Yes | +| Static Huffman tree | $\leq H + 1$ | $O(L)$ rebuild | No | +| Adaptive Huffman (Vitter) | $\leq H + 1$ amortised | $O(L)$ worst case | Yes | +| **V-Tree** | $\leq 1.44\, H + O(1)$ | $O(\log_\phi(1/w_i))$ | **Yes** | + +The V-Tree sacrifices $\approx 44\%$ in sampling quality relative to Huffman but achieves dynamism with per-update cost proportional to the updated entry's depth. The uncle constraint is a **local** invariant that approximates a **global** optimum. Huffman optimality is a global property — unverifiable from any node's neighborhood. The uncle constraint is verifiable from parent and grandparent. This locality permits $O(d_i)$ dynamic maintenance rather than $O(L)$ restructuring. 
+ +### 17.6 Regime-Dependent Behaviour + +| Regime | $H$ | $E[\text{sampling}]$ | Balanced tree | +| --------------------------- | ----------- | -------------------- | ---------------- | +| Concentrated ($k$ hotspots) | $O(\log k)$ | $O(\log k)$ | $\Theta(\log L)$ | +| Uniform | $\log_2 L$ | $\leq 1.44\log_2 L$ | $\log_2 L$ | +| Zipf ($\alpha > 1$) | $O(1)$ | $O(1)$ | $\Theta(\log L)$ | +| Geometric ($\beta^i$) | $O(1)$ | $O(1)$ | $\Theta(\log L)$ | + +For concentrated distributions, the V-Tree achieves speedup $\Theta(\log L / \log k)$ over balanced trees — unbounded as $L/k \to \infty$. + +### 17.7 Two-Stage Cost Separation + +| Phase | Structure | Overhead | Purpose | +| ---------- | -------------- | ------------ | -------------------------- | +| Attention | V-Tree walk | $1.44\times$ | Scale and region selection | +| Resolution | G-Tree descent | $1.0\times$ | Cell-level refinement | + +Spatial cost depends on $N$ (resolution parameter). Value cost depends on $H$ (entropy). These are independent quantities. A high-resolution domain ($N = 32$) with a concentrated traffic pattern ($H = 3$ bits) has spatial routing of 32 steps but proportional sampling of $\approx 6$ steps. A balanced tree would cost $\Theta(\log L)$ for both. + +### 17.8 Amortised Rebalancing Under Proportional Traffic + +If observations arrive at entry $i$ with probability proportional to $w_i$, the expected per-observation rebalancing cost is: + +$$E[\text{rebalance}] \;=\; \sum_i w_i \cdot O(d_i) \;=\; O\!\left(\frac{H}{\log_2 \phi}\right)$$ + +Under concentrated distributions, this is $O(1)$ amortised: the hot entries that cause most updates are already near the root, so their promotions (if any) traverse only a few levels. The structure automatically allocates maintenance effort in proportion to the entropy of the traffic pattern. 
+ +### 17.9 Cost Summary + +| Operation | Cost | +| ------------------------------- | ---------------------------------------------- | +| Observe (route + G-propagate) | $O(d_{\text{geo}})$ | +| V-entry updates per observation | $O(k \cdot h_V)$ where $k$ = entries on G-path | +| Single promotion / contraction | $O(1)$ | +| Rebalance after injection | $O(h_V)$ typical | +| Catalytic split | $O(1)$ — violation-free | +| Bootstrap split | $O(1)$ | +| Eviction | $O(1)$ + leaf removal | +| Ghost detection | $O(1)$ — one field check | +| Ghost cascade | $O(\text{removed subtree size})$ | +| Range sum | $O(N)$ | +| Attention sample | $O(1.44\,H + 1.67)$ expected | +| Point query | $O(N)$ worst, $O(d_{\text{geo}})$ typical | + +Under lazy V-I1 propagation, V-entry update cost is deferred to query time. The choice is an implementation decision. Rebalancing must operate on up-to-date values (forced propagation before comparison if using lazy mode). + +--- + +## 18. Spray Resistance + +### 18.1 Defence Layers + +| Layer | Mechanism | What it prevents | +| -------------------------------------- | ------------------------------------------------------ | ---------------------------------------------------------- | +| Local intensity gate ($\theta$) | Must exceed threshold to split | Low-intensity spray forcing splits | +| Global rank gate ($D_{\text{create}}$) | Must earn shallow V-position to split | Locally-intense but globally-insignificant nodes splitting | +| Parent arity gate | V-parent must be 2-node (preprocessing may deny split) | Overcrowding at a V-level | +| Eviction ($D_{\text{evict}}$) | Entries past depth threshold are removed | Accumulation of insignificant entries | + +### 18.2 Behaviour Under Spray + +Spray-generated entries are born adjacent to their G-parent with zero intensity. Under diffuse spray, each entry carries a tiny fraction of global intensity. The uncle constraint pushes them deep. 
Deep entries fail the $D_{\text{create}}$ check and never split further. Entries past $D_{\text{evict}}$ are evicted, their G-children collapsed. + +An adversary controls _where_ they spray. They cannot force spatial refinement without concentrating enough intensity to earn a shallow V-Tree position — which is exactly the condition under which refinement is warranted. + +### 18.3 Resource Bound + +Under sustained spray of $R$ observations per batch at intensity $\Delta$, with user-applied decay rate $\lambda$, the steady-state leaf count satisfies: + +$$L_{\text{steady}} = \frac{R \cdot \Delta}{(1 - \lambda) \cdot \theta}$$ + +This is independent of domain size $2^N$, the number of distinct coordinates targeted, and spray duration. Under example parameters ($R = 1000$, $\Delta = 1$, $\lambda = 0.99$, $\theta = 1000$): a million-address spray stabilises at **100 leaves**. + +The full memory bound is $O(L_{\text{steady}})$ for the V-Tree and $O(L_{\text{steady}})$ for the path-compressed G-Tree — both controlled by the four-parameter budget, not by the adversary's strategy. + +**Recovery.** When the spray stops, user-applied decay continues. Entries cool, sink past $D_{\text{evict}}$, and are evicted. G-subtrees collapse. The tree returns to its pre-spray size. + +**The triad is necessary:** + +| Mechanism | Without it | +| ----------------------------------- | ------------------------------------------------------------------------------------ | +| Variable-depth leaves / depth gates | Every unique address forces a leaf. G-Tree grows $O(N \cdot R \cdot T)$. | +| Exponential decay (user) | Intensity accumulates monotonically. Every entry eventually qualifies for splitting. | +| Eviction | Cold entries persist after spray ends. Memory is never reclaimed. | + +--- + +## 19. 
Structural Summary + +``` + ┌──────────────────────────────────────────────────────────┐ + │ G-Tree (spatial, additively evolving) │ + │ │ + │ [0, 8) sum=120 │ + │ / \ │ + │ [0,4) s=22 [4,8) s=90 │ + │ / \ / \ │ + │ [0,2) [2,4) [4,6) [6,8) │ + │ s=0 s=12 s=50 s=0 │ + │ / \ / \ │ + │ [2,3) [3,4) [4,5) [5,6) │ + │ s=0 s=0 s=0 s=0 │ + │ │ │ │ │ │ + │ ▼ ▼ ▼ ▼ │ + │ ┌──────────────────────────────────────────────┐ │ + │ │ G-nodes serve as V-entries in V-Tree │ │ + │ │ at multiple levels (terminal + internal) │ │ + │ └──────────┬───────────────────────────────────┘ │ + └─────────────┼────────────────────────────────────────────┘ + │ + ┌─────────────┼────────────────────────────────────────────┐ + │ ▼ V-Tree (tournament bracket) │ + │ │ + │ SR(294) │ + │ ├── root(120) [0,8) │ + │ └── cs(174) │ + │ ├── R(90) [4,8) │ + │ ├── merged(34) │ + │ │ ├── L(22) [0,4) │ + │ │ └── s₁(12) │ + │ │ ├── LR(12) [2,4) │ + │ │ ├── LL(0) [0,2) │ + │ │ └── s₂(0) │ + │ │ ├── LRL(0) [2,3) │ + │ │ └── LRR(0) [3,4) │ + │ └── s₃(50) │ + │ ├── RL(50) [4,6) │ + │ ├── RR(0) [6,8) │ + │ └── s₄(0) │ + │ ├── RLL(0) [4,5) │ + │ └── RLR(0) [5,6) │ + │ │ + │ Entries (V-leaves) = G-nodes. Structural = scaffolding. │ + │ Hot entries near root. Cold entries deep. │ + │ No geometry — just competitive ranking. │ + └──────────────────────────────────────────────────────────┘ +``` + +**The G-Tree** is a spatial index that grows where the V-Tree authorizes resolution and shrinks when entries are evicted. It carries exact accumulated values and answers $O(N)$ range queries. + +**The V-Tree** is a tournament where G-node entries compete for attention. Heavy entries occupy shallow positions near the root. Light entries are consolidated deep. It provides entropy-sensitive proportional sampling. + +**Geometry is the G-Tree's concern.** The V-Tree knows nothing about coordinates. + +**Competitive ranking is the V-Tree's concern.** The G-Tree makes no structural decisions. 
+ +**Temporal semantics are the user's concern.** The architecture stores exact accumulation and adapts to whatever the user's filter makes the numbers say. diff --git a/packages/sentinel/Cargo.toml b/packages/sentinel/Cargo.toml new file mode 100644 index 00000000..fb53129c --- /dev/null +++ b/packages/sentinel/Cargo.toml @@ -0,0 +1,34 @@ +[package] +description = "Hierarchical online subspace anomaly spectrometer for network traffic." +keywords = ["anomaly-detection", "network", "traffic", "svd"] +name = "torrust-sentinel" +readme = "README.md" + +authors.workspace = true +documentation.workspace = true +edition.workspace = true +homepage.workspace = true +license.workspace = true +publish.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[lints] +workspace = true + +[features] +serde = ["dep:serde"] + +[dependencies] +faer = "0" +rand = "0" +serde = { version = "1", features = ["derive"], optional = true } +tracing = "0" + +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "sentinel" +harness = false diff --git a/packages/sentinel/benches/sentinel.rs b/packages/sentinel/benches/sentinel.rs new file mode 100644 index 00000000..26479926 --- /dev/null +++ b/packages/sentinel/benches/sentinel.rs @@ -0,0 +1,283 @@ +//! Criterion benchmarks for the Spectral Sentinel. +//! +//! Run with: +//! +//! ```sh +//! cargo bench -p torrust-sentinel +//! ``` + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use torrust_sentinel::config::SentinelConfig; +use torrust_sentinel::observation::{CentredBits, ObservationBatch}; +use torrust_sentinel::sentinel::{NoiseParams, SpectralSentinel}; + +// ─── Helpers ──────────────────────────────────────────────── + +/// Lightweight config for micro-benchmarks: few depths, small rank. 
+
+fn bench_config() -> SentinelConfig {
+    SentinelConfig {
+        max_rank: 4,
+        forgetting_factor: 0.95,
+        rank_update_interval: 10,
+        campus_bits: 4,
+        prefix_depths: vec![8, 16],
+        energy_threshold: 0.90,
+        eps: 1e-6,
+        per_sample_scores: false,
+        cusum_allowance_sigmas: 0.5,
+        cusum_slow_decay: 0.999,
+        cusum_meta_slow_decay: 0.999,
+    }
+}
+
+/// Realistic config with default depths and higher rank.
+fn realistic_config() -> SentinelConfig {
+    SentinelConfig {
+        max_rank: 16,
+        forgetting_factor: 0.99,
+        rank_update_interval: 100,
+        campus_bits: 8,
+        prefix_depths: vec![8, 16, 24, 32, 48, 64, 96, 128],
+        energy_threshold: 0.90,
+        eps: 1e-6,
+        per_sample_scores: false,
+        cusum_allowance_sigmas: 0.5,
+        cusum_slow_decay: 0.999,
+        cusum_meta_slow_decay: 0.999,
+    }
+}
+
+/// Generate `count` sequential values in a single campus.
+fn single_campus_values(count: usize) -> Vec<u128> {
+    (0..count).map(|i| (0xAB_u128 << 120) | (i as u128 + 1)).collect()
+}
+
+/// Generate `count` values spread across many campuses.
+fn multi_campus_values(count: usize) -> Vec<u128> {
+    (0..count)
+        .map(|i| {
+            let campus = (i % 16) as u128;
+            (campus << 124) | (i as u128 + 1)
+        })
+        .collect()
+}
+
+/// Create a warmed sentinel ready for steady-state benchmarking.
+///
+/// Kept deliberately cheap — just enough to get past the cold EWMA
+/// path and into the warm steady-state code. The full warm-up
+/// (many noise rounds, many batches) is what `bench_inject_noise`
+/// measures; the other benchmarks only need a sentinel whose
+/// baselines are non-cold.
+fn warmed_sentinel(cfg: &SentinelConfig) -> SpectralSentinel {
+    let mut s = SpectralSentinel::new(cfg.clone()).unwrap();
+
+    // Seed a few campuses.
+    let seed: Vec<u128> = (0..4_u128)
+        .flat_map(|campus| (0..4_u128).map(move |i| (campus << 124) | (i + 1)))
+        .collect();
+    s.ingest(&seed);
+
+    // Minimal noise injection — just enough to warm baselines.
+    s.inject_noise(&NoiseParams {
+        rounds: 3,
+        batch_size: 4,
+        seed: Some(42),
+    });
+
+    // One real batch to enter the warm code path.
+    let batch: Vec<u128> = (0..4_u128)
+        .flat_map(|campus| (0..4_u128).map(move |i| (campus << 124) | (i + 100)))
+        .collect();
+    s.ingest(&batch);
+
+    s
+}
+
+// ─── Observation encoding ───────────────────────────────────
+
+fn bench_centred_bits(c: &mut Criterion) {
+    c.bench_function("CentredBits::from_u128", |b| {
+        b.iter(|| CentredBits::from_u128(black_box(0xDEAD_BEEF_CAFE_BABE_1234_5678_9ABC_DEF0)));
+    });
+}
+
+fn bench_observation_batch(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ObservationBatch::from_values");
+
+    for count in [16, 64, 256] {
+        let values = single_campus_values(count);
+        group.throughput(Throughput::Elements(count as u64));
+        group.bench_with_input(BenchmarkId::new("single_campus", count), &values, |b, vals| {
+            b.iter(|| ObservationBatch::from_values(black_box(vals), 4));
+        });
+    }
+
+    for count in [16, 64, 256] {
+        let values = multi_campus_values(count);
+        group.throughput(Throughput::Elements(count as u64));
+        group.bench_with_input(BenchmarkId::new("multi_campus", count), &values, |b, vals| {
+            b.iter(|| ObservationBatch::from_values(black_box(vals), 4));
+        });
+    }
+
+    group.finish();
+}
+
+// ─── Ingest (core hot path) ─────────────────────────────────
+
+fn bench_ingest_cold(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ingest_cold");
+
+    for batch_size in [1, 8, 32] {
+        let values = single_campus_values(batch_size);
+        group.throughput(Throughput::Elements(batch_size as u64));
+        group.bench_with_input(BenchmarkId::new("single_campus", batch_size), &values, |b, vals| {
+            b.iter_with_setup(
+                || SpectralSentinel::new(bench_config()).unwrap(),
+                |mut s| s.ingest(black_box(vals)),
+            );
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_ingest_warm(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ingest_warm");
+    let cfg = bench_config();
+
+    for batch_size in [1, 8, 32, 64] {
+
        let values = single_campus_values(batch_size);
+        group.throughput(Throughput::Elements(batch_size as u64));
+        group.bench_with_input(BenchmarkId::new("single_campus", batch_size), &values, |b, vals| {
+            b.iter_with_setup(|| warmed_sentinel(&cfg), |mut s| s.ingest(black_box(vals)));
+        });
+    }
+
+    // Multi-campus: triggers coordination tier.
+    for batch_size in [16, 64] {
+        let values = multi_campus_values(batch_size);
+        group.throughput(Throughput::Elements(batch_size as u64));
+        group.bench_with_input(BenchmarkId::new("multi_campus", batch_size), &values, |b, vals| {
+            b.iter_with_setup(|| warmed_sentinel(&cfg), |mut s| s.ingest(black_box(vals)));
+        });
+    }
+
+    group.finish();
+}
+
+// ─── Ingest at realistic scale ──────────────────────────────
+
+fn bench_ingest_realistic(c: &mut Criterion) {
+    let mut group = c.benchmark_group("ingest_realistic");
+    let cfg = realistic_config();
+
+    for batch_size in [16, 64, 256] {
+        let values = multi_campus_values(batch_size);
+        group.throughput(Throughput::Elements(batch_size as u64));
+        group.bench_with_input(BenchmarkId::new("multi_campus", batch_size), &values, |b, vals| {
+            b.iter_with_setup(|| warmed_sentinel(&cfg), |mut s| s.ingest(black_box(vals)));
+        });
+    }
+
+    group.finish();
+}
+
+// ─── Noise injection ──────────────────────────────────────
+
+fn bench_inject_noise(c: &mut Criterion) {
+    let mut group = c.benchmark_group("inject_noise");
+    let cfg = bench_config();
+
+    for rounds in [10, 50] {
+        let params = NoiseParams {
+            rounds,
+            batch_size: 16,
+            seed: Some(42),
+        };
+        group.bench_with_input(BenchmarkId::new("rounds", rounds), &params, |b, params| {
+            b.iter_with_setup(
+                || {
+                    let mut s = SpectralSentinel::new(cfg.clone()).unwrap();
+                    // Need at least one campus for noise to do anything.
+                    s.ingest(&single_campus_values(4));
+                    s
+                },
+                |mut s| s.inject_noise(black_box(params)),
+            );
+        });
+    }
+
+    group.finish();
+}
+
+// ─── Health / inspection ──────────────────────────────────
+
+fn bench_health(c: &mut Criterion) {
+    let cfg = bench_config();
+    let s = warmed_sentinel(&cfg);
+
+    c.bench_function("health", |b| {
+        b.iter(|| s.health());
+    });
+}
+
+// ─── Per-sample scoring overhead ──────────────────────────
+
+fn bench_per_sample_overhead(c: &mut Criterion) {
+    let mut group = c.benchmark_group("per_sample_scores");
+
+    let batch_size = 32;
+    let values = single_campus_values(batch_size);
+
+    for enabled in [false, true] {
+        let cfg = SentinelConfig {
+            per_sample_scores: enabled,
+            ..bench_config()
+        };
+
+        group.throughput(Throughput::Elements(batch_size as u64));
+        group.bench_with_input(BenchmarkId::new("enabled", enabled), &values, |b, vals| {
+            b.iter_with_setup(|| warmed_sentinel(&cfg), |mut s| s.ingest(black_box(vals)));
+        });
+    }
+
+    group.finish();
+}
+
+// ─── Depth scaling ────────────────────────────────────────
+
+fn bench_depth_scaling(c: &mut Criterion) {
+    let mut group = c.benchmark_group("depth_scaling");
+    let batch_size = 16;
+    let values = single_campus_values(batch_size);
+
+    for num_depths in [1, 2, 4, 8] {
+        let depths: Vec<_> = [8, 16, 24, 32, 48, 64, 96, 128].into_iter().take(num_depths).collect();
+
+        let cfg = SentinelConfig {
+            prefix_depths: depths,
+            ..bench_config()
+        };
+
+        group.throughput(Throughput::Elements(batch_size as u64));
+        group.bench_with_input(BenchmarkId::new("num_depths", num_depths), &values, |b, vals| {
+            b.iter_with_setup(|| warmed_sentinel(&cfg), |mut s| s.ingest(black_box(vals)));
+        });
+    }
+
+    group.finish();
+}
+
+// ─── Registration ─────────────────────────────────────────
+
+criterion_group!(encoding, bench_centred_bits, bench_observation_batch,);
+
+criterion_group!(ingest, bench_ingest_cold, bench_ingest_warm, bench_ingest_realistic,);
+
+criterion_group!(auxiliary,
bench_inject_noise, bench_health,); + +criterion_group!(scaling, bench_per_sample_overhead, bench_depth_scaling,); + +criterion_main!(encoding, ingest, auxiliary, scaling); diff --git a/packages/sentinel/docs/algorithm.md b/packages/sentinel/docs/algorithm.md new file mode 100644 index 00000000..3439eefa --- /dev/null +++ b/packages/sentinel/docs/algorithm.md @@ -0,0 +1,1085 @@ +# Spectral Sentinel — Algorithm Specification + +## 1. The algorithm + +The Spectral Sentinel is a hierarchical online anomaly detector for +streams of positionally structured `u128` values. It maintains one +low-rank linear subspace model per (campus, prefix-depth) pair, +scores each incoming batch against that model, and then evolves the +model to incorporate the new data. + +**Design principle.** The sentinel measures; the host decides. All +outputs are raw statistical quantities — the sentinel never emits +threat levels, recommended actions, or policy decisions. + +### 1.1 Notation + +| Symbol | Domain | Definition | +| ------------------- | ------------------------------------------- | -------------------------------------------------------------------------------- | +| $d$ | $\{1, \ldots, 128\}$ | Prefix depth (dimensionality of the working space) | +| $b$ | $\mathbb{Z}_{>0}$ | Batch size (number of observations in one call) | +| $k$ | $\{1, \ldots, \text{cap}\}$ | Current active rank of the learned subspace | +| $\text{cap}$ | $\{1, \ldots, \min(d, r_{\max})\}$ | Hard ceiling on rank; $r_{\max}$ is `max_rank` from config | +| $\lambda$ | $(0, 1)$ | Forgetting factor (`forgetting_factor`) | +| $\alpha$ | $(0, 1)$ | Learning rate; $\alpha = 1 - \lambda$ | +| $\varepsilon$ | $\mathbb{R}_{>0}$ | Numerical stability constant (`eps`) | +| $\tau$ | $(0, 1)$ | Cumulative energy threshold (`energy_threshold`) | +| $U$ | $\mathbb{R}^{d \times \text{cap}}$ | Orthonormal basis of the learned subspace | +| $\sigma$ | $\mathbb{R}^{\text{cap}}_{\geq 0}$ | Singular values (energy per basis vector) | 
+| $\mu^{(z)}$ | $\mathbb{R}^{\text{cap}}$ | EWMA mean of latent coordinates | +| $\nu^{(z)}$ | $\mathbb{R}^{\text{cap}}_{>0}$ | EWMA variance of latent coordinates | +| $C$ | $\mathbb{R}^{\text{cap} \times \text{cap}}$ | EWMA cross-correlation matrix of latent coordinates | +| $X$ | $\mathbb{R}^{b \times d}$ | Centred observation matrix (one batch) | +| $Z$ | $\mathbb{R}^{b \times k}$ | Latent projection of $X$ | +| $\hat{X}$ | $\mathbb{R}^{b \times d}$ | Reconstruction of $X$ from $Z$ | +| $\lambda_s$ | $(\lambda, 1)$ | Slow EWMA decay for per-tracker CUSUM (`cusum_slow_decay`) | +| $\lambda_{s,m}$ | $(\lambda, 1)$ | Slow EWMA decay for meta-tracker CUSUM (`cusum_meta_slow_decay`) | +| $\kappa_\sigma$ | $\mathbb{R}_{\geq 0}$ | CUSUM noise allowance in slow-baseline $\sigma$ units (`cusum_allowance_sigmas`) | +| $S$ | $\mathbb{R}_{\geq 0}$ | CUSUM accumulator value | +| $\mu^{(\text{in})}$ | $\mathbb{R}^4$ | Running-mean centering reference for meta-tracker input | + +Vectors are row vectors when they represent observations and column +vectors when they represent basis directions. Subscript $i$ indexes +samples ($i = 1, \ldots, b$); subscript $j$ indexes subspace +dimensions ($j = 1, \ldots, k$). + +### 1.2 Input encoding + +**Centred bit representation.** Each raw value +$v \in \{0, \ldots, 2^{128} - 1\}$ becomes a 128-dimensional centred +bit vector $\mathbf{x} \in \{-0.5, +0.5\}^{128}$: + +$$x_i = \begin{cases} +0.5 & \text{if bit } (127 - i) \text{ of } v \text{ is 1} \\ -0.5 & \text{otherwise} \end{cases} \qquad i = 0, \ldots, 127$$ + +Index 0 is the most significant bit. Centring gives +$\mathbb{E}[x_i] = 0$ under a uniform bit distribution, which is a +prerequisite for subspace analysis without explicit mean subtraction. + +Every centred bit vector has a fixed L2 norm: + +$$\|\mathbf{x}\|^2 = \frac{d}{4}$$ + +This fixed-energy property has important consequences for scoring +(see §3.2). 
+ +**Campus assignment.** The top $c$ bits (`campus_bits`) of $v$ +determine a campus bucket: + +$$\text{campus}(v) = v \gg (128 - c)$$ + +This produces $2^c$ buckets. Each campus maintains an independent set +of subspace trackers. + +**Prefix extraction.** For prefix depth $d$, the working observation +is the first $d$ components of the centred bit vector: + +$$\mathbf{x}^{(d)} = (x_0, \ldots, x_{d-1}) \in \mathbb{R}^d$$ + +### 1.3 Hierarchical dispatch + +Processing a batch of raw `&[u128]`: + +1. Convert to centred bit vectors with campus assignments. +2. For each distinct campus $c$ in the batch: + 1. For each configured prefix depth $d$: + 1. Extract the $d$-dimensional prefix vectors for campus $c$. + 2. Assemble into $X \in \mathbb{R}^{b_c \times d}$. + 3. Feed $X$ to the tracker for key $(c, d)$, creating it + lazily if needed. + 4. Collect the resulting `PrefixReport`. + 2. Assemble prefix reports into a `CampusReport`. +3. Assemble campus reports into a `BatchReport`. +4. For each configured prefix depth $d$: + 1. If fewer than 2 campuses reported for $d$, emit `None`. + 2. Otherwise, collect each campus's batch mean scores into + a 4-dimensional vector + $\mathbf{o}_c = (\bar{s}_{c,\text{nov}}, \bar{s}_{c,\text{disp}}, \bar{s}_{c,\text{surp}}, \bar{s}_{c,\text{coh}})$. + 3. Centre by subtracting the running-mean reference + $\mu^{(\text{in})}$ (§5.2). + 4. Feed the centred matrix to the meta-tracker for depth $d$. + 5. Collect the resulting `CoordinationReport`. + 6. Update $\mu^{(\text{in})}$ with the batch column means. +5. Attach coordination reports to the `BatchReport`. + +### 1.4 The core loop + +Each `SubspaceTracker` processes a batch $X$ in five strictly +ordered phases. Scoring precedes evolution — the batch is measured +against the _prior_ model, then the model is updated. + +#### Phase 1 — Score against the prior model + +Let $U_k = U_{:, :k}$ denote the first $k$ columns of the basis. 
+ +$$Z = X \, U_k \qquad \hat{X} = Z \, U_k^\top \qquad R = X - \hat{X}$$ + +From $Z$, $\hat{X}$, and $R$, four per-sample scores are computed +(§2). Batch summary statistics and optional per-sample vectors are +assembled from the per-sample values. + +#### Phase 2 — Evolve subspace (streaming thin SVD) + +Construct the combined matrix: + +$$M = \begin{bmatrix} \sqrt{\lambda} \; U_k \, \operatorname{diag}(\sigma_{:k}) & \Big| & X^\top \end{bmatrix} \in \mathbb{R}^{d \times (k + b)}$$ + +The left block is the exponentially decayed memory; the right block +is the new data. Compute the thin SVD: + +$$M = \tilde{U} \, \tilde{S} \, \tilde{V}^\top$$ + +Retain the top $n = \min(\min(d, k+b), \; \text{cap})$ components: + +$$U_{:, :n} \leftarrow \tilde{U}_{:, :n} \qquad \sigma_{:n} \leftarrow \operatorname{diag}(\tilde{S})_{:n}$$ + +$\tilde{V}$ is discarded — it describes the mixing coefficients of +the combined columns and is not needed once the basis is updated. + +Without reinforcement, a singular value decays as +$\sigma^{(n)} = \lambda^{n/2} \, \sigma^{(0)}$. The energy +half-life is $t_{1/2} = \ln 2 / \ln(1/\lambda)$. + +| $\lambda$ | $t_{1/2}$ (steps) | Character | +| --------- | ----------------- | ------------- | +| 0.99 | $\approx 69$ | Long memory | +| 0.95 | $\approx 14$ | Medium memory | +| 0.90 | $\approx 7$ | Short memory | + +#### Phase 3 — Evolve latent distribution + +**Per-dimension mean and variance:** + +$$\mu^{(z)}_j \leftarrow \lambda \, \mu^{(z)}_j + \alpha \, \bar{Z}_j \qquad j = 1, \ldots, k$$ + +$$\nu^{(z)}_j \leftarrow \lambda \, \nu^{(z)}_j + \alpha \, \max\!\Big(\operatorname{Var}(Z_{:,j}), \; \varepsilon\Big) \qquad j = 1, \ldots, k$$ + +**Pairwise cross-correlation:** + +$$C_{jl} \leftarrow \lambda \, C_{jl} + \alpha \, \frac{1}{b} \sum_{i=1}^{b} z_{ij} \, z_{il} \qquad j < l$$ + +Only the upper triangle is stored. When rank increases, new entries +are initialised to zero. When rank decreases, outer entries are +ignored but preserved. 
+ +#### Phase 4 — Update score baselines and CUSUM + +Each scoring axis maintains three components (§4): + +1. **Fast EWMA** ($\lambda$) — the instantaneous baseline for + z-score computation. +2. **Slow EWMA** ($\lambda_s$) — the long-memory reference for + CUSUM drift detection. +3. **CUSUM accumulator** — cumulative evidence of sustained + upward departure from the slow baseline. + +**Exception:** coherence does not exist while $k < 2$ (see §2.4). +Scores are identically zero, z-scores are computed against cold +placeholders, and the baselines do not evolve. If rank drops +from $\geq 2$ back to $1$, the coherence baselines are destroyed +(returned to cold state) so stale state cannot leak across rank +epochs. + +#### Phase 5 — Adapt rank + +Every `rank_update_interval` steps: + +1. Compute cumulative energy fractions: + +$$c_i = \frac{\sum_{j=1}^{i} \sigma_j^2}{\sum_{j=1}^{\text{cap}} \sigma_j^2 + \varepsilon} \qquad i = 1, \ldots, \text{cap}$$ + +2. Find the target rank: + +$$k^* = \min\!\big\{i : c_i \geq \tau\big\} + 1 \qquad \text{clamped to } [1, \text{cap}]$$ + +3. Move rank by at most one step: + +$$k \leftarrow k + \operatorname{clamp}(k^* - k, \; -1, \; +1)$$ + +--- + +## 2. The four scoring axes + +Each observation is scored along four independent axes. All four +share a uniform polarity: **higher values indicate greater anomalous +departure from baseline**. + +### 2.1 Novelty (subspace axis) + +$$\text{novelty}_i = \frac{\|\mathbf{x}_i - \hat{\mathbf{x}}_i\|^2}{d - k}$$ + +Average residual energy per orthogonal degree of freedom. High +values mean the observation contains structure the model cannot +explain. Range: $[0, \infty)$. + +### 2.2 Displacement (campus axis) + +$$\text{displacement}_i = \frac{\|\mathbf{z}_i\|^2}{k + \|\mathbf{z}_i\|^2}$$ + +Bounded distance from the subspace origin. Values near 0 indicate +proximity to the centroid; values near 1 indicate large +displacement. Range: $[0, 1)$. 
+ +### 2.3 Surprise (campus axis) + +$$\text{surprise}_i = \frac{1}{k} \sum_{j=1}^{k} \frac{(z_{ij} - \mu^{(z)}_j)^2}{\nu^{(z)}_j + \varepsilon}$$ + +Average Mahalanobis deviation per latent dimension. A direction +with high expected variance must deviate proportionally more to +trigger surprise. Range: $[0, \infty)$. + +### 2.4 Coherence (campus axis) + +$$\text{coherence}_i = \frac{2}{k(k-1)} \sum_{j < l} \big(z_{ij} \, z_{il} - C_{jl}\big)^2$$ + +Average squared deviation of pairwise latent products from their +historical expectations. Detects unusual _combinations_ of +activations when individual dimensions have normal magnitudes. +Range: $[0, \infty)$. Defined as $0$ when $k = 1$. + +**Coherence does not exist at $k = 1$.** With fewer than two +latent dimensions there are no pairs, so coherence is identically +zero — it is not a meaningful scoring axis. The coherence EWMA +and CUSUM baselines are **not updated** while $k < 2$. When rank +first reaches 2, the baselines are cold and the first real +coherence values enter through the EWMA's cold→warm path, setting +the baseline directly. + +**Baseline destruction on rank drop.** If rank subsequently falls +from $\geq 2$ back to $1$ (Phase 5), the coherence baselines are +**destroyed** — both the fast and slow EWMAs and the CUSUM +accumulator are returned to the cold state. This prevents stale +state from a previous $k \geq 2$ epoch from contaminating a +future one. When rank reaches $2$ again the baseline is born +fresh, as if the axis had never existed. 
+ +### 2.5 Summary + +| Axis | Score | Range | Measures | +| -------- | ---------------- | ------------- | --------------------------------- | +| Subspace | **Novelty** | $[0, \infty)$ | Unexplained structure | +| Campus | **Displacement** | $[0, 1)$ | Distance from centroid | +| Campus | **Surprise** | $[0, \infty)$ | Per-dimension magnitude deviation | +| Campus | **Coherence** | $[0, \infty)$ | Pairwise co-activation deviation | + +### 2.6 Geometric picture + +``` +Full observation space ℝ^d +┌─────────────────────────────────────────────────┐ +│ │ +│ Learned subspace ℝ^k │ +│ ┌─────────────────────┐ │ +│ │ │ │ +│ │ μ^(z) ← centroid │ Residual: x − x̂ │ +│ │ · │ ←─────── novelty ──→ │ +│ │ /| │ │ +│ │ / | │ │ +│ │ z | │ │ +│ │ | │ │ +│ │ ├──┤ displacement │ │ +│ │ │ │ (raw dist) │ │ +│ │ │ │ │ │ +│ │ ├──┤ surprise │ │ +│ │ │ │ (Mahalanobis) │ │ +│ │ | │ │ +│ │ z₁·z₂ ← coherence │ │ +│ │ (cross-correlation)│ │ +│ └─────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────┘ +``` + +### 2.7 Addressed blind spots + +Three attack patterns are invisible to the per-tracker fast EWMA +alone. All three are addressed by dedicated subsystems: + +1. **Slow drift.** Gradual profile shifts where each batch is + barely anomalous but the cumulative change is large. The fast + EWMA absorbs the drift, making z-scores converge to zero. + Addressed by the CUSUM accumulator (§4.3), which accumulates + the gap between batch mean scores and a slow EWMA reference. + The slow baseline lags behind the drift, so the CUSUM grows + monotonically until the slow baseline catches up. + +2. **Partial-campus coordination.** A subset of campuses drifts + gradually while the rest remain normal. Each affected campus's + per-tracker CUSUM accumulates individually. Simultaneously, the + meta-tracker (§5) detects the _asymmetry_ — the distribution + of campus scores develops a structure (bimodal, tailed) that + the meta-tracker's subspace has not seen. 
The meta-CUSUM + accumulates the gradual divergence. This scenario gets + double coverage (§5.6). + +3. **All-campus coordination.** Many campuses each receiving + mildly anomalous traffic that is alarming only in aggregate. + Each per-tracker report is mild; the attack budget is diluted + across all campuses, keeping individual CUSUMs low. Addressed + by the meta-tracker (§5), which treats campus score summaries + as observations in a 4-dimensional space and applies the same + subspace analysis at a second level. CUSUM at the meta level + catches gradual coordinated drift. Running-mean centering + (§3.9) ensures that _uniform_ elevation is not subtracted away. + +### 2.8 Remaining risk: tracker exhaustion + +The sentinel creates trackers lazily — one per (campus, depth) pair +that actually receives traffic. An attacker who can submit +observations with adversarially diverse leading bits (i.e. values +spread across many distinct campus buckets) forces the sentinel to +instantiate up to $2^c \times D$ trackers, each carrying ~73 KB of +persistent state at default depths. + +At `campus_bits = 14` the ceiling is $16\,384 \times 8 \approx 1.2$ +GB. At `campus_bits = 20` it rises to $1\,048\,576 \times 8 \approx +73$ GB — reachable only if the attacker controls enough distinct +/20 prefixes, but a real risk in open-submission systems. + +This is not a _detection_ blind spot — the sentinel's scoring axes +still work correctly — it is a _resource_ attack against the +sentinel itself. The sentinel does not impose policy, so mitigation +belongs to the host: + +- **Monitor `active_trackers`** in `BatchReport` / `HealthReport` + and alert or shed load when it crosses an acceptable threshold. +- **Evict stale trackers.** The host can periodically prune + trackers whose `real_observations` have not grown, reclaiming + memory from one-shot campus buckets. 
+- **Rate-limit campus creation.** Before calling `ingest`, the + host can cap the number of distinct campuses per batch or per + time window. + +Because the sentinel measures and the host decides, tracker +lifecycle management is deliberately outside the sentinel's scope. + +--- + +## 3. Insights + +This section explains the reasoning behind the non-obvious design +choices. + +### 3.1 Why four axes, not one composite score + +A full Mahalanobis distance +$(\mathbf{z} - \mu)^\top \Sigma^{-1} (\mathbf{z} - \mu)$ with a +dense covariance matrix $\Sigma$ would subsume both surprise and +coherence. The design deliberately separates them for three reasons: + +1. **Different operational meaning.** "Each dimension's magnitude is + unusual" and "the combination of dimensions is unusual" are + different threat signatures. The host may want to threshold them + independently. +2. **No matrix inversion.** $\Sigma$ can become singular or + ill-conditioned, especially when $k$ adapts and a new dimension + has no history. The factored approach avoids inversion entirely. +3. **Transparency.** Two simple scores are easier to reason about + and debug than one composite score. + +The three campus-axis scores decompose the latent activation pattern +along orthogonal statistical concerns: + +| Metric | Input | Measures | +| ------------ | ---------------------------------------- | ------------------------------------------------- | +| Displacement | $\|\mathbf{z}_i\|^2$ | Total energy (scalar) | +| Surprise | $(z_{ij} - \mu_j)^2 / \nu_j$ per $j$ | Per-dimension magnitude (diagonal of covariance) | +| Coherence | $(z_{ij} z_{il} - C_{jl})^2$ per $j < l$ | Pairwise interaction (off-diagonal of covariance) | + +Together they cover the full covariance structure without assembling +or inverting a dense $k \times k$ matrix. + +### 3.2 Why there is no projection energy axis + +Under the centred binary encoding, every observation has the same L2 +norm $\|\mathbf{x}_i\|^2 = d/4$. 
By the Pythagorean theorem: + +$$\underbrace{\|\hat{\mathbf{x}}_i\|^2}_{\text{projection energy}} + \underbrace{\|\mathbf{x}_i - \hat{\mathbf{x}}_i\|^2}_{\text{residual energy}} = \frac{d}{4}$$ + +A "normality" axis $\|\hat{\mathbf{x}}_i\|^2 / k$ would therefore +be a perfect affine function of novelty: + +$$\text{normality}_i = \frac{d}{4k} - \frac{d - k}{k} \cdot \text{novelty}_i$$ + +It carries zero independent information (Pearson correlation $= -1$ +with novelty). It also violates the polarity convention — novel +attacks drive it _downward_ — which would defeat the EWMA outlier +filter (see §3.3). + +This constraint is specific to centred binary inputs. If the sentinel +were extended to continuous-valued inputs where $\|\mathbf{x}_i\|^2$ +varies, normality and novelty would decouple and a projection-energy +axis might become warranted. + +### 3.3 Why displacement, not conformity — and why polarity matters + +The EWMA baseline rejects observations above a ceiling before +updating (§4): + +$$c = \bar{s} + 2\sqrt{\bar{v}}$$ + +This upper-tail filter protects the baseline from poisoning: if an +attacker can only _raise_ a score, the filter clips the attack values +and the baseline stays anchored. All four axes must therefore satisfy +a **polarity invariant**: anomalous observations produce higher +values than the normal population mean. + +**The conformity form fails.** The natural measure of campus distance +is conformity $q_i = k / (k + \|\mathbf{z}_i\|^2)$, where normal +observations score near $1$ and distant ones score near $0$. +Anomalies push $q$ _downward_. These low values always pass the +upper-tail filter (they are below the ceiling), so sustained attack +traffic drags the baseline toward the attack distribution: + +$$\bar{s}_{t+1} = \lambda \, \bar{s}_t + (1 - \lambda) \, a \xrightarrow{t \to \infty} a$$ + +After convergence a new attack value produces $\zeta = 0$. The +attack is invisible. 
+ +No complementary axis reliably compensates: an observation that is +far from the centroid but aligns well with the subspace (low +novelty), deviates along high-variance directions (moderate +surprise), and has typical pairwise products (normal coherence) +triggers only the distance axis. + +**The complement restores robustness.** Displacement +$1 - q_i = \|\mathbf{z}_i\|^2 / (k + \|\mathbf{z}_i\|^2)$ flips +the polarity. Anomalies push it toward $1$, where the upper-tail +filter clips them. It is bounded in $[0, 1)$, non-negative, +monotone, bijective ($f^{-1}(y) = ky/(1 - y)$), and algebraically +trivial — the unique transform among the obvious candidates that +satisfies all five requirements: + +| Transform | Polarity | Bounded | Non-negative | Simple | Bijective | +| --------- | -------- | --------------- | ------------ | ------ | --------- | +| $1 - x$ | ✓ | ✓ $[0, 1)$ | ✓ | ✓ | ✓ | +| $1/x$ | ✓ | ✗ $[1, \infty)$ | ✓ | ✓ | ✓ | +| $-\ln x$ | ✓ | ✗ $[0, \infty)$ | ✓ | ✗ | ✓ | +| $-x$ | ✓ | ✓ $[-1, 0)$ | ✗ | ✓ | ✓ | + +### 3.4 Why thin SVD, not full SVD + +Full SVD produces +$\tilde{U}_{\text{full}} \in \mathbb{R}^{d \times d}$, computing +$d - m$ left singular vectors that are immediately discarded. At the +typical operating point ($d = 128$, $k + b \leq 80$), this wastes +roughly $60\%$ of the computation. Thin SVD computes exactly the +columns needed. + +### 3.5 Why the EWMA outlier filter is upper-tail only + +All four scoring axes are non-negative and right-skewed: anomalous +departure inflates scores, never deflates them. A lower-tail filter +would wrongly reject legitimate low scores during quiet periods. The +filter is also skipped entirely on the first update — the cold-start +placeholders ($\bar{s} = 1$, $\bar{v} = 1$) are not a real baseline, +so "outlier" has no meaning yet. + +### 3.6 Why ±1 rank steps + +Rank adaptation clamps changes to ±1 per evaluation. 
Without this, +a tracker near a rank boundary could oscillate between two ranks on +successive evaluations as noise fluctuations push the cumulative +energy across the threshold and back. + +### 3.7 What the input values must be + +The sentinel analyses bit-positional structure: campus buckets from +leading bits, centred bit vectors where each position is a dimension, +prefix slices at multiple depths. This is meaningful only when the +input `u128` values have **hierarchical positional structure** — +leading bits define coarse groupings, successive bits refine them. + +Values with this property include IPv6 addresses, IPv4-mapped +addresses, and similar hierarchically allocated identifiers. +Values _without_ it — cryptographic hashes, random nonces, UUIDs — +have pseudo-random bit distributions that defeat prefix analysis. +The sentinel processes any `u128` without complaint, but its scores +are meaningful only when the data has the expected bit layout. The +host is responsible for this guarantee. + +### 3.8 Why dual-EWMA, not a frozen reference + +The CUSUM accumulator needs a reference that the fast EWMA has +drifted _away from_. Two options: a frozen checkpoint, or a slow +EWMA. + +A frozen reference requires the host to manually reset it after +legitimate traffic regime changes (new users, seasonal patterns, +topology changes). That is operational burden and a policy +decision — it violates "the sentinel measures; the host decides." + +A slow EWMA at decay $\lambda_s > \lambda$ adapts automatically, +just slowly enough that an attack spanning tens to hundreds of +batches gets caught before absorption. The sensitivity window is +controlled by the ratio of half-lives. Self-healing: after a +legitimate regime change, the slow baseline eventually catches up +and the CUSUM returns to zero without intervention. + +### 3.9 Why running-mean centering for the meta-tracker + +The meta-tracker receives 4D score vectors (one per campus per +batch). 
These must be centred before SVD. Two options: + +**Per-batch centering** (subtract this batch's column means) +removes absolute elevation. If all campuses shift up uniformly +(uniform coordination), every centred vector is near zero — the +signal is invisible. + +**Running-mean centering** (subtract an EWMA of historical column +means) preserves the gap between the current batch and the recent +history. If traffic has been gradually shifting, the EWMA lags +behind, creating a residual bias in the centred data. That bias +projects into latent space as elevated displacement. The +meta-CUSUM accumulates it. + +Running-mean centering catches both _differential_ coordination +(some campuses anomalous relative to others) and _uniform_ +coordination (all campuses shifting together). It costs only 4 +extra floats per depth. + +### 3.10 Constant-norm property at the meta level + +The §3.2 proof that projection energy is redundant with novelty +depends on every observation having constant L2 norm ($d/4$). +Meta-tracker input (campus mean-score vectors) violates this — +norms vary across campuses. Projection energy and novelty are no +longer perfectly correlated at $d = 4$. + +In practice this is negligible. At $d = 4$ with cap $\leq 4$, the +rank quickly reaches 3–4, leaving $\leq 1$ residual DOF for novelty +to measure. The four existing axes capture the essential structure. +If continuous-input support becomes first-class, a projection-energy +axis could be reconsidered. + +--- + +## 4. Baseline tracking + +Each scoring axis maintains three components: a fast EWMA for +instantaneous z-scores, a slow EWMA as a long-memory reference, +and a CUSUM accumulator for gradual drift detection. + +### 4.1 Fast EWMA + +The fast EWMA tracks the exponentially-weighted running mean +$\bar{s}$ and variance $\bar{v}$ at decay $\lambda$. + +#### 4.1.1 Update rule + +Given per-sample scores $\mathbf{s} = (s_1, \ldots, s_b)$: + +1. 
**Outlier filtering (warm baselines only).** Compute + $c = \bar{s} + 2\sqrt{\bar{v}}$. Retain samples with $s_i < c$. + If all are rejected, the baseline is unchanged. + +2. **Mean update.** Let $\bar{s}_{\text{batch}}$ be the mean of the + retained samples. + - _Cold → warm:_ $\bar{s} \leftarrow \bar{s}_{\text{batch}}$ + - _Subsequent:_ $\bar{s} \leftarrow \lambda \, \bar{s} + \alpha \, \bar{s}_{\text{batch}}$ + +3. **Variance update.** Let $\bar{v}_{\text{batch}}$ be the + population variance (requires $\geq 2$ samples). + - _Cold → warm:_ $\bar{v} \leftarrow \max(\bar{v}_{\text{batch}}, \; 10^{-4})$ + - _Subsequent:_ $\bar{v} \leftarrow \lambda \, \bar{v} + \alpha \, \max(\bar{v}_{\text{batch}}, \; 10^{-4})$ + + The $10^{-4}$ floor prevents degenerate zero-variance baselines. + +#### 4.1.2 Z-score computation + +$$\zeta(s) = \frac{s - \bar{s}}{\sqrt{\bar{v}} + \varepsilon}$$ + +Two z-scores are reported per axis: + +- $\zeta(\max_i \, s_i)$ — the loudest alarm in the batch. +- $\zeta(\bar{s}_{\text{batch}})$ — sustained elevation indicator. + +### 4.2 Slow EWMA + +Each scoring axis maintains a second EWMA at decay $\lambda_s$ +(per-tracker) or $\lambda_{s,m}$ (meta-tracker), with the same +update rule as §4.1.1 but slower adaptation. The slow baseline +provides the reference for the CUSUM accumulator. + +The constraint $\lambda_s > \lambda$ (and $\lambda_{s,m} > \lambda$) +is structural — the slow baseline _must_ have longer memory than +the fast one, or the CUSUM detects nothing. + +| Decay | Half-life | Role | +| ------------------- | ------------------- | ------------------------ | +| $\lambda = 0.99$ | $\approx 69$ steps | Fast baseline (z-scores) | +| $\lambda_s = 0.999$ | $\approx 693$ steps | Slow reference (CUSUM) | + +### 4.3 CUSUM accumulator + +The CUSUM (Cumulative Sum) accumulator detects sustained upward +drift of batch mean scores away from the slow baseline. 
It is a +one-sided Page's test: + +$$S_t = \max\!\Big(0,\; S_{t-1} + \big(\bar{s}_{\text{batch},t} - \bar{s}_{\text{slow},t}\big) - \kappa\Big)$$ + +where $\bar{s}_{\text{batch},t}$ is the current batch mean, +$\bar{s}_{\text{slow},t}$ is the slow EWMA mean, and $\kappa$ is +the noise allowance: + +$$\kappa = \kappa_\sigma \cdot \sqrt{\bar{v}_{\text{slow},t}}$$ + +The $\max(0, \ldots)$ clamp resets the accumulator when the +deviation reverses — it only builds evidence for sustained +_upward_ drift, matching the polarity invariant. + +Under normal conditions, $\bar{s}_{\text{batch}}$ fluctuates around +$\bar{s}_{\text{slow}}$. The allowance absorbs these fluctuations. +Under a gradual attack, $\bar{s}_{\text{batch}}$ consistently +exceeds $\bar{s}_{\text{slow}} + \kappa$, and the accumulator +grows monotonically. + +### 4.4 CUSUM cold-start + +The CUSUM is meaningless during noise injection — the gap between +fast and slow baselines during warm-up is an initialisation +artefact, not a real signal. The accumulator is **reset to zero** +after noise injection completes. The `steps_since_reset` counter +in `CusumSnapshot` tracks batches since the last reset. + +### 4.5 CUSUM reset semantics + +Two mechanisms, not mutually exclusive: + +1. **Automatic self-healing.** The slow baseline eventually absorbs + a legitimate regime change. As + $\bar{s}_{\text{batch}} - \bar{s}_{\text{slow}} \to 0$, the + accumulator stops growing and the $\max(0, \ldots)$ clamp + drives it back to zero. No host intervention required. + +2. **Host-initiated reset.** The host inspects the alarm, decides + it reflects a legitimate regime change, and zeroes the + accumulator via `reset_cusum()`. The slow baseline continues + adapting on its own. + +--- + +## 5. Cross-campus coordination + +The meta-tracker detects anomalous patterns in the _distribution of +scores across campuses_ for a given prefix depth. It reuses the +`SubspaceTracker` machinery at a second level of abstraction. 
+ +### 5.1 Meta-tracker input + +After all per-tracker scoring for prefix depth $d$, each reporting +campus $c$ contributes a 4-dimensional score summary vector: + +$$\mathbf{o}_c = \big(\bar{s}_{c,\text{nov}},\; \bar{s}_{c,\text{disp}},\; \bar{s}_{c,\text{surp}},\; \bar{s}_{c,\text{coh}}\big) \in \mathbb{R}^4$$ + +These are **raw batch mean scores** (not z-scores). The meta-tracker +learns its own normalisation via its subspace; raw means preserve +absolute magnitude information. + +Assemble $O \in \mathbb{R}^{n_c \times 4}$ where $n_c$ is the +number of campuses reporting for this depth in this batch. + +### 5.2 Running-mean centering + +The orchestrator maintains a 4D EWMA reference +$\mu^{(\text{in})} \in \mathbb{R}^4$ at decay $\lambda$. Before +feeding the meta-tracker: + +$$O_{\text{centred}} = O - \mathbf{1}_{n_c} \cdot \big(\mu^{(\text{in})}\big)^\top$$ + +After feeding, update $\mu^{(\text{in})}$ with the batch column +means. Running-mean centering catches both _differential_ +coordination (some campuses anomalous relative to others) and +_uniform_ coordination (all campuses shifting together). See §3.9. + +### 5.3 Meta-tracker operation + +The meta-tracker is a `SubspaceTracker` with $d = 4$ and +$\text{cap} = \min(4, r_{\max})$. It runs the same five-phase core +loop (§1.4), producing the same four scoring axes. At this level: + +| Meta-axis | Detects | +| ---------------- | ------------------------------------------------------------------------------ | +| **Novelty** | A campus-score pattern the model has never seen — a new _kind_ of coordination | +| **Displacement** | The overall score landscape has shifted away from the learned centroid | +| **Surprise** | A specific scoring axis is system-wide anomalous | +| **Coherence** | An unusual _combination_ of axis elevations | + +Each meta-axis carries its own fast EWMA, slow EWMA, and CUSUM +accumulator at decay $\lambda_{s,m}$. 
+ +### 5.4 Minimum campuses + +The meta-tracker requires $n_c \geq 2$ to operate. With a single +campus, centering produces the zero vector and the SVD is degenerate. +When fewer than 2 campuses report for a depth, the coordination +report is `None`. As traffic diversifies, meta-trackers activate +naturally. The host should lower `campus_bits` if batches rarely +contain $\geq 2$ distinct campuses. + +### 5.5 Chained noise injection + +The meta-tracker warms naturally during per-tracker noise injection. +When the orchestrator injects noise into per-trackers, those trackers +produce scores. The orchestrator collects those scores, assembles +4D summary vectors, and feeds them through the meta-tracker's +`observe` path. This warms the meta-tracker with score distributions +that reflect _actual noise-on-noise_ baselines. + +After injection, the meta-tracker's CUSUM accumulators are reset to +zero alongside the per-tracker CUSUMs. + +### 5.6 Coverage matrix + +``` + Sudden Gradual Sudden Gradual Sudden Gradual + single single partial partial all all + +Per-tracker z ✓ ✗ absorb ✓ each ✗ absorb ✗ mild ✗ both +Per-tracker CUSUM (redund.) ✓ (redund.) ✓ each ✗ mild ✗ diluted +Meta-tracker z ✗ single ✗ absorb ✓ asym. ✗ absorb ✓ ✗ absorb +Meta-tracker CUSUM ✗ single ✗ single (redund.) ✓ asym. (redund.) ✓ +``` + +Every cell is covered by at least one tier. + +**Partial vs. all-campus gradual.** The partial-campus gradual +column is the strongest-covered scenario: per-tracker CUSUM catches +each affected campus individually _and_ meta-tracker CUSUM catches +the asymmetric pattern across campuses. This is stronger than +all-campus gradual, where the attack budget is diluted across every +campus, making individual per-tracker CUSUM values mild. The +meta-tracker CUSUM is the primary detector for all-campus gradual; +for partial-campus gradual it provides a corroborating second signal. 
+ +The host can distinguish partial from all-campus coordination by +comparing per-campus CUSUM values: if only a subset of campuses +show elevated `cusum.accumulator` while the coordination report's +CUSUM is also elevated, the attack is partial. If per-campus CUSUMs +are uniformly mild but the coordination CUSUM is elevated, the +attack is spread thin across all campuses. + +--- + +## 6. Noise injection and maturity + +### 6.1 Purpose + +A newly created tracker has a random orthonormal basis and +placeholder baselines. Without warming, its first real scores would +be dominated by initialisation artefacts. Noise injection feeds +synthetic uniformly random centred vectors through the standard +`observe` path to: + +1. **Warm the EWMA baselines** to reflect the score distribution of + structureless noise rather than placeholders. +2. **Diversify the subspace** toward a generic representation of + random structure. + +### 6.2 Noise parameters + +| Parameter | Type | Default | Effect | +| ------------ | ------------- | ---------- | --------------------------------------- | +| `rounds` | `usize` | 50 | Number of synthetic batches per tracker | +| `batch_size` | `usize` | 16 | Samples per synthetic batch | +| `seed` | `Option` | `Some(42)` | RNG seed (`None` = system entropy) | + +### 6.3 Maturity tracking + +Each tracker records a `TrackerMaturity`: + +| Field | Type | Meaning | +| -------------------- | ----- | --------------------------------------------------------------------------- | +| `real_observations` | `u64` | Count of genuine observations processed | +| `noise_observations` | `u64` | Count of noise observations processed | +| `noise_influence` | `f64` | Estimated fraction of baseline not yet established by real data (see below) | + +Noise influence decays as: + +$$\eta_{t+1} = \begin{cases} \lambda \, \eta_t + (1 - \lambda) & \text{noise observation} \\ \lambda \, \eta_t & \text{real observation} \end{cases}$$ + +After $n$ real observations: $\eta_n = 
\lambda^n$. The sentinel +reports $\eta$ without interpretation. + +**Cold-start note.** Every new tracker starts with $\eta = 1.0$, +including trackers that have never received noise injection. The +initial baselines are arbitrary placeholders — not a learned +model — so $\eta = 1.0$ is correct: zero percent of the baseline +is established by real data. + +To distinguish a cold tracker from a noise-warmed one, check +`noise_observations`: + +| `noise_observations` | $\eta$ | Interpretation | +| -------------------- | ---------- | --------------------------------------------- | +| $0$ | $1.0$ | Cold — baselines are placeholders | +| $0$ | $< 1.0$ | Maturing on real data only, no noise injected | +| $> 0$ | near $1.0$ | Noise-warmed, little real data yet | +| $> 0$ | near $0.0$ | Mature — noise influence has decayed away | + +--- + +## 7. Configuration + +| Parameter | Type | Default | Constraint | Effect | +| ------------------------ | --------- | ------------------------- | ------------------- | --------------------------------------- | +| `max_rank` | `usize` | 16 | $\geq 1$ | Hard ceiling on subspace rank | +| `forgetting_factor` | `f64` | 0.99 | $(0, 1)$ | Exponential decay rate $\lambda$ | +| `rank_update_interval` | `u64` | 100 | $\geq 1$ | Steps between rank adaptation | +| `campus_bits` | `u8` | 14 | $[1, 32]$ | $\log_2$ of campus bucket count | +| `prefix_depths` | `Vec` | [8,16,24,32,48,64,96,128] | each $\in [1, 128]$ | Analysed prefix granularities | +| `energy_threshold` | `f64` | 0.90 | $(0, 1)$ | Cumulative variance target for rank | +| `eps` | `f64` | $10^{-6}$ | $> 0$ | Numerical stability constant | +| `cusum_slow_decay` | `f64` | 0.999 | $(\lambda, 1)$ | Slow EWMA decay for per-tracker CUSUM | +| `cusum_meta_slow_decay` | `f64` | 0.999 | $(\lambda, 1)$ | Slow EWMA decay for meta-tracker CUSUM | +| `cusum_allowance_sigmas` | `f64` | 0.5 | $\geq 0$ | CUSUM noise allowance in $\sigma$ units | +| `per_sample_scores` | `bool` | false | — | Include 
per-observation score vectors | + +--- + +## 8. Report hierarchy + +``` +BatchReport +├── lifetime_observations: u64 +├── active_trackers: usize +├── campus_reports: [CampusReport] +│ ├── campus_id: u32 +│ ├── sample_count: usize +│ └── prefix_reports: [PrefixReport] +│ ├── depth: u8 +│ ├── rank: usize +│ ├── energy_ratio: f64 +│ ├── top_singular_value: f64 +│ ├── maturity: TrackerMaturity +│ ├── scores: AnomalyScores +│ │ ├── novelty: ScoreDistribution { ... } +│ │ ├── displacement: ScoreDistribution { ... } +│ │ ├── surprise: ScoreDistribution { ... } +│ │ └── coherence: ScoreDistribution { ... } +│ └── per_sample: Option<[SampleScore]> +│ └── { novelty, displacement, surprise, coherence, +│ novelty_z, displacement_z, surprise_z, coherence_z } +└── coordination: [Option] + ├── depth: u8 + ├── campuses_reporting: usize + ├── rank: usize + ├── energy_ratio: f64 + ├── top_singular_value: f64 + ├── maturity: TrackerMaturity + ├── scores: AnomalyScores { ... } + └── per_campus: Option<[SampleScore]> + +ScoreDistribution +├── min, max, mean: f64 +├── max_z_score, mean_z_score: f64 +├── baseline: BaselineSnapshot { mean, variance } +└── cusum: CusumSnapshot { accumulator, slow_baseline, steps_since_reset } +``` + +A `HealthReport` is available on demand, summarising rank +distribution, maturity distribution, and tracker population. + +--- + +## 9. 
Matrix dimensions + +### 9.1 Per-tracker matrices + +For a single `observe()` call with prefix depth $d$, active rank +$k$, batch size $b$, and capacity $\text{cap}$: + +| Matrix | Shape | Notes | +| ------------------------------ | -------------------------- | ----------------------------------------------- | +| $X$ | $(b, d)$ | Centred prefix bits (input) | +| $U$ | $(d, \text{cap})$ | Orthonormal basis (columns $:k$ active) | +| $\sigma$ | $(\text{cap},)$ | Singular values (entries $:k$ meaningful) | +| $Z = X \, U_{:,:k}$ | $(b, k)$ | Latent projection | +| $\hat{X} = Z \, U_{:,:k}^\top$ | $(b, d)$ | Reconstruction | +| $M$ | $(d, \; k + b)$ | Combined matrix for streaming SVD | +| $\tilde{U}$ | $(d, \; \min(d, k+b))$ | Thin left singular vectors | +| $\tilde{S}$ | $(\min(d, k+b),)$ | Thin singular values | +| $\mu^{(z)}$ | $(\text{cap},)$ | Latent mean (only $:k$ active) | +| $\nu^{(z)}$ | $(\text{cap},)$ | Latent variance (only $:k$ active) | +| $C$ | $(\text{cap}, \text{cap})$ | Cross-correlation (upper triangle, $:k$ active) | + +Typical sizes: $d \in \{8, \ldots, 128\}$, $k \in \{1, \ldots, 16\}$, +$b \in \{1, \ldots, 64\}$. The largest thin SVD is approximately +$(128, 80)$ — microsecond range on modern hardware. + +### 9.2 Meta-tracker matrices + +For a single coordination pass at prefix depth $d$ with $n_c$ +reporting campuses and meta-tracker rank $k_m$: + +| Matrix | Shape | Notes | +| -------------------- | ------------ | ------------------------------------- | +| $O$ | $(n_c, 4)$ | Campus mean-score vectors (raw input) | +| $\mu^{(\text{in})}$ | $(4,)$ | Running-mean centering reference | +| $O_{\text{centred}}$ | $(n_c, 4)$ | After subtracting $\mu^{(\text{in})}$ | +| $U_m$ | $(4, k_m)$ | Meta-tracker subspace basis | +| $Z_m$ | $(n_c, k_m)$ | Meta-tracker latent projection | + +Typical sizes: $n_c \in \{2, \ldots, 100\}$, $k_m \in \{1, \ldots, 4\}$. +The meta-tracker thin SVD is at most $(4, n_c + k_m)$ — negligible +compared to per-tracker SVDs. 
+ +--- + +## 10. Complexity + +Let $D = |\texttt{prefix\_depths}|$, $C$ = distinct campuses in a +batch, $n_c$ = campuses reporting per depth, $k_m$ = meta-tracker +rank. + +### 10.1 Per-tracker + +| Operation | Per-tracker | Per-batch | +| --------------------- | ---------------------- | -------------------------------------- | +| Projection / recon | $O(b \cdot d \cdot k)$ | $O(C \cdot D \cdot b \cdot d \cdot k)$ | +| Novelty scoring | $O(b \cdot d)$ | $O(C \cdot D \cdot b \cdot d)$ | +| Displacement/surprise | $O(b \cdot k)$ | $O(C \cdot D \cdot b \cdot k)$ | +| Coherence scoring | $O(b \cdot k^2)$ | $O(C \cdot D \cdot b \cdot k^2)$ | +| Streaming SVD | $O(d \cdot (k+b)^2)$ | $O(C \cdot D \cdot d \cdot (k+b)^2)$ | +| Cross-correlation | $O(b \cdot k^2)$ | $O(C \cdot D \cdot b \cdot k^2)$ | +| EWMA baselines (×2) | $O(1)$ per axis | $O(C \cdot D)$ | +| CUSUM update | $O(1)$ per axis | $O(C \cdot D)$ | +| Rank adaptation | $O(\text{cap})$ | $O(C \cdot D \cdot \text{cap})$ | + +### 10.2 Coordination (meta-tracker) + +| Operation | Per-depth | Per-batch | +| ---------------------- | -------------------------- | ---------------------------------- | +| Running-mean centering | $O(n_c)$ | $O(D \cdot n_c)$ | +| Meta-tracker SVD | $O(4 \cdot (k_m + n_c)^2)$ | $O(D \cdot 4 \cdot (k_m + n_c)^2)$ | +| Meta-scoring | $O(n_c)$ | $O(D \cdot n_c)$ | +| Meta-EWMA + CUSUM | $O(1)$ per axis | $O(D)$ | + +The per-tracker streaming SVD dominates. Coordination overhead is +negligible: the meta-tracker operates at $d = 4$ with small $n_c$. + +--- + +## 11. How the idea developed + +The sentinel began as a Python prototype (`idea.py`) built on PyTorch, +designed as an opinionated end-to-end anomaly verdict system. Its +evolution from prototype to specification involved several +conceptual shifts. 
+ +### 11.1 From verdicts to measurements + +The prototype mapped every batch to a `Verdict` with a `ThreatLevel` +enum (`PEACEFUL`, `SUSPICIOUS`, `AGGRESSIVE`, `CATASTROPHIC`), each +carrying a recommended action (`MONITOR`, `LOG`, `RATE_LIMIT`, `BAN`) +and a human-readable description. A confidence score was computed via +sigmoid transforms on z-scores, and thresholds were baked into the +detector. + +This was removed entirely. A library embedded in a host application +has no business deciding policy. The host knows things the sentinel +cannot: the operational context, the cost of false positives, whether +the traffic is from a trusted peer. The sentinel's job is to produce +measurements — means, z-scores, baseline snapshots — and let the +host decide what they mean. + +### 11.2 From two axes to four + +The prototype scored observations along two axes: + +- **External** — reconstruction error per residual DOF (now: novelty). +- **Internal** — diagonal Mahalanobis distance per rank (now: + surprise). + +This left a gap: an observation could be far from the centroid in +latent space without triggering either axis — it projects well onto +the subspace (low novelty) and deviates along high-variance +directions (moderate surprise). That gap became the **displacement** +axis. + +Similarly, individual latent dimensions could have normal magnitudes +while their _combination_ was unprecedented. That gap became the +**coherence** axis, measuring deviation of pairwise latent products +from their historical cross-correlation. + +The four axes now decompose the full covariance structure — total +energy, diagonal, off-diagonal — without maintaining or inverting a +dense matrix. + +### 11.3 From conformity to displacement + +The prototype computed a "conformity" score: +$k / (k + \|\mathbf{z}\|^2)$, where normal observations scored +high and distant ones scored low. 
This worked for ranking, but +interacted badly with the EWMA outlier filter: attack values that +_lowered_ the score passed through the upper-tail filter and +poisoned the baseline. Analysis showed that the complement form — +displacement — is the unique transform satisfying all robustness +requirements (bounded, non-negative, correct polarity, bijective, +simple). The full proof is in §3.3. + +### 11.4 The elimination of projection energy + +An early design included a "normality" axis measuring projection +energy $\|\hat{\mathbf{x}}\|^2 / k$. Under continuous-valued inputs +this would carry information independent of novelty. But under the +sentinel's centred binary encoding, every observation has the same +L2 norm, making projection energy a perfect affine function of +residual energy. It was dropped as provably redundant (§3.2). + +### 11.5 Campus granularity + +The prototype used `campus_bits = 7` (128 buckets), suitable for +small-scale testing. The specification defaults to +`campus_bits = 14` (16 384 buckets) to provide useful spatial +granularity under diverse real-world observation streams. + +### 11.6 From PyTorch to faer + +The prototype used PyTorch tensors and `torch.linalg.svd`. The Rust +implementation uses the `faer` crate for linear algebra, which +provides thin SVD without a deep learning framework dependency. The +EWMA module was rewritten in pure `f64` arithmetic with no matrix +library dependency at all. + +### 11.7 From blind spots to first-class subsystems + +An early review of the four-axis architecture identified two blind +spots: gradual drift (absorbed by the fast EWMA) and cross-campus +coordination (invisible to per-tracker analysis). Rather than accept +these as limitations, the specification was extended to address both +as first-class subsystems. 
+ +The dual-EWMA + CUSUM combination (§4) was chosen over a frozen +reference because it requires no manual resets and self-heals after +legitimate regime changes — preserving the "sentinel measures, host +decides" principle. The meta-tracker (§5) was chosen over simpler +aggregation (e.g., max-of-campuses) because it captures the +_pattern_ of cross-campus scores, not just their magnitude, and +reuses the existing `SubspaceTracker` machinery. + +### 11.8 CUSUM and the dual-baseline approach + +The CUSUM accumulator was inspired by Page's cumulative sum test +(1954), adapted for online operation with exponentially-weighted +references. The key design decisions: + +1. **Two independent slow decay parameters.** Per-tracker and + meta-tracker CUSUM share the same formula but may need different + sensitivity windows. Separate `cusum_slow_decay` and + `cusum_meta_slow_decay` fields give the host this control. + +2. **σ-normalised allowance.** A single `cusum_allowance_sigmas` + parameter works across all four scoring axes because the + allowance is expressed in units of slow-baseline standard + deviation, not in absolute score units. + +3. **Running-mean centering, not batch-mean centering.** The + meta-tracker subtracts a running-mean reference (EWMA of + historical column means) rather than the current batch's column + means. This preserves the signal from _uniform_ coordination + (all campuses shifting together) which batch-mean centering + destroys. See §3.9 for the full argument. + +4. **Post-injection reset.** The CUSUM is reset to zero after + noise injection. The gap between fast and slow baselines during + warm-up is an initialisation artefact, not real drift. 
diff --git a/packages/sentinel/docs/native-g-v-dual-tree-campuses.md b/packages/sentinel/docs/native-g-v-dual-tree-campuses.md new file mode 100644 index 00000000..8866da80 --- /dev/null +++ b/packages/sentinel/docs/native-g-v-dual-tree-campuses.md @@ -0,0 +1,664 @@ +# Native G-V Dual-Tree Campuses + +How the Spectral Sentinel adopts the Geometric-Value Dual Tree +as its native campus management layer. + +--- + +## 1. The problem + +The Sentinel partitions $[0, 2^{128})$ with a fixed bit prefix: + +$$\text{campus}(v) = v \gg (128 - c)$$ + +This creates $2^c$ equal-width buckets. Each bucket that receives +any traffic gets a lazily instantiated `CampusState` holding $D$ +`SubspaceTracker` instances (~73 KB × $D$ per campus). A `BTreeMap` +maps campus IDs to states. + +Three structural problems follow: + +**Fixed granularity.** At `campus_bits = 14`, a /14 prefix +receiving 99% of traffic and a /14 prefix receiving one packet +each get one tracker set. There is no mechanism to say "this +region deserves finer spatial analysis" or "these thousand +regions should be consolidated." + +**No adaptation.** The spatial partitioning does not respond to +what the Sentinel is learning. A campus where the subspace tracker +is detecting genuinely interesting structure gets the same spatial +treatment as one seeing pure noise. + +**Adversarial exhaustion.** An attacker controlling diverse source +prefixes forces $O(2^c \times D)$ tracker instantiations. The +specification computes: at `campus_bits = 20`, that is ~73 GB. +The four mitigations listed in algorithm.md §2.8 are all host-side +policy decisions — exactly what the Sentinel's design principle +says the sentinel itself should not make. + +--- + +## 2. 
The replacement + +The G-V Dual Tree (specified in `packages/g-v-dual-tree/concept.md`) +replaces: + +- The fixed campus partitioning → **G-Tree** (adaptive dyadic + partition of $[0, 2^{128})$) +- The `BTreeMap` → **V-Tree** (tournament + bracket of G-nodes competing for attention) +- The missing lifecycle management → **depth gates** ($D_{\text{create}}$, + $D_{\text{evict}}$) as structural invariants + +The replacement is architectural. The `SubspaceTracker`, EWMA, +CUSUM, scoring axes, and meta-tracker are unchanged. Only the +layer that decides _which regions exist and carry trackers_ is +replaced. + +--- + +## 3. Design decisions + +Three choices simplify the integration and are fixed for this +specification: + +### 3.1 Intensity = raw observation count + +Each observation increments the V-entry intensity of the terminal +G-node it routes to, plus every G-ancestor holding a V-entry. +The G-V Tree is a pure spatial attention structure: "where is +traffic?" No feedback from anomaly scores into V-entry intensity. + +**Rationale.** A region needs statistical mass to produce reliable +anomaly scores. Observation count is the correct proxy for +"does this region have enough data to justify a tracker?" Coupling +V-intensity to anomaly scores creates a feedback loop whose +stability would need separate analysis. The G-V Tree's temporal +semantics slot (concept.md §13) accepts any intensity source; +this can be revisited later. + +### 3.2 Exhaustive dispatch, not proportional sampling + +Every G-node holding a V-entry is analyzed every batch. The +V-Tree does not select which nodes to analyze per batch — it +selects which nodes _exist at all_. $D_{\text{create}}$ and +$D_{\text{evict}}$ are the only knobs controlling the active +tracker population. + +**Rationale.** The current Sentinel analyzes every active campus +every batch. Exhaustive dispatch preserves completeness and +eliminates detection latency concerns. 
The V-Tree's role is +lifecycle management (which regions earn trackers, which lose +them), not per-batch scheduling. + +### 3.3 All G-nodes with V-entries feed the meta-tracker + +The meta-tracker receives 4D score summaries from every G-node +with a V-entry that reported in the current batch — terminal and +internal, at all G-Tree depths. The meta-tracker's learned +subspace absorbs the fact that nodes at different spatial scales +produce differently-scaled scores. When the cross-node pattern +_changes_, the meta-tracker detects it. + +**Rationale.** The meta-tracker already learns its own +normalisation (algorithm.md §5.3). Variable-width nodes become +part of the learned distribution, not a confound. Restricting to +nodes at similar depths would sacrifice the multi-scale +coordination signal. + +--- + +## 4. Architecture + +### 4.1 Node state + +Each G-node that holds a V-entry carries a `NodeState`: + +``` +NodeState +├── trackers: Vec // one per prefix_depth +├── last_active_batch: u64 // for staleness (informational) +└── maturity: Vec // per prefix_depth +``` + +This is the same `CampusState` from the current implementation, +attached to a G-node instead of a `BTreeMap` entry. + +Internal G-nodes with V-entries also carry `NodeState`. When a +G-node splits, the parent retains its trackers (catalytic +persistence). When a G-node loses its V-entry through eviction, +its `NodeState` is destroyed. + +### 4.2 Component ownership + +``` +SpectralSentinel +├── config: SentinelConfig +├── gv_tree: GVDualTree // replaces BTreeMap +├── meta_states: Vec // one per prefix_depth (unchanged) +├── lifetime_observations: u64 +└── batch_counter: u64 +``` + +The `GVDualTree` is generic over the payload attached to G-nodes +with V-entries. The Sentinel instantiates it with `NodeState`. + +### 4.3 Replaced configuration + +| Removed | Replaced by | +| ----------------- | --------------------------------- | +| `campus_bits: u8` | Gone entirely. The G-Tree adapts. 
| + +| Added | Type | Default | Effect | +| ----------------- | ------- | ------- | ----------------------------------------------- | +| `split_threshold` | `f64` | TBD | Minimum G-node intensity $\theta$ to split | +| `v_depth_create` | `usize` | TBD | $D_{\text{create}}$: max V-depth to allow split | +| `v_depth_evict` | `usize` | TBD | $D_{\text{evict}}$: V-depth triggering eviction | + +The constraint $D_{\text{create}} < D_{\text{evict}}$ (concept.md +§6 D-I3) is enforced at configuration validation time. + +--- + +## 5. Observation flow + +### 5.1 Ingest + +Processing a batch of raw `&[u128]`: + +``` +function ingest(batch): + + // Phase A — Route and accumulate in G-V Tree + for each value v in batch: + x ← centred_bits(v) + terminal ← gv_tree.point_query(v) + + // G-Tree accumulation (permanent) + g ← terminal + while g ≠ null: + g.sum ← g.sum + 1 + g ← g.geo_parent + + // V-entry intensity updates + ensure terminal has V-entry (§8.1 of concept.md) + g ← terminal + while g ≠ null: + if g.entry ≠ null: + g.entry.int ← g.entry.int + 1 + propagate_v_sums(g.entry) + g ← g.geo_parent + + // Attempt split at terminal + attempt_split(terminal) + + // Rebalance V-Tree after all observations + gv_tree.rebalance() + + // Evict entries past D_evict + gv_tree.evict_deep_entries() + + // Phase B — Collect observations per G-node + node_batches ← group observations by their terminal G-node + // Also propagate: each observation appears in its + // terminal's batch AND every ancestor with a V-entry + + // Phase C — Score each G-node with a V-entry + reports ← [] + for each G-node g with V-entry, in V-Tree order: + if g has no observations this batch: continue + state ← g.node_state + obs ← observations routed through g (terminal + inherited) + + node_report ← NodeReport(range = [g.l, g.r), ...) 
+ for each (depth_idx, depth) in config.prefix_depths: + X ← encode obs as prefix vectors at depth d + prefix_report ← state.trackers[depth_idx].observe(X) + node_report.prefix_reports.push(prefix_report) + + reports.push(node_report) + + // Phase D — Coordination tier (unchanged logic) + for each configured prefix depth d: + score_vectors ← collect 4D mean scores from + all nodes that reported at depth d + if score_vectors.len() < 2: coordination[d] = None + else: + centre by subtracting running mean + feed to meta_states[d].tracker + update running mean + coordination[d] = Some(coordination_report) + + // Phase E — Assemble BatchReport + return BatchReport { + lifetime_observations, + active_entries: gv_tree.entry_count(), + node_reports: reports, + coordination, + } +``` + +### 5.2 Key differences from current dispatch + +| Current | With G-V Tree | +| ----------------------------------------------- | --------------------------------------------------------------------------- | +| Observations grouped by fixed campus ID | Observations routed through G-Tree point query | +| Each observation trains one campus tracker | Each observation trains terminal node + ancestor nodes with V-entries | +| Campus creation is unbounded lazy instantiation | Node creation is gated by $\theta$, $D_{\text{create}}$, and V-I3 | +| Campus eviction is host responsibility | Node eviction is automatic at $D_{\text{evict}}$ | +| All campuses same spatial width | Nodes at variable depths, from very coarse to very fine | +| `active_trackers` reported for host monitoring | `active_entries` reported; the _bound_ is structural, not advisory | +| Meta-tracker compares equal-width campuses | Meta-tracker compares variable-width nodes; absorbs scale into its baseline | + +### 5.3 Observation propagation to ancestor trackers + +When a `u128` value $v$ routes to terminal G-node $\ell$, every +G-ancestor of $\ell$ that holds a V-entry also receives the +observation for tracker training. 
This means the same raw value +trains `SubspaceTracker` instances at multiple spatial scales +simultaneously. + +This is the Sentinel's `prefix_depths` concept realized +_spatially_. Currently, each campus trains trackers at 8 fixed +bit depths. With the G-V Tree, each observation also trains +trackers at however many spatial scales the G-Tree has +materialized above the terminal node. + +**Cost.** Per observation, the number of ancestor V-entries is +bounded by the V-Tree height. Under the Fibonacci depth bound +(concept.md §17.1), this is $O(\log_\phi(1/w_i))$ for a node +with weight fraction $w_i$. Heavy nodes (which see most traffic) +have few ancestors. Light nodes have more ancestors but see less +traffic. The expected per-observation ancestor count is +$O(H / \log_2 \phi)$ where $H$ is the entropy of the traffic +distribution. Under concentrated distributions this is $O(1)$. + +Each ancestor tracker update costs one `SubspaceTracker::observe` +call per prefix depth. The total per-batch cost is: + +$$\text{cost} = D \times \sum_{\text{entries}} b_g \cdot O(d \cdot (k + b_g)^2)$$ + +where $b_g$ is the observation count at G-node $g$ (including +inherited observations) and $D = |\text{prefix\_depths}|$. + +--- + +## 6. Lifecycle events + +### 6.1 Node creation (catalytic split) + +When a terminal G-node $g$ splits into children $g_L$ and $g_R$: + +1. **G-Tree.** $g$ becomes internal. $g_L$ and $g_R$ are created + with `sum = 0`. +2. **V-Tree.** $g_L$ and $g_R$ receive V-entries at intensity 0. + A structural node groups them under $g$'s parent in the V-Tree. + $g$'s V-entry persists as uncle (catalytic). +3. **Sentinel.** $g_L$ and $g_R$ receive fresh `NodeState` with + new `SubspaceTracker` instances (one per prefix depth). Each + new tracker is noise-injected immediately (algorithm.md §6). + +$g$'s `NodeState` is **retained**. 
Its trackers continue training +on all observations routed through $g$ (which now includes all +observations routed to $g_L$ and $g_R$). The parent's trained +model provides coarse-scale anomaly detection while the children's +models mature. + +**Noise injection at creation.** Each new tracker needs baseline +warming. The `inject_noise` procedure runs against the new +trackers before they receive real observations. The meta-tracker +is _not_ re-warmed — it is already running. New-node scores enter +the meta-tracker through the normal observation path, starting +from the noise-warmed baselines. + +### 6.2 Node eviction (V-depth exceeds $D_{\text{evict}}$) + +When a V-entry sinks past $D_{\text{evict}}$: + +1. **V-Tree.** Entry removed (concept.md §11.1). +2. **G-Tree.** Node collapsed — children removed, node becomes + terminal with its accumulated `sum` preserved. +3. **Sentinel.** `NodeState` destroyed. All `SubspaceTracker` + instances and their baselines are dropped. CUSUM accumulators + are lost (appropriate — the region has proven insignificant). + +Child G-nodes whose V-entries become ghosts are cleaned up lazily +(concept.md §11.2–11.3). Their `NodeState` instances are +destroyed during ghost cleanup. + +**Meta-tracker impact.** Evicted nodes stop contributing to the +meta-tracker. The meta-tracker's subspace adapts naturally via +its $\lambda$ decay — the absent node's historical influence +fades. No special handling required. + +### 6.3 Node promotion (V-Tree rebalancing) + +When the V-Tree rebalances (concept.md §10), nodes move to +shallower or deeper V-positions. This has **no effect** on +`SubspaceTracker` state. Trackers are attached to G-nodes, not +V-positions. V-Tree depth affects only the lifecycle gates +($D_{\text{create}}$ and $D_{\text{evict}}$), not scoring. + +--- + +## 7. 
Report structure + +### 7.1 Adapted hierarchy + +``` +BatchReport +├── lifetime_observations: u64 +├── active_entries: usize // was: active_trackers +├── gv_tree_stats: TreeStats // new +│ ├── g_node_count: usize +│ ├── v_entry_count: usize +│ ├── v_tree_height: usize +│ └── max_g_depth: usize +├── node_reports: Vec // was: campus_reports +│ ├── range: (u128, u128) // [l, r) — was: campus_id +│ ├── g_depth: usize // depth in G-Tree +│ ├── v_depth: usize // depth in V-Tree +│ ├── g_sum: f64 // accumulated observation count +│ ├── v_intensity: f64 // V-entry intensity +│ ├── sample_count: usize +│ └── prefix_reports: Vec // unchanged +│ ├── depth, rank, energy_ratio, ... +│ ├── scores: AnomalyScores // unchanged +│ ├── maturity: TrackerMaturity // unchanged +│ └── per_sample: Option> +└── coordination: Vec> // unchanged +``` + +`CampusReport` becomes `NodeReport`. The `campus_id: u32` field +(a flat bucket index) becomes `range: (u128, u128)` (the G-node's +dyadic interval) plus depth information in both trees. Everything +below the node level — `PrefixReport`, `AnomalyScores`, +`ScoreDistribution`, `SampleScore` — is unchanged. + +### 7.2 New diagnostics + +`TreeStats` gives the host visibility into the G-V Tree's state: + +- **`g_node_count`** — total materialized G-nodes (internal + + terminal). Analogous to total spatial resolution. +- **`v_entry_count`** — G-nodes with V-entries (each carrying + $D$ trackers). This is the tracker population. +- **`v_tree_height`** — current V-Tree height. Relates to the + depth gates: entries near $D_{\text{evict}}$ are at risk. +- **`max_g_depth`** — deepest G-Tree level. Indicates finest + spatial resolution currently active. + +### 7.3 Health report + +The `HealthReport` gains G-V Tree statistics alongside the +existing rank distribution, maturity distribution, and tracker +population data. The `active_trackers` metric becomes +`v_entry_count × D`. + +--- + +## 8. 
Spray resistance + +### 8.1 Current vulnerability + +At `campus_bits = c`, an attacker touching every bucket forces +$2^c \times D$ tracker instantiations. The algorithm.md §2.8 +ceiling at `campus_bits = 20` is ~73 GB. Mitigations are +host-side policy decisions. + +### 8.2 G-V Tree defence + +Four architectural layers replace host-side policy: + +| Layer | Mechanism | Effect | +| -------------------------------- | -------------------------------------------- | --------------------------------------------------- | +| Intensity gate ($\theta$) | G-node sum must exceed threshold to split | Low-volume spray cannot trigger splits | +| Depth gate ($D_{\text{create}}$) | V-Tree position must be shallow to split | Globally insignificant nodes cannot earn resolution | +| Uncle constraint (V-I3) | Children must outcompete parent's V-entry | Spray-generated entries stay deep, never promote | +| Eviction ($D_{\text{evict}}$) | Deep entries are removed with their trackers | Cold entries are reclaimed automatically | + +### 8.3 Steady-state bound + +Under sustained spray of $R$ observations per batch at intensity +$\Delta = 1$, with the Sentinel's forgetting factor $\lambda$ +applied as exponential decay to V-entry intensities: + +$$L_{\text{steady}} = \frac{R \cdot \Delta}{(1 - \lambda) \cdot \theta} = \frac{R}{(1 - \lambda) \cdot \theta}$$ + +At $\lambda = 0.99$ and $\theta = 1000$: a million-address spray +stabilises at $\frac{10^6}{0.01 \times 1000} = 100{,}000$ entries. +At $\theta = 10{,}000$: 10,000 entries. The ceiling is controlled +by two tunable parameters, independent of the address space size +or the adversary's prefix diversity. + +Total memory: $L_{\text{steady}} \times D \times 73\text{ KB}$. +At 10,000 entries and 8 depths: ~5.7 GB. The host tunes $\theta$ +and the depth gates to fit their memory budget. + +### 8.4 Recovery + +When spray stops, the Sentinel's $\lambda$ decay continues. 
+Spray-generated V-entries lose intensity, sink past +$D_{\text{evict}}$, and are evicted. G-subtrees collapse. The +tree returns to its pre-spray size without host intervention. + +--- + +## 9. Multi-scale analysis + +### 9.1 What changes + +Currently, multi-scale analysis is achieved through +`prefix_depths`: each campus trains $D$ trackers at fixed bit +depths (8, 16, 24, 32, 48, 64, 96, 128). Every campus uses +the same depths regardless of its spatial position. + +With the G-V Tree, multi-scale analysis gains a second dimension: +**spatial scale** (G-Tree depth) × **bit depth** (prefix depths). +An observation at address $v$ trains trackers at: + +- The terminal G-node's $D$ prefix depths +- Each ancestor G-node (with V-entry)'s $D$ prefix depths + +A terminal node at G-depth 24 with two ancestors at depths 14 +and 0 produces $3 \times D$ tracker updates per observation. +The spatial hierarchy gives coarse-to-fine anomaly detection +that adapts to traffic patterns. + +### 9.2 Prefix depth interaction + +The `prefix_depths` parameter continues to control bit-level +analysis granularity within each spatial region. A G-node +covering range $[l, r)$ at G-depth $g$ trains trackers at all +configured prefix depths, including depths shallower than $g$. + +At shallow prefix depths, the centred bit vectors for all +observations within $[l, r)$ share the same leading bits (they +must, since they route to the same G-node). The tracker at that +depth learns the _remaining_ bit-level structure within the +spatially constrained region. + +At deep prefix depths, the full 128-bit structure is available. +The tracker learns fine-grained patterns. + +This interaction is natural and requires no special handling. + +--- + +## 10. Temporal semantics + +The G-V Tree's temporal semantics interface (concept.md §13) is +the plug point for the Sentinel's forgetting factor. 
+ +### 10.1 Exponential decay + +Periodically (e.g., once per batch), multiply all V-entry +intensities by $\lambda$: + +$$v.\text{int} \leftarrow \lambda \cdot v.\text{int} \qquad \text{for all V-entries}$$ + +Concept.md §13 proves this preserves V-I3: uniform scaling +maintains all uncle relationships. V-Tree shape is unchanged. + +This uses the same $\lambda$ as the Sentinel's EWMA forgetting +factor. One parameter controls both: + +- **Sentinel**: how quickly the subspace model forgets old + observations ($\sigma^{(n)} = \lambda^{n/2} \sigma^{(0)}$). +- **G-V Tree**: how quickly a region's accumulated traffic + importance decays, governing when cold regions lose their + trackers. + +### 10.2 The breathing cycle + +Under load: + +1. Traffic concentrates in some regions. +2. G-nodes accumulate intensity, earn shallow V-positions. +3. Depth gate passes → catalytic split → finer resolution. +4. New trackers warm up, begin detecting anomalies. + +When traffic subsides: + +1. $\lambda$ decay reduces V-entry intensities. +2. Entries sink in the V-Tree tournament. +3. Entries cross $D_{\text{evict}}$ → eviction → trackers + destroyed, G-subtrees collapsed. +4. Memory reclaimed. Tree contracts. + +The triad of split / decay / eviction is the mechanism. The +architecture provides split and eviction; the Sentinel provides +decay through $\lambda$. + +--- + +## 11. Migration path + +### 11.1 Phase 1 — G-V Tree as a library + +Implement the G-V Dual Tree as the standalone +`packages/g-v-dual-tree` crate, generic over payload type. +The Sentinel is not modified. The tree is tested independently +against the invariants in concept.md. + +### 11.2 Phase 2 — Integration + +Replace the Sentinel's `BTreeMap` with +`GVDualTree`. Key changes to `sentinel/mod.rs`: + +1. **`SpectralSentinel::new`** — Initialise `GVDualTree` with + $N = 128$, configured $\theta$, $D_{\text{create}}$, + $D_{\text{evict}}$. +2. **`ingest`** — Replace campus-grouped dispatch with the + flow in §5.1 above. 
+3. **`inject_noise`** — Inject into all existing G-node + trackers. Meta-tracker injection is unchanged. +4. **`evict_stale_campuses`** — Removed. Eviction is now + automatic via $D_{\text{evict}}$. +5. **`remove_campus`** — Replaced by manual V-entry removal + (if needed; the automatic lifecycle may make this + unnecessary). +6. **Report assembly** — `CampusReport` → `NodeReport` with + range and depth fields instead of `campus_id`. + +### 11.3 Phase 3 — Configuration + +Remove `campus_bits` from `SentinelConfig`. Add +`split_threshold`, `v_depth_create`, `v_depth_evict`. + +Provide a migration helper that suggests equivalent G-V Tree +parameters for a given `campus_bits` value (e.g., +`campus_bits = 14` ≈ a tree that stabilises at ~16K entries +under typical traffic, achieved by tuning $\theta$ and the +depth gates). + +### 11.4 Backward compatibility + +The `observation.rs` module (`CentredBits`, `CampusBatch`, +`BatchEncoder`) changes structurally. `CampusBatch` is replaced +by G-Tree point query routing. `BatchEncoder` either wraps the +G-V Tree's observation flow or is removed in favour of direct +`GVDualTree::observe` calls. + +The `report.rs` types change as described in §7. This is a +breaking API change. The `CoordinationReport` and all types +below `PrefixReport` are unchanged. + +--- + +## 12. What does not change + +| Component | Status | +| ---------------------- | ---------------------------------------------------------- | +| `SubspaceTracker` | Unchanged. Attached to G-nodes instead of campus entries. | +| Five-phase core loop | Unchanged. Runs per (G-node, prefix_depth) pair. | +| Four scoring axes | Unchanged. Novelty, displacement, surprise, coherence. | +| EWMA baselines | Unchanged. Per-tracker, per-axis. | +| CUSUM accumulators | Unchanged. Per-tracker, per-axis. | +| Meta-tracker | Unchanged. Receives 4D score vectors from reporting nodes. | +| Running-mean centering | Unchanged. EWMA of historical column means. 
| +| Noise injection | Unchanged per-tracker. Triggered on node creation. | +| Maturity tracking | Unchanged. Per-tracker $\eta$ decay. | +| Rank adaptation | Unchanged. ±1 steps per evaluation. | +| Coverage matrix (§5.6) | Unchanged. All six attack patterns still covered. | + +The Sentinel's analytical engine is untouched. Only the layer +that decides _which spatial regions carry that engine_ is +replaced — from a flat, fixed, adversarially-exploitable +partitioning to an adaptive, bounded, self-managing one. + +--- + +## 13. Parameter relationships + +``` + ┌─────────────────────────────┐ + │ Host tunes: │ + │ θ (split threshold) │ + │ D_create (creation gate) │ + │ D_evict (eviction gate) │ + │ λ (forgetting factor) │ + └──────────┬──────────────────┘ + │ + ┌──────────────────────┼──────────────────────┐ + │ │ │ + ▼ ▼ ▼ + G-V Tree Sentinel Steady-state + dynamics memory bound + + θ → min intensity λ → EWMA decay L = R/(1−λ)θ + to split λ → SVD decay + D_create → max λ → CUSUM slow + V-depth to split reference lag + D_evict → eviction λs → CUSUM slow + threshold (independent) + λ → intensity decay + (same parameter) +``` + +$\lambda$ does triple duty: + +1. **Sentinel EWMA/SVD decay** — statistical memory half-life. +2. **G-V Tree intensity decay** — spatial lifecycle half-life. +3. **Steady-state bound denominator** — $(1 - \lambda)$ controls + how many concurrent entries the system sustains. + +$\lambda_s$ (CUSUM slow decay) remains independent. It controls +drift detection sensitivity, not spatial lifecycle. + +--- + +## 14. Summary + +The G-V Dual Tree converts the Sentinel's campus layer from a +static partition with an acknowledged adversarial vulnerability +into an adaptive, self-managing spatial index with structural +resource bounds. + +The Sentinel's analytical engine — subspace tracking, four-axis +scoring, EWMA baselines, CUSUM drift detection, meta-tracker +coordination — is unchanged. 
The G-V Tree handles the concern +that the flat campus scheme could not: deciding which regions of +$[0, 2^{128})$ deserve the Sentinel's analytical attention, and +enforcing that decision with bounded resources regardless of +adversarial strategy. diff --git a/packages/sentinel/src/config.rs b/packages/sentinel/src/config.rs new file mode 100644 index 00000000..9b67e3c5 --- /dev/null +++ b/packages/sentinel/src/config.rs @@ -0,0 +1,409 @@ +//! Configuration types for the Spectral Sentinel. +//! +//! [`SentinelConfig`] controls the measurement parameters of the sentinel. +//! +//! These types are deliberately free of policy concerns (thresholds, actions). +//! The sentinel measures; the host decides what the measurements mean. + +/// Measurement parameters for the sentinel. +/// +/// Every field controls *how* the sentinel observes and learns, +/// never *what it thinks* about what it sees. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct SentinelConfig { + /// Maximum rank (number of basis vectors) any subspace tracker can use. + /// + /// Higher = more expressive model of "normal", but more memory and + /// SVD cost per observation. The actual rank adapts automatically + /// and will never exceed `min(prefix_depth, max_rank)`. + /// + /// Default: `16` + pub max_rank: usize, + + /// Exponential forgetting factor (λ). + /// + /// Controls how fast old observations fade from memory. + /// - `0.99` = long memory (~69 observations half-life) + /// - `0.95` = short memory (~14 observations half-life) + /// + /// Must be in `(0.0, 1.0)`. + /// + /// Default: `0.99` + pub forgetting_factor: f64, + + /// How often (in observation steps) to reassess the rank of each tracker. + /// + /// Rank changes by at most ±1 per evaluation to avoid instability. + /// + /// Default: `100` + pub rank_update_interval: u64, + + /// Number of leading bits used for campus (spatial) bucketing. 
+ /// + /// Produces `2^campus_bits` campus buckets. Each bucket gets its + /// own independent set of subspace trackers. + /// + /// - `14` → 16 384 campuses (default — good granularity for + /// diverse observation streams) + /// - `7` → 128 campuses (coarser, lighter on memory) + /// - `4` → 16 campuses (very coarse) + /// + /// Each campus × prefix-depth pair gets its own tracker, so the + /// theoretical maximum tracker count is `2^campus_bits × len(prefix_depths)`. + /// Trackers are created lazily, but high values still affect memory + /// under diverse traffic. + /// + /// **Resource attack warning.** An adversary who can submit values + /// with diverse leading bits forces lazy creation of up to + /// `2^campus_bits × len(prefix_depths)` trackers (~73 KB each at + /// default depths). The host should monitor `active_trackers` and + /// consider evicting stale trackers or rate-limiting campus + /// creation if memory growth becomes a concern. + /// See `docs/algorithm.md` §2.8 for details. + /// + /// Default: `14` + pub campus_bits: u8, + + /// Which prefix depths to monitor. + /// + /// Each depth produces a separate subspace tracker per campus, + /// analysing traffic at that prefix granularity. More depths = + /// more trackers = more memory and CPU, but finer-grained reports. + /// + /// Each value must be in `[1, 128]`. Duplicates and arbitrary + /// ordering are harmless but wasteful — duplicate depths create + /// redundant trackers doing identical work. The sentinel does not + /// sort or deduplicate; the host is responsible for providing a + /// sensible list. + /// + /// Default: `[8, 16, 24, 32, 48, 64, 96, 128]` + pub prefix_depths: Vec, + + /// Cumulative energy threshold for automatic rank adaptation. + /// + /// The rank adapts to capture at least this fraction of the total + /// variance (sum of squared singular values). Lower = fewer dimensions + /// retained, higher = more faithful representation. + /// + /// Must be in `(0.0, 1.0)`. 
+ /// + /// Default: `0.90` + pub energy_threshold: f64, + + /// Numerical stability constant. + /// + /// Added to denominators to prevent division by zero. + /// + /// Default: `1e-6` + pub eps: f64, + + /// Slow EWMA decay factor for per-tracker CUSUM reference baselines. + /// + /// The CUSUM accumulator detects gradual drift that the fast EWMA + /// (controlled by `forgetting_factor`) absorbs. The slow EWMA + /// provides the reference: CUSUM accumulates the gap between the + /// batch mean score and the slow baseline. + /// + /// Must be in `(0.0, 1.0)` and strictly greater than + /// `forgetting_factor` — a slower memory than the fast baseline. + /// + /// Half-life ≈ `ln(2) / ln(1/λ_s)`: + /// - `0.999` = ~693 steps (default) + /// - `0.995` = ~139 steps + /// + /// Default: `0.999` + pub cusum_slow_decay: f64, + + /// Slow EWMA decay factor for meta-tracker (coordination) CUSUM. + /// + /// Controls the CUSUM reference baseline at the cross-campus + /// coordination tier. Separated from `cusum_slow_decay` because + /// the meta-tracker may see different batch cadences and the + /// host may want different drift sensitivity at each tier. + /// + /// Must be in `(0.0, 1.0)` and strictly greater than + /// `forgetting_factor`. + /// + /// Default: `0.999` + pub cusum_meta_slow_decay: f64, + + /// CUSUM noise allowance in slow-baseline σ units. + /// + /// Each CUSUM step subtracts `κ_σ · √(slow_variance)` before + /// accumulating. This absorbs normal noise fluctuations so the + /// accumulator only grows under sustained elevation. + /// + /// - `0.5` = tolerate up to half a slow-σ per step (default) + /// - `0.0` = no allowance — any positive gap accumulates + /// - `1.0` = generous allowance — only strong drift accumulates + /// + /// Must be ≥ 0. + /// + /// Default: `0.5` + pub cusum_allowance_sigmas: f64, + + /// Whether to include per-sample scores in reports. 
+ /// + /// When `true`, each [`PrefixReport`](crate::report::PrefixReport) + /// includes a `Vec` with individual scores for every + /// observation in the batch. Useful for forensics, expensive for + /// large batches. + /// + /// Default: `false` + pub per_sample_scores: bool, +} + +impl SentinelConfig { + /// Number of campus buckets: `2^campus_bits`. + #[must_use] + pub const fn num_campuses(&self) -> u32 { + 1 << self.campus_bits + } +} + +impl Default for SentinelConfig { + fn default() -> Self { + Self { + max_rank: 16, + forgetting_factor: 0.99, + rank_update_interval: 100, + campus_bits: 14, + prefix_depths: vec![8, 16, 24, 32, 48, 64, 96, 128], + energy_threshold: 0.90, + eps: 1e-6, + cusum_slow_decay: 0.999, + cusum_meta_slow_decay: 0.999, + cusum_allowance_sigmas: 0.5, + per_sample_scores: false, + } + } +} + +/// Validation errors for [`SentinelConfig`]. +#[derive(Debug, Clone, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum ConfigError { + /// `max_rank` must be at least 1. + MaxRankZero, + /// `forgetting_factor` must be in `(0.0, 1.0)`. + ForgettingFactorOutOfRange(f64), + /// `rank_update_interval` must be at least 1. + RankUpdateIntervalZero, + /// `campus_bits` must be in `[1, 32]`. + CampusBitsOutOfRange(u8), + /// `prefix_depths` must not be empty. + NoPrefixDepths, + /// Every prefix depth must be in `[1, 128]`. + PrefixDepthOutOfRange(u8), + /// `energy_threshold` must be in `(0.0, 1.0)`. + EnergyThresholdOutOfRange(f64), + /// `eps` must be positive. + EpsNotPositive(f64), + /// `cusum_slow_decay` must be in `(0.0, 1.0)`. + CusumSlowDecayOutOfRange(f64), + /// `cusum_slow_decay` must be strictly greater than `forgetting_factor`. + CusumSlowDecayTooLow { slow: f64, fast: f64 }, + /// `cusum_meta_slow_decay` must be in `(0.0, 1.0)`. + CusumMetaSlowDecayOutOfRange(f64), + /// `cusum_meta_slow_decay` must be strictly greater than `forgetting_factor`. 
+ CusumMetaSlowDecayTooLow { slow: f64, fast: f64 }, + /// `cusum_allowance_sigmas` must be non-negative. + CusumAllowanceNegative(f64), +} + +impl std::fmt::Display for ConfigError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::MaxRankZero => write!(f, "max_rank must be at least 1"), + Self::ForgettingFactorOutOfRange(v) => { + write!(f, "forgetting_factor must be in (0.0, 1.0), got {v}") + } + Self::RankUpdateIntervalZero => write!(f, "rank_update_interval must be at least 1"), + Self::CampusBitsOutOfRange(v) => write!(f, "campus_bits must be in [1, 32], got {v}"), + Self::NoPrefixDepths => write!(f, "prefix_depths must not be empty"), + Self::PrefixDepthOutOfRange(v) => { + write!(f, "every prefix depth must be in [1, 128], got {v}") + } + Self::EnergyThresholdOutOfRange(v) => { + write!(f, "energy_threshold must be in (0.0, 1.0), got {v}") + } + Self::EpsNotPositive(v) => write!(f, "eps must be positive, got {v}"), + Self::CusumSlowDecayOutOfRange(v) => { + write!(f, "cusum_slow_decay must be in (0.0, 1.0), got {v}") + } + Self::CusumSlowDecayTooLow { slow, fast } => { + write!(f, "cusum_slow_decay ({slow}) must be > forgetting_factor ({fast})") + } + Self::CusumMetaSlowDecayOutOfRange(v) => { + write!(f, "cusum_meta_slow_decay must be in (0.0, 1.0), got {v}") + } + Self::CusumMetaSlowDecayTooLow { slow, fast } => { + write!(f, "cusum_meta_slow_decay ({slow}) must be > forgetting_factor ({fast})") + } + Self::CusumAllowanceNegative(v) => { + write!(f, "cusum_allowance_sigmas must be >= 0, got {v}") + } + } + } +} + +impl std::error::Error for ConfigError {} + +/// One or more configuration validation errors. +/// +/// Returned by [`SentinelConfig::validate`] when at least one field +/// violates its constraints. Contains every violation found, not +/// just the first. 
+#[derive(Debug, Clone, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct ConfigErrors(pub Vec); + +impl std::fmt::Display for ConfigErrors { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let n = self.0.len(); + for (i, e) in self.0.iter().enumerate() { + write!(f, "{e}")?; + if i + 1 < n { + write!(f, "; ")?; + } + } + Ok(()) + } +} + +impl std::error::Error for ConfigErrors {} + +impl SentinelConfig { + /// Validate all invariants. + /// + /// Checks every field and collects all violations so the caller + /// can fix them in one pass rather than iterating one-at-a-time. + /// + /// # Errors + /// + /// Returns a [`ConfigErrors`] containing every [`ConfigError`] + /// found, if any. + pub fn validate(&self) -> Result<(), ConfigErrors> { + let mut errors = Vec::new(); + + if self.max_rank == 0 { + errors.push(ConfigError::MaxRankZero); + } + if self.forgetting_factor <= 0.0 || self.forgetting_factor >= 1.0 { + errors.push(ConfigError::ForgettingFactorOutOfRange(self.forgetting_factor)); + } + if self.rank_update_interval == 0 { + errors.push(ConfigError::RankUpdateIntervalZero); + } + if self.campus_bits == 0 || self.campus_bits > 32 { + errors.push(ConfigError::CampusBitsOutOfRange(self.campus_bits)); + } + if self.prefix_depths.is_empty() { + errors.push(ConfigError::NoPrefixDepths); + } + for &d in &self.prefix_depths { + if d == 0 || d > 128 { + errors.push(ConfigError::PrefixDepthOutOfRange(d)); + } + } + if self.energy_threshold <= 0.0 || self.energy_threshold >= 1.0 { + errors.push(ConfigError::EnergyThresholdOutOfRange(self.energy_threshold)); + } + if self.eps <= 0.0 { + errors.push(ConfigError::EpsNotPositive(self.eps)); + } + if self.cusum_slow_decay <= 0.0 || self.cusum_slow_decay >= 1.0 { + errors.push(ConfigError::CusumSlowDecayOutOfRange(self.cusum_slow_decay)); + } + if self.cusum_slow_decay <= self.forgetting_factor { + errors.push(ConfigError::CusumSlowDecayTooLow { + 
slow: self.cusum_slow_decay, + fast: self.forgetting_factor, + }); + } + if self.cusum_meta_slow_decay <= 0.0 || self.cusum_meta_slow_decay >= 1.0 { + errors.push(ConfigError::CusumMetaSlowDecayOutOfRange(self.cusum_meta_slow_decay)); + } + if self.cusum_meta_slow_decay <= self.forgetting_factor { + errors.push(ConfigError::CusumMetaSlowDecayTooLow { + slow: self.cusum_meta_slow_decay, + fast: self.forgetting_factor, + }); + } + if self.cusum_allowance_sigmas < 0.0 { + errors.push(ConfigError::CusumAllowanceNegative(self.cusum_allowance_sigmas)); + } + + if errors.is_empty() { + Ok(()) + } else { + Err(ConfigErrors(errors)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn default_config_is_valid() { + SentinelConfig::default().validate().unwrap(); + } + + #[test] + fn rejects_zero_max_rank() { + let cfg = SentinelConfig { + max_rank: 0, + ..SentinelConfig::default() + }; + assert_eq!(cfg.validate(), Err(ConfigErrors(vec![ConfigError::MaxRankZero]))); + } + + #[test] + fn rejects_forgetting_factor_at_boundary() { + let cfg = SentinelConfig { + forgetting_factor: 1.0, + ..SentinelConfig::default() + }; + assert!(cfg.validate().is_err()); + let cfg = SentinelConfig { + forgetting_factor: 0.0, + ..SentinelConfig::default() + }; + assert!(cfg.validate().is_err()); + } + + #[test] + fn rejects_empty_prefix_depths() { + let cfg = SentinelConfig { + prefix_depths: vec![], + ..SentinelConfig::default() + }; + assert_eq!(cfg.validate(), Err(ConfigErrors(vec![ConfigError::NoPrefixDepths]))); + } + + #[test] + fn rejects_prefix_depth_out_of_range() { + let cfg = SentinelConfig { + prefix_depths: vec![0], + ..SentinelConfig::default() + }; + assert_eq!(cfg.validate(), Err(ConfigErrors(vec![ConfigError::PrefixDepthOutOfRange(0)]))); + let cfg = SentinelConfig { + prefix_depths: vec![129], + ..SentinelConfig::default() + }; + assert_eq!( + cfg.validate(), + Err(ConfigErrors(vec![ConfigError::PrefixDepthOutOfRange(129)])) + ); + } + + #[test] + fn 
num_campuses_is_power_of_two() { + let cfg = SentinelConfig::default(); + assert_eq!(cfg.num_campuses(), 16_384); // 2^14 + } +} diff --git a/packages/sentinel/src/ewma.rs b/packages/sentinel/src/ewma.rs new file mode 100644 index 00000000..018c96da --- /dev/null +++ b/packages/sentinel/src/ewma.rs @@ -0,0 +1,228 @@ +//! Exponentially-weighted moving average (EWMA) statistics. +//! +//! Tracks a running mean and variance that exponentially decay old +//! observations. Used by the subspace tracker to maintain baselines +//! of "normal" anomaly scores. +//! +//! Outlier-resistant: observations beyond 2σ from the current mean +//! are rejected before updating, preventing an attacker from poisoning +//! the baseline with a single burst. +//! +//! No `faer` dependency — this is pure `f64` arithmetic. + +use crate::report::BaselineSnapshot; + +/// Exponentially-weighted running mean and variance. +/// +/// After `update()` with a batch of values, the baseline reflects +/// a smoothed estimate of the central tendency and spread, biased +/// toward recent observations by the decay factor `λ`. +#[derive(Debug, Clone)] +pub struct EwmaStats { + /// Decay factor (λ). Each old value's contribution shrinks by + /// this factor per update. Higher = longer memory. + decay: f64, + + /// Running weighted mean. + mean: f64, + + /// Running weighted variance. + variance: f64, + + /// Whether at least one real update has occurred. + warm: bool, +} + +impl EwmaStats { + /// Create a new EWMA tracker with the given decay factor. + /// + /// Starts "cold" with `mean = 1.0`, `variance = 1.0` — deliberately + /// wide to avoid extreme z-scores before the first real data arrives. + #[must_use] + pub const fn new(decay: f64) -> Self { + Self { + decay, + mean: 1.0, + variance: 1.0, + warm: false, + } + } + + /// Current mean. + #[must_use] + pub const fn mean(&self) -> f64 { + self.mean + } + + /// Current variance. 
+ #[must_use] + pub const fn variance(&self) -> f64 { + self.variance + } + + /// Whether the baseline has seen at least one update. + #[must_use] + pub const fn is_warm(&self) -> bool { + self.warm + } + + /// Return to the cold state — as if freshly constructed. + /// + /// Restores the placeholder mean and variance and marks the + /// tracker as cold. The next [`update`](Self::update) will + /// enter the cold→warm initialisation path. + pub fn reset_cold(&mut self) { + self.mean = 1.0; + self.variance = 1.0; + self.warm = false; + } + + /// Snapshot of the current baseline for inclusion in reports. + #[must_use] + pub const fn snapshot(&self) -> BaselineSnapshot { + BaselineSnapshot { + mean: self.mean, + variance: self.variance, + } + } + + /// Compute the z-score of a value against the current baseline. + /// + /// Returns `(value - mean) / sqrt(variance + eps)`. + /// + /// The caller supplies `eps` (typically + /// [`SentinelConfig::eps`](crate::config::SentinelConfig::eps)) + /// so that every component of the sentinel shares a single + /// stability constant. + #[must_use] + pub fn z_score(&self, value: f64, eps: f64) -> f64 { + (value - self.mean) / (self.variance.sqrt() + eps) + } + + /// Update the baseline with a batch of new values. + /// + /// Values beyond `mean + 2σ` are rejected (outlier resistance). + /// If all values are outliers, the baseline is unchanged. + /// + /// Only the **upper** tail is clipped. Anomaly scores are + /// non-negative and right-skewed — an attacker inflates them, + /// never deflates them. A lower-tail bound would wrongly reject + /// legitimate low scores during quiet periods. + /// + /// The filter is **skipped entirely on the first update** (while + /// the tracker is still cold). The initial `mean = 1.0` / + /// `variance = 1.0` are placeholders, not a real baseline — you + /// can't define "outlier" without one. 
+ pub fn update(&mut self, values: &[f64]) { + if values.is_empty() { + return; + } + + // When cold, accept everything — no real baseline to filter against. + // Upper-tail only — see docs/algorithm.md §3.5: anomaly scores are right-skewed. + let normals: Vec = if self.warm { + let ceiling = 2.0_f64.mul_add(self.variance.sqrt(), self.mean); + let filtered: Vec = values.iter().copied().filter(|&v| v < ceiling).collect(); + if filtered.is_empty() { + return; // all outliers — learn nothing + } + filtered + } else { + values.to_vec() + }; + + #[allow(clippy::cast_precision_loss)] // batch len ≪ 2^52 + let new_mean = normals.iter().sum::() / normals.len() as f64; + + if !self.warm { + self.mean = new_mean; + if normals.len() > 1 { + let var = mean_squared_deviation(&normals, new_mean); + self.variance = var.max(1e-4); + } + self.warm = true; + return; + } + + let alpha = 1.0 - self.decay; + self.mean = self.decay.mul_add(self.mean, alpha * new_mean); + + if normals.len() > 1 { + let var = mean_squared_deviation(&normals, new_mean).max(1e-4); + self.variance = self.decay.mul_add(self.variance, alpha * var); + } + } +} + +/// Mean squared deviation from the given mean (÷N, no Bessel's correction). 
+fn mean_squared_deviation(values: &[f64], mean: f64) -> f64 { + #[allow(clippy::cast_precision_loss)] // batch len ≪ 2^52 + let n = values.len() as f64; + values.iter().map(|v| (v - mean).powi(2)).sum::() / n +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn starts_cold() { + let stats = EwmaStats::new(0.99); + assert!(!stats.is_warm()); + assert!((stats.mean() - 1.0).abs() < f64::EPSILON); + assert!((stats.variance() - 1.0).abs() < f64::EPSILON); + } + + #[test] + fn first_update_warms() { + let mut stats = EwmaStats::new(0.99); + stats.update(&[2.0, 3.0, 4.0]); + assert!(stats.is_warm()); + assert!((stats.mean() - 3.0).abs() < f64::EPSILON); + } + + #[test] + fn z_score_is_zero_at_mean() { + let mut stats = EwmaStats::new(0.99); + stats.update(&[5.0, 5.0, 5.0, 5.0]); + let z = stats.z_score(5.0, 1e-6); + assert!(z.abs() < 0.01); + } + + #[test] + fn outliers_are_rejected() { + let mut stats = EwmaStats::new(0.99); + // Warm up with small values + stats.update(&[1.0, 1.0, 1.0]); + let mean_before = stats.mean(); + + // Feed extreme outlier — should be rejected + stats.update(&[1000.0]); + assert!( + (stats.mean() - mean_before).abs() < f64::EPSILON, + "mean should not change when all values are outliers" + ); + } + + #[test] + fn snapshot_matches_state() { + let mut stats = EwmaStats::new(0.99); + stats.update(&[2.0, 4.0, 6.0]); + let snap = stats.snapshot(); + assert!((snap.mean - stats.mean()).abs() < f64::EPSILON); + assert!((snap.variance - stats.variance()).abs() < f64::EPSILON); + } + + #[test] + fn decays_toward_new_data() { + let mut stats = EwmaStats::new(0.90); // fast decay + stats.update(&[10.0, 10.0, 10.0]); + assert!((stats.mean() - 10.0).abs() < f64::EPSILON); + + // Push toward 0.0 + for _ in 0..50 { + stats.update(&[0.0, 0.0, 0.0]); + } + assert!(stats.mean() < 1.0, "mean should have decayed toward 0.0"); + } +} diff --git a/packages/sentinel/src/lib.rs b/packages/sentinel/src/lib.rs new file mode 100644 index 00000000..6e40960b 
--- /dev/null +++ b/packages/sentinel/src/lib.rs @@ -0,0 +1,26 @@ +//! **Spectral Sentinel** — hierarchical online subspace anomaly detection +//! for positionally structured `u128` observation streams. +//! +//! The sentinel maintains a low-rank model of "normal" observations per +//! (campus, prefix-depth) pair and scores each new batch of `u128` values +//! against that model using streaming thin SVD with exponential forgetting. +//! +//! The input values must have **hierarchical positional structure** — +//! leading bits define coarse groupings and successive bits refine them +//! (e.g. IPv6 addresses). Pseudo-random values such as cryptographic +//! hashes or UUIDs defeat the prefix analysis and will not produce +//! meaningful results. +//! +//! # Design principle +//! +//! **The sentinel measures; the host decides.** +//! +//! All output types carry raw statistical measurements — never opinions, +//! threat levels, or recommended actions. The consuming application reads +//! the reports and applies its own policy. + +pub mod config; +pub mod ewma; +pub mod observation; +pub mod report; +pub mod sentinel; diff --git a/packages/sentinel/src/observation.rs b/packages/sentinel/src/observation.rs new file mode 100644 index 00000000..208013da --- /dev/null +++ b/packages/sentinel/src/observation.rs @@ -0,0 +1,198 @@ +//! Observation boundary: converting raw `u128` values into the +//! mathematical representation the subspace engine needs. +//! +//! This module is the anti-corruption layer between the host's domain +//! (positionally structured `u128` values) and the linear algebra +//! world (`Mat`, centred bit vectors, campus IDs). +//! +//! The input values must have hierarchical positional structure — +//! leading bits define coarse groupings and successive bits refine +//! them (e.g. IPv6 addresses). The host is responsible for ensuring +//! this property before handing values to the sentinel. +//! +//! Nothing in this module touches `faer`. 
It produces plain `Vec` +//! data that the subspace module consumes. + +use std::collections::BTreeMap; + +/// A campus bucket identifier derived from the leading bits of a value. +/// +/// Two values with the same `CampusId` share the same top `campus_bits` +/// bits and are monitored by the same set of subspace trackers. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct CampusId(pub u32); + +impl CampusId { + /// Extract the campus ID from a `u128` value. + /// + /// Takes the top `campus_bits` bits of the 128-bit value and + /// interprets them as an unsigned integer. + #[must_use] + #[allow(clippy::cast_possible_truncation)] // shift by (128 − campus_bits) guarantees ≤ 32 bits + pub const fn from_value(value: u128, campus_bits: u8) -> Self { + let id = (value >> (128 - campus_bits)) as u32; + Self(id) + } +} + +impl std::fmt::Display for CampusId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +/// A single `u128` value converted to a centred bit vector. +/// +/// Each of the 128 bits becomes: +/// - bit `1` → `+0.5` +/// - bit `0` → `−0.5` +/// +/// This centring is critical: it ensures the data has zero mean +/// per dimension, which the subspace tracker requires. +#[derive(Debug, Clone)] +pub struct CentredBits { + /// The 128 centred bit values, from MSB (index 0) to LSB (index 127). + pub bits: [f64; 128], +} + +impl CentredBits { + /// Convert a `u128` value to centred bits. + #[must_use] + pub fn from_u128(value: u128) -> Self { + let mut bits = [0.0_f64; 128]; + for (i, bit) in bits.iter_mut().enumerate() { + *bit = if (value >> (127 - i)) & 1 == 1 { 0.5 } else { -0.5 }; + } + Self { bits } + } + + /// Return a slice of the first `depth` bits (a prefix). + /// + /// # Panics + /// + /// Panics if `depth > 128`. 
+ #[must_use] + pub fn prefix(&self, depth: u8) -> &[f64] { + &self.bits[..usize::from(depth)] + } +} + +/// A batch of observations, grouped by campus, ready for the subspace engine. +/// +/// This is the processed form of a raw `&[u128]` batch. +/// Campus grouping is computed once during construction so the +/// orchestrator can iterate groups without repeated linear scans. +#[derive(Debug)] +pub struct ObservationBatch { + /// The centred bit representation of each value in the batch. + pub observations: Vec, + + /// Observation indices grouped by campus, in first-seen order. + /// + /// Iteration order preserves the order in which each campus first + /// appeared in the input slice — the same guarantee the old + /// `distinct_campuses()` provided. + groups: Vec<(CampusId, Vec)>, +} + +impl ObservationBatch { + /// Build from raw `u128` values. + /// + /// Campus grouping is computed eagerly so later iteration is O(1) + /// per group rather than O(n) per campus. + #[must_use] + pub fn from_values(values: &[u128], campus_bits: u8) -> Self { + let observations: Vec = values.iter().map(|&v| CentredBits::from_u128(v)).collect(); + + // Build groups in first-seen order. `index_of` maps each campus + // to its position in `groups` for O(log n) dedup lookups. + let mut index_of: BTreeMap = BTreeMap::new(); + let mut groups: Vec<(CampusId, Vec)> = Vec::new(); + + for (i, &v) in values.iter().enumerate() { + let cid = CampusId::from_value(v, campus_bits); + if let Some(&pos) = index_of.get(&cid) { + groups[pos].1.push(i); + } else { + index_of.insert(cid, groups.len()); + groups.push((cid, vec![i])); + } + } + + Self { observations, groups } + } + + /// Iterate over `(CampusId, &[usize])` groups in first-seen order. + pub fn groups(&self) -> impl Iterator { + self.groups.iter().map(|(cid, indices)| (*cid, indices.as_slice())) + } + + /// Number of distinct campuses in this batch. 
+ #[must_use] + pub fn num_campuses(&self) -> usize { + self.groups.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn centred_bits_are_plus_minus_half() { + // Value 1 — only the LSB is set + let cb = CentredBits::from_u128(1); + + for &b in &cb.bits[..127] { + assert!((b - (-0.5)).abs() < f64::EPSILON, "expected -0.5, got {b}"); + } + assert!((cb.bits[127] - 0.5).abs() < f64::EPSILON); + } + + #[test] + fn centred_bits_all_ones() { + let cb = CentredBits::from_u128(u128::MAX); + for &b in &cb.bits { + assert!((b - 0.5).abs() < f64::EPSILON, "expected +0.5, got {b}"); + } + } + + #[test] + fn campus_id_extracts_top_bits() { + // Top 8 bits are all 1s: 0xFF << 120 + let value: u128 = 0xFF00_0000_0000_0000_0000_0000_0000_0000; + let cid = CampusId::from_value(value, 8); + assert_eq!(cid.0, 0xFF); + + // Top 4 bits of the same value are 0xF + let cid4 = CampusId::from_value(value, 4); + assert_eq!(cid4.0, 0xF); + } + + #[test] + fn prefix_slicing() { + let cb = CentredBits::from_u128(1); + let p8 = cb.prefix(8); + assert_eq!(p8.len(), 8); + } + + #[test] + fn batch_groups_by_campus() { + // Two values sharing top 4 bits = 0xF, one with top 4 bits = 0x1 + let values: Vec = vec![ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0xF000_0000_0000_0000_0000_0000_0000_0002, + 0x1000_0000_0000_0000_0000_0000_0000_0003, + ]; + + let batch = ObservationBatch::from_values(&values, 4); + assert_eq!(batch.num_campuses(), 2); + + let groups: Vec<(CampusId, &[usize])> = batch.groups().collect(); + assert_eq!(groups[0].0, CampusId(0xF)); + assert_eq!(groups[0].1, &[0, 1]); + assert_eq!(groups[1].0, CampusId(0x1)); + assert_eq!(groups[1].1, &[2]); + } +} diff --git a/packages/sentinel/src/report.rs b/packages/sentinel/src/report.rs new file mode 100644 index 00000000..37d0fb3c --- /dev/null +++ b/packages/sentinel/src/report.rs @@ -0,0 +1,706 @@ +//! Report types emitted by the sentinel. +//! +//! 
These types carry the raw statistical measurements from each
+//! [`ingest`](crate::sentinel::SpectralSentinel::ingest) call.
+//! They contain numbers and facts — never opinions or recommended actions.
+//!
+//! The host reads these reports and applies its own policy to decide
+//! what (if anything) to do about them.
+
+// ─── Batch-level ────────────────────────────────────────────
+
+/// Complete statistical output from one [`ingest`](crate::sentinel::SpectralSentinel::ingest) call.
+///
+/// This is the top-level type returned to the host. It contains
+/// everything the sentinel measured about the batch, organised
+/// hierarchically by campus and prefix depth.
+#[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+pub struct BatchReport {
+    /// Total real (non-noise) observations the sentinel has seen
+    /// across its entire lifetime, including this batch.
+    pub lifetime_observations: u64,
+
+    /// Number of active `(campus, depth)` trackers.
+    pub active_trackers: usize,
+
+    /// Per-campus breakdown. Only campuses that appeared in this
+    /// batch are included.
+    pub campus_reports: Vec<CampusReport>,
+
+    /// Cross-campus coordination analysis, one entry per prefix depth.
+    ///
+    /// Each entry is produced by a meta-tracker that consumes the
+    /// per-campus score summaries for that depth. `None` entries
+    /// indicate that fewer than 2 campuses reported for that depth
+    /// (insufficient spatial diversity for coordination analysis).
+    pub coordination: Vec<Option<CoordinationReport>>,
+}
+
+// ─── Campus-level ───────────────────────────────────────────
+
+/// Statistics for a single campus bucket within one batch.
+#[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+pub struct CampusReport {
+    /// Campus bucket ID (derived from the leading bits of each value).
+    pub campus_id: u32,
+
+    /// How many observations in this batch mapped to this campus.
+    pub sample_count: usize,
+
+    /// Per-prefix-depth breakdown for this campus.
+    pub prefix_reports: Vec<PrefixReport>,
+}
+
+// ─── Coordination-level ─────────────────────────────────────
+
+/// Cross-campus coordination analysis for a single prefix depth.
+///
+/// Produced by a meta-tracker (a `SubspaceTracker` operating at
+/// d = 4) that consumes campus mean-score vectors as observations.
+/// The four anomaly axes have second-order meaning at this level:
+///
+/// | Meta-axis | Detects |
+/// |-----------|----------------------------------------------------------|
+/// | Novelty | A campus-score pattern the model has never seen |
+/// | Displacement | The overall score landscape has shifted |
+/// | Surprise | A specific scoring axis is system-wide anomalous |
+/// | Coherence | An unusual combination of axis elevations |
+///
+/// The same [`AnomalyScores`] type is reused — the host distinguishes
+/// tiers by the report hierarchy (coordination vs. campus).
+#[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+pub struct CoordinationReport {
+    /// The prefix depth this meta-tracker analyses.
+    pub depth: u8,
+
+    /// How many campuses contributed score vectors to this analysis.
+    pub campuses_reporting: usize,
+
+    /// Current rank of the meta-tracker's learned subspace.
+    pub rank: usize,
+
+    /// Fraction of total variance captured by the meta-tracker's rank.
+    pub energy_ratio: f64,
+
+    /// Largest singular value of the meta-tracker's subspace.
+    pub top_singular_value: f64,
+
+    /// Anomaly scores along all four measurement axes.
+    ///
+    /// These score the cross-campus *pattern* — how unusual the
+    /// distribution of campus scores is, not any individual campus.
+    pub scores: AnomalyScores,
+
+    /// How mature is this meta-tracker's learned model?
+    pub maturity: TrackerMaturity,
+
+    /// Geometric properties that determine which scoring axes are
+    /// structurally meaningful at this meta-tracker's current state.
+    ///
+    /// The [`dim`](ScoringGeometry::dim) field is the meta-tracker's
+    /// working dimensionality (always `4`), **not** the prefix depth.
+    pub geometry: ScoringGeometry,
+
+    /// Per-campus scores, if
+    /// [`SentinelConfig::per_sample_scores`](crate::config::SentinelConfig::per_sample_scores)
+    /// is enabled.
+    ///
+    /// Each "sample" in the meta-tracker corresponds to one campus.
+    /// When present, indices correspond to the campus order in this
+    /// batch's campus reports for this depth.
+    pub per_campus: Option<Vec<SampleScore>>,
+}
+
+// ─── Prefix-depth-level ─────────────────────────────────────
+
+/// Statistics for a single `(campus, prefix_depth)` tracker
+/// after processing one batch.
+///
+/// This is where the interesting numbers live.
+#[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
+pub struct PrefixReport {
+    /// The prefix depth (number of leading bits) this tracker analyses.
+    pub depth: u8,
+
+    /// Current rank of the learned subspace (number of active basis vectors).
+    pub rank: usize,
+
+    /// Fraction of total variance captured by the current rank.
+    ///
+    /// `1.0` = the rank explains everything. Lower values mean the
+    /// data has structure the current rank can't represent.
+    pub energy_ratio: f64,
+
+    /// Largest singular value of the learned subspace.
+    ///
+    /// A sudden spike here indicates a new dominant pattern has appeared.
+    pub top_singular_value: f64,
+
+    /// Anomaly scores along all four measurement axes.
+    pub scores: AnomalyScores,
+
+    /// How mature is this tracker's learned model?
+    pub maturity: TrackerMaturity,
+
+    /// Geometric properties that determine which scoring axes are
+    /// structurally meaningful at this tracker's current state.
+    pub geometry: ScoringGeometry,
+
+    /// Per-sample scores, if
+    /// [`SentinelConfig::per_sample_scores`](crate::config::SentinelConfig::per_sample_scores)
+    /// is enabled.
+    ///
+    /// `None` when disabled. When present, indices correspond to
+    /// the observations in this campus subset (not the original batch order).
+    pub per_sample: Option<Vec<SampleScore>>,
+}
+
+// ─── Anomaly scores ─────────────────────────────────────
+
+/// The four anomaly-score axes for a batch of observations.
+///
+/// The sentinel scores each observation along four independent axes
+/// organised into two conceptual groups:
+///
+/// **Subspace axis** — how well the learned model explains the observation:
+///
+/// | Score | Metric | Intuition |
+/// |-------|--------|-----------|
+/// | *Novelty* | Residual energy / DOF: `‖X − X̂‖² / (dim − k)` | "How much of this is foreign?" |
+///
+/// **Campus axis** — how typical the observation is for *this* campus:
+///
+/// | Score | Metric | Intuition |
+/// |-------|--------|-----------|
+/// | *Displacement* | `‖z‖² / (k + ‖z‖²)`, bounded in `[0, 1)` | "How far is this from the centroid?" |
+/// | *Surprise* | Mahalanobis / rank: `Σⱼ ((zⱼ − μⱼ)/σⱼ)² / k` | "The shape is familiar, but the magnitude is wild" |
+/// | *Coherence* | Cross-correlation deviation: `Σⱼ<ₗ (zⱼzₗ − Cⱼₗ)²` | "Normal individually, but this combination is new" |
+///
+/// Displacement, surprise, and coherence decompose the latent
+/// activation pattern along orthogonal statistical concerns:
+/// displacement measures total energy, surprise measures
+/// per-dimension magnitude (diagonal covariance), and coherence
+/// measures pairwise interaction (off-diagonal covariance).
+///
+/// All four axes share the same polarity: **higher values indicate
+/// greater anomalous departure**. This ensures uniform z-score
+/// interpretation and EWMA outlier-filter robustness (see
+/// `docs/algorithm.md`, Appendix A).
+///
+/// **Why not projection energy / "normality"?** Under the sentinel's
+/// centred binary encoding, every observation has the same L2 norm
+/// (`d / 4`). Projection energy is therefore a perfect affine function
+/// of residual energy — it carries zero independent information.
+/// See `docs/algorithm.md`, Appendix B for the full proof. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct AnomalyScores { + /// Residual energy per residual DOF: `‖X − X̂‖² / (dim − k)`. + /// + /// Measures how much of the observation the subspace cannot explain. + /// High values = novel / unrecognised structure. + /// Low values = the subspace captures the observation well. + pub novelty: ScoreDistribution, + + /// Campus displacement: `‖z‖² / (k + ‖z‖²)`, bounded in `[0, 1)`. + /// + /// Measures raw (not variance-normalised) distance from the campus + /// centroid in latent space. Values near 0 mean the observation + /// sits right at the centre; values near 1 mean it is far away. + /// + /// This is the complement of "conformity" (`k / (k + ‖z‖²)`). + /// The complement form is used so that all four axes share the + /// same polarity: higher values = greater anomalous departure. + pub displacement: ScoreDistribution, + + /// Latent surprise: Mahalanobis distance per rank, + /// `Σⱼ ((zⱼ − μⱼ) / σⱼ)² / k`. + /// + /// Measures how unusual the latent activation pattern is *relative + /// to the expected spread* in each direction. High values = + /// recognised shape, wild magnitudes. Low values = typical + /// activation pattern. + pub surprise: ScoreDistribution, + + /// Latent coherence: cross-correlation deviation, + /// `2 / (k(k−1)) · Σⱼ<ₗ (zⱼzₗ − Cⱼₗ)²`. + /// + /// Measures how unusual the *combination* of latent activations is, + /// even when individual dimensions have normal magnitudes. Detects + /// co-activation patterns that have not been seen historically. + /// + /// Requires `k ≥ 2` to be meaningful (zero pairs at `k = 1`). + /// + /// Produced by `SubspaceTracker` from the upper-triangle + /// cross-correlation matrix `C` maintained in latent space. 
+ pub coherence: ScoreDistribution, +} + +// ─── Score distribution ───────────────────────────────────── + +/// Summary statistics for a vector of anomaly scores. +/// +/// Contains both the raw score distribution and its relationship +/// to the learned baseline (via z-scores). +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct ScoreDistribution { + /// Minimum score in the batch. + pub min: f64, + + /// Maximum score in the batch. + pub max: f64, + + /// Arithmetic mean of scores in the batch. + pub mean: f64, + + /// Z-score of the *maximum* score against the EWMA baseline. + /// + /// `(max_score - baseline_mean) / sqrt(baseline_variance)` + /// + /// This is the "loudest alarm" in the batch. The host can + /// threshold on this to decide if the batch is noteworthy. + pub max_z_score: f64, + + /// Z-score of the *mean* score against the EWMA baseline. + /// + /// A sustained elevation here (even if `max_z_score` is modest) + /// indicates a broad shift in traffic character, not just one outlier. + pub mean_z_score: f64, + + /// Snapshot of the fast EWMA baseline this distribution was scored against. + pub baseline: BaselineSnapshot, + + /// CUSUM drift accumulator for this scoring axis. + /// + /// Tracks sustained upward departure of batch mean scores from + /// a slow EWMA reference. A high accumulator value indicates + /// gradual drift that the fast EWMA baseline has absorbed. + pub cusum: CusumSnapshot, +} + +// ─── Baseline snapshot ────────────────────────────────────── + +/// Frozen snapshot of an EWMA baseline at the time of scoring. +/// +/// Exposing this lets the host monitor baseline drift over time +/// and make informed decisions about when to trust the scores. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct BaselineSnapshot { + /// Current EWMA mean of "normal" scores. 
+ pub mean: f64, + + /// Current EWMA variance of "normal" scores. + pub variance: f64, +} + +// ─── CUSUM snapshot ───────────────────────────────────────── + +/// Frozen snapshot of a CUSUM accumulator at the time of scoring. +/// +/// The CUSUM detects gradual drift that EWMA baselines absorb. +/// It accumulates the gap between the batch mean score and a slow +/// EWMA reference, minus a noise allowance. The accumulator grows +/// monotonically under sustained elevation and resets to zero when +/// the deviation reverses. +/// +/// The sentinel reports the raw accumulator value without +/// interpretation. The host decides what level of accumulated +/// drift warrants action. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct CusumSnapshot { + /// Current CUSUM accumulator value. + /// + /// Zero means no accumulated evidence of drift. Higher values + /// indicate sustained upward departure from the slow baseline. + pub accumulator: f64, + + /// Snapshot of the slow EWMA baseline used as the CUSUM reference. + pub slow_baseline: BaselineSnapshot, + + /// Number of batches since the CUSUM was last reset (including + /// the post-noise-injection reset). + pub steps_since_reset: u64, +} + +// ─── Maturity ─────────────────────────────────────────────── + +/// How much experience a tracker has, and how much of that is noise. +/// +/// The host uses this to decide whether to trust the tracker's +/// measurements. The sentinel reports the facts; the host sets +/// the maturity threshold. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct TrackerMaturity { + /// Number of real (non-noise) observations this tracker has processed. + pub real_observations: u64, + + /// Number of noise observations injected into this tracker. 
+ pub noise_observations: u64, + + /// Estimated fraction of the baseline **not yet established by + /// real data**. + /// + /// Starts at `1.0` for every new tracker — regardless of whether + /// noise has been injected — because the initial EWMA baselines + /// are arbitrary placeholders, not a learned model. Each real + /// observation decays the value by `λ`; each noise observation + /// pushes it back toward `1.0`. + /// + /// - `1.0` = baselines are entirely placeholders or noise + /// (no real data has been seen) + /// - `0.0` = baselines are fully determined by real data + /// (noise / cold-start influence has decayed away) + /// + /// **Distinguishing cold from noise-warmed.** A tracker that has + /// never received noise injection *also* starts at `1.0`. To + /// tell the two apart, check `noise_observations`: + /// + /// | `noise_observations` | `noise_influence` | State | + /// |----------------------|-------------------|-------| + /// | `0` | `1.0` | Cold — baselines are placeholders | + /// | `0` | `< 1.0` | Maturing — only real data, no noise | + /// | `> 0` | near `1.0` | Noise-warmed, little real data yet | + /// | `> 0` | near `0.0` | Mature — noise has decayed away | + /// + /// A typical host policy: + /// + /// ```text + /// if noise_influence > 0.05 { + /// // Baselines still immature — scores may be unreliable. + /// // This fires for both cold and noise-warmed trackers + /// // that haven't seen enough real traffic yet. + /// } + /// ``` + pub noise_influence: f64, +} + +impl TrackerMaturity { + /// A tracker with no observations of any kind. + /// + /// `noise_influence` starts at `1.0` — the baselines are pure + /// placeholders, not a learned model. See the field-level docs + /// for how to distinguish this from a noise-warmed tracker. + #[must_use] + pub const fn cold() -> Self { + Self { + real_observations: 0, + noise_observations: 0, + noise_influence: 1.0, + } + } + + /// Total observations (real + noise). 
+ #[must_use] + pub const fn total_observations(&self) -> u64 { + self.real_observations + self.noise_observations + } +} + +// ─── Scoring geometry ─────────────────────────────────────── + +/// Geometric properties of the tracker's scoring state. +/// +/// Reports the structural facts that determine which scoring axes +/// produce meaningful measurements given the tracker's current rank +/// and dimensionality. The sentinel reports these facts; the host +/// decides what confidence threshold to apply. +/// +/// # Quick reference +/// +/// | Condition | Effect | +/// |-----------|--------| +/// | `residual_dof == 0` | Novelty axis degenerate — subspace spans the full input space | +/// | `cap >= dim` | Novelty *can* become degenerate as rank adapts | +/// | `rank < 2`* | Coherence axis does not exist (no pairs, baseline destroyed) | +/// +/// *`rank` is available from the parent report type +/// ([`PrefixReport::rank`], [`CoordinationReport::rank`], +/// [`TrackerInspection::rank`]).* +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct ScoringGeometry { + /// Working dimensionality of the tracker's input space. + /// + /// For per-campus trackers this equals the prefix depth. + /// For coordination meta-trackers this is `4` (one dimension + /// per scoring axis: novelty, displacement, surprise, coherence). + pub dim: usize, + + /// Maximum rank this tracker can reach: `min(dim, max_rank)`. + /// + /// When `cap >= dim`, the rank *can* reach the full + /// dimensionality, which would leave zero residual DOF for + /// novelty scoring. + pub cap: usize, + + /// Residual degrees of freedom: `dim - rank`. + /// + /// Novelty scoring divides residual energy by + /// `max(1, dim - rank)`. 
When `residual_dof == 0`, the + /// subspace spans the entire input space and the novelty axis + /// is structurally degenerate — the reported value is raw + /// residual energy (numerical noise), not a normalised per-DOF + /// anomaly measure. + pub residual_dof: usize, +} + +impl ScoringGeometry { + /// Whether the novelty axis is structurally degenerate + /// (`residual_dof == 0`). + #[must_use] + pub const fn is_novelty_saturated(&self) -> bool { + self.residual_dof == 0 + } + + /// Whether the novelty axis *can* become degenerate as rank + /// adapts (`cap >= dim`). + #[must_use] + pub const fn is_novelty_saturable(&self) -> bool { + self.cap >= self.dim + } +} + +// ─── Per-sample scores ────────────────────────────────────── + +/// Raw anomaly scores for a single observation. +/// +/// Only present when [`SentinelConfig::per_sample_scores`](crate::config::SentinelConfig::per_sample_scores) +/// is enabled. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct SampleScore { + /// Residual energy per residual DOF (novelty axis). + pub novelty: f64, + + /// Campus displacement score, bounded in `[0, 1)`. + pub displacement: f64, + + /// Mahalanobis distance per rank (surprise axis). + pub surprise: f64, + + /// Cross-correlation deviation (coherence axis). + /// + /// `2 / (k(k−1)) · Σⱼ<ₗ (zⱼzₗ − Cⱼₗ)²`, or `0.0` when `k = 1`. + pub coherence: f64, + + /// Z-score of `novelty` against its EWMA baseline. + pub novelty_z: f64, + + /// Z-score of `displacement` against its EWMA baseline. + pub displacement_z: f64, + + /// Z-score of `surprise` against its EWMA baseline. + pub surprise_z: f64, + + /// Z-score of `coherence` against its EWMA baseline. + pub coherence_z: f64, +} + +// ─── Health ───────────────────────────────────────────────── + +/// Operational health snapshot of the entire sentinel. +/// +/// Returned by [`SpectralSentinel::health`](crate::sentinel::SpectralSentinel::health). 
+/// Useful for dashboards and monitoring. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct HealthReport { + /// Number of active `(campus, depth)` trackers. + pub active_trackers: usize, + + /// Total real observations across the sentinel's lifetime. + pub lifetime_observations: u64, + + /// Number of distinct campus buckets that have received traffic. + pub campuses_seen: usize, + + /// Distribution of ranks across all active trackers. + pub rank_distribution: RankDistribution, + + /// Distribution of maturity across all active trackers. + pub maturity_distribution: MaturityDistribution, + + /// Distribution of geometric scoring reliability across all + /// active per-campus trackers. + pub geometry_distribution: GeometryDistribution, + + /// Health of the coordination (meta-tracker) tier. + pub meta_tracker_health: MetaTrackerHealth, +} + +/// Summary of rank values across all active trackers. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct RankDistribution { + /// Lowest rank among all trackers. + pub min: usize, + + /// Highest rank among all trackers. + pub max: usize, + + /// Mean rank across all trackers. + pub mean: f64, +} + +/// Summary of maturity across all active trackers. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct MaturityDistribution { + /// Highest noise influence among all trackers (least mature). + pub max_noise_influence: f64, + + /// Lowest noise influence among all trackers (most mature). + pub min_noise_influence: f64, + + /// Mean noise influence across all trackers. + pub mean_noise_influence: f64, + + /// Number of trackers with zero real observations. + pub cold_trackers: usize, +} + +// ─── Geometry distribution ────────────────────────────────── + +/// Summary of geometric scoring reliability across a set of trackers. 
+/// +/// Captures how many trackers are in states where specific scoring +/// axes cannot produce structurally meaningful measurements. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct GeometryDistribution { + /// Trackers where `rank == dim` (novelty axis degenerate). + /// + /// The subspace spans the entire input space, leaving zero + /// residual DOF. Novelty scores from these trackers reflect + /// numerical noise, not structural anomalies. + pub novelty_saturated: usize, + + /// Trackers where `cap >= dim` (novelty *can* become degenerate). + /// + /// The rank ceiling allows the tracker to eventually span the + /// full input space. Not necessarily degenerate now, but could + /// become so as the rank adapts upward. + pub novelty_saturable: usize, + + /// Trackers where `rank < 2` (coherence axis does not exist). + /// + /// Coherence requires at least two latent dimensions to form + /// pairwise products. At rank < 2 it is undefined: scores are + /// identically zero and baselines are in the cold (destroyed) + /// state. If rank previously was ≥ 2 and dropped, the baseline + /// was destroyed — no stale state persists. + pub coherence_inactive: usize, +} + +// ─── Meta-tracker health ──────────────────────────────────── + +/// Health snapshot of the coordination (meta-tracker) tier. +/// +/// One meta-tracker exists per configured prefix depth. They operate +/// at `d = 4` and consume campus mean-score vectors. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct MetaTrackerHealth { + /// Number of meta-trackers (one per configured prefix depth). + pub count: usize, + + /// Maximum rank each meta-tracker can reach. + /// + /// Because meta-trackers operate at `d = 4`, capacity is + /// `min(4, max_rank)`. 
When `max_rank ≥ 4` the subspace can + /// span the entire input space, leaving no residual for novelty + /// scoring. The host can compare this against `d` to decide + /// whether the configuration is meaningful. + pub capacity: usize, + + /// Distribution of ranks across meta-trackers. + pub rank_distribution: RankDistribution, + + /// Distribution of maturity across meta-trackers. + pub maturity_distribution: MaturityDistribution, + + /// Working dimensionality of each meta-tracker's input space. + /// + /// Currently always `4` (one dimension per scoring axis: + /// novelty, displacement, surprise, coherence). Exposed so + /// the host can compare it against [`capacity`](Self::capacity) + /// and individual meta-tracker ranks without hard-coding + /// implementation details. + pub dim: usize, + + /// Distribution of geometric scoring reliability across + /// meta-trackers. + pub geometry_distribution: GeometryDistribution, +} + +// ─── Campus inspection ────────────────────────────────────── + +/// Read-only snapshot of a single campus's tracker state. +/// +/// Returned by [`SpectralSentinel::inspect_campus`](crate::sentinel::SpectralSentinel::inspect_campus). +/// Lets the host examine a specific campus without waiting for the +/// next [`ingest`](crate::sentinel::SpectralSentinel::ingest) call. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct CampusInspection { + /// The campus bucket ID. + pub campus_id: u32, + + /// Per-prefix-depth tracker state, in the same order as + /// [`SentinelConfig::prefix_depths`](crate::config::SentinelConfig::prefix_depths). + pub trackers: Vec, + + /// The `ingest` batch counter value when this campus last received + /// traffic. + pub last_active_batch: u64, +} + +/// Read-only snapshot of a single `(campus, depth)` tracker's +/// learned state. 
+/// +/// This is a lighter summary than [`PrefixReport`] — it captures +/// the model's structural health without requiring a new batch of +/// observations to score against. +#[derive(Debug, Clone, Copy)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct TrackerInspection { + /// Prefix depth this tracker operates at. + pub depth: u8, + + /// Current rank (number of active basis vectors). + pub rank: usize, + + /// Fraction of total variance captured by the current rank. + pub energy_ratio: f64, + + /// Largest singular value of the learned subspace. + pub top_singular_value: f64, + + /// Maturity state. + pub maturity: TrackerMaturity, + + /// Geometric scoring properties. + pub geometry: ScoringGeometry, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cold_maturity_is_fully_noisy() { + let m = TrackerMaturity::cold(); + assert_eq!(m.real_observations, 0); + assert_eq!(m.noise_observations, 0); + assert!((m.noise_influence - 1.0).abs() < f64::EPSILON); + assert_eq!(m.total_observations(), 0); + } +} diff --git a/packages/sentinel/src/sentinel/cusum.rs b/packages/sentinel/src/sentinel/cusum.rs new file mode 100644 index 00000000..b1297738 --- /dev/null +++ b/packages/sentinel/src/sentinel/cusum.rs @@ -0,0 +1,191 @@ +//! CUSUM (Cumulative Sum) drift accumulator. +//! +//! Detects sustained upward drift of batch mean scores away from a +//! slow EWMA reference. This is a one-sided Page's test: the +//! accumulator grows when the fast signal consistently exceeds the +//! slow baseline by more than a noise allowance, and resets to zero +//! when the deviation reverses. +//! +//! See `docs/algorithm.md` §4.3 for the full specification. +//! +//! Pure `f64` arithmetic — no `faer` dependency. + +use crate::ewma::EwmaStats; +use crate::report::{BaselineSnapshot, CusumSnapshot}; + +/// One-sided CUSUM accumulator with a slow EWMA reference. +/// +/// Each scoring axis owns one of these. 
It pairs a slow EWMA +/// baseline (longer memory than the fast baseline in [`EwmaStats`]) +/// with a cumulative sum that builds evidence of sustained drift. +/// +/// The sentinel reports the raw accumulator value; the host decides +/// what level of accumulated drift warrants action. +#[derive(Debug, Clone)] +pub struct CusumAccumulator { + /// Slow EWMA baseline — the reference the CUSUM measures drift from. + slow: EwmaStats, + + /// The cumulative sum. Non-negative (clamped at zero). + accumulator: f64, + + /// Batches since the last reset (including post-noise reset). + steps_since_reset: u64, +} + +impl CusumAccumulator { + /// Create a new CUSUM accumulator with the given slow decay factor. + #[must_use] + pub const fn new(slow_decay: f64) -> Self { + Self { + slow: EwmaStats::new(slow_decay), + accumulator: 0.0, + steps_since_reset: 0, + } + } + + /// Update the accumulator with a batch of per-sample scores. + /// + /// 1. Computes the gap: `batch_mean − slow_mean − κ·√slow_var`. + /// 2. Accumulates: `S = max(0, S + gap)`. + /// 3. Feeds the scores to the slow EWMA baseline. + /// + /// The gap is computed *before* updating the slow baseline so the + /// reference reflects the prior state — matching the principle + /// that scoring precedes evolution. + /// + /// `allowance_sigmas` is `κ_σ` from config — the noise tolerance + /// in units of slow-baseline standard deviation. + pub fn update(&mut self, scores: &[f64], batch_mean: f64, allowance_sigmas: f64, eps: f64) { + let slow_mean = self.slow.mean(); + let slow_std = (self.slow.variance() + eps).sqrt(); + let allowance = allowance_sigmas * slow_std; + + let gap = batch_mean - slow_mean - allowance; + self.accumulator = (self.accumulator + gap).max(0.0); + + // Now update the slow baseline with this batch. + self.slow.update(scores); + + self.steps_since_reset += 1; + } + + /// Reset the accumulator to zero. 
+ /// + /// Called after noise injection (§4.4) and optionally by the host + /// after acknowledging a regime change. + pub fn reset(&mut self) { + self.accumulator = 0.0; + self.steps_since_reset = 0; + } + + /// Destroy all state — return to the freshly-constructed state. + /// + /// Resets the accumulator *and* the slow EWMA baseline to cold. + /// Used when the scoring axis this accumulator tracks ceases to + /// exist (e.g. coherence when rank drops below 2). + pub fn reset_cold(&mut self) { + self.slow.reset_cold(); + self.accumulator = 0.0; + self.steps_since_reset = 0; + } + + /// Snapshot for inclusion in reports. + #[must_use] + pub const fn snapshot(&self) -> CusumSnapshot { + CusumSnapshot { + accumulator: self.accumulator, + slow_baseline: BaselineSnapshot { + mean: self.slow.mean(), + variance: self.slow.variance(), + }, + steps_since_reset: self.steps_since_reset, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn starts_at_zero() { + let c = CusumAccumulator::new(0.999); + let snap = c.snapshot(); + assert!((snap.accumulator).abs() < f64::EPSILON); + assert_eq!(snap.steps_since_reset, 0); + } + + #[test] + fn accumulates_under_sustained_elevation() { + let mut c = CusumAccumulator::new(0.999); + // Warm the slow baseline with normal-ish scores. + for _ in 0..20 { + c.update(&[1.0, 1.0, 1.0], 1.0, 0.5, 1e-6); + } + let before = c.snapshot().accumulator; + + // Now feed consistently elevated scores. 
+ for _ in 0..10 { + c.update(&[5.0, 5.0, 5.0], 5.0, 0.5, 1e-6); + } + assert!( + c.snapshot().accumulator > before, + "accumulator should grow under sustained elevation" + ); + } + + #[test] + fn resets_to_zero() { + let mut c = CusumAccumulator::new(0.999); + c.update(&[5.0, 5.0], 5.0, 0.0, 1e-6); + c.update(&[5.0, 5.0], 5.0, 0.0, 1e-6); + assert!(c.snapshot().accumulator > 0.0); + + c.reset(); + assert!((c.snapshot().accumulator).abs() < f64::EPSILON); + assert_eq!(c.snapshot().steps_since_reset, 0); + } + + #[test] + fn clamps_at_zero_when_below_baseline() { + let mut c = CusumAccumulator::new(0.999); + // Warm with high values. + for _ in 0..20 { + c.update(&[10.0, 10.0], 10.0, 0.5, 1e-6); + } + c.reset(); + + // Feed low values — gap is negative, accumulator stays at zero. + for _ in 0..10 { + c.update(&[0.1, 0.1], 0.1, 0.5, 1e-6); + } + assert!( + (c.snapshot().accumulator).abs() < f64::EPSILON, + "accumulator should not go below zero" + ); + } + + #[test] + fn allowance_absorbs_noise() { + // With a large allowance, small deviations should not accumulate + // when the slow baseline has meaningful variance. + let mut c = CusumAccumulator::new(0.999); + + // Warm with varied data so the slow baseline has real variance. + for _ in 0..20 { + c.update(&[0.5, 1.0, 1.5], 1.0, 2.0, 1e-6); + } + c.reset(); + + // Feed slightly elevated scores — allowance should absorb them. + for _ in 0..10 { + c.update(&[1.1, 1.2, 1.3], 1.2, 2.0, 1e-6); + } + assert!( + c.snapshot().accumulator < 0.1, + "generous allowance should absorb small deviations, got {}", + c.snapshot().accumulator, + ); + } +} diff --git a/packages/sentinel/src/sentinel/mod.rs b/packages/sentinel/src/sentinel/mod.rs new file mode 100644 index 00000000..e98827cc --- /dev/null +++ b/packages/sentinel/src/sentinel/mod.rs @@ -0,0 +1,826 @@ +//! The sentinel engine — orchestration, tracking, and drift detection. +//! +//! This module contains the core [`SpectralSentinel`] orchestrator and +//! 
its internal machinery. Only the orchestrator is part of the public +//! API; the subspace tracker and CUSUM accumulator are implementation +//! details. +//! +//! # Quick start +//! +//! ``` +//! use torrust_sentinel::config::SentinelConfig; +//! use torrust_sentinel::sentinel::SpectralSentinel; +//! +//! let cfg = SentinelConfig { +//! campus_bits: 4, +//! prefix_depths: vec![8, 16], +//! ..SentinelConfig::default() +//! }; +//! let mut sentinel = SpectralSentinel::new(cfg).unwrap(); +//! +//! let values: Vec = vec![ +//! 0xF000_0000_0000_0000_0000_0000_0000_0001, +//! 0xF000_0000_0000_0000_0000_0000_0000_0002, +//! 0x1000_0000_0000_0000_0000_0000_0000_0003, +//! ]; +//! let report = sentinel.ingest(&values); +//! +//! assert_eq!(report.campus_reports.len(), 2); +//! ``` + +pub(crate) mod cusum; +pub(crate) mod tracker; + +use std::collections::BTreeMap; + +use rand::rngs::SmallRng; +use rand::{RngExt, SeedableRng}; + +use self::tracker::SubspaceTracker; +use crate::config::{ConfigErrors, SentinelConfig}; +use crate::observation::{CampusId, ObservationBatch}; +use crate::report::{ + BatchReport, CampusInspection, CampusReport, CoordinationReport, GeometryDistribution, HealthReport, MaturityDistribution, + MetaTrackerHealth, PrefixReport, RankDistribution, TrackerInspection, +}; + +// ─── Noise injection parameters ───────────────────────────── + +/// Parameters for synthetic noise injection. +/// +/// Controls how [`SpectralSentinel::inject_noise`] generates and feeds +/// random observations into the trackers. This is a per-call argument, +/// not part of the persistent [`SentinelConfig`]. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct NoiseParams { + /// Number of synthetic batches to feed each tracker. + /// + /// More rounds = wider baseline variance = slower confidence ramp-up. + /// + /// Default: `50` + pub rounds: usize, + + /// Number of synthetic samples per batch. 
+ /// + /// Default: `16` + pub batch_size: usize, + + /// RNG seed for deterministic noise generation. + /// + /// `Some(seed)` = reproducible across restarts, which is useful for + /// testing and consistent startup behaviour. `None` = use system entropy. + /// + /// Default: `Some(42)` + pub seed: Option, +} + +impl Default for NoiseParams { + fn default() -> Self { + Self { + rounds: 50, + batch_size: 16, + seed: Some(42), + } + } +} + +// ─── Internal state types ─────────────────────────────────── + +/// Per-campus state: one subspace tracker per configured prefix depth. +struct CampusState { + /// One tracker per entry in `SentinelConfig::prefix_depths`, same order. + trackers: Vec, + + /// The `batch_counter` value when this campus last received observations. + last_active_batch: u64, +} + +/// Per-depth coordination state: a meta-tracker at d=4 plus a +/// running-mean centering reference. +struct MetaState { + /// Subspace tracker operating at d=4, using `cusum_meta_slow_decay`. + tracker: SubspaceTracker, + + /// Running EWMA mean of the 4D campus-score input vectors. + /// Used for centering before feeding the meta-tracker (§5.2). + running_mean: [f64; 4], + + /// Whether the running mean has been initialised with at least one batch. + warm: bool, +} + +// ─── SpectralSentinel ─────────────────────────────────────── + +/// Hierarchical online subspace anomaly detector. +/// +/// The sentinel maintains one `SubspaceTracker` per observed +/// (campus, prefix-depth) pair. Trackers are created lazily — only +/// campuses that actually receive traffic cost memory. +/// +/// A second tier of meta-trackers (one per prefix depth) analyses +/// cross-campus score patterns for coordinated anomalies. +/// +/// # Design principle +/// +/// **The sentinel measures; the host decides.** +/// +/// [`ingest`](Self::ingest) returns a [`BatchReport`] containing raw +/// statistical measurements. 
The host reads the report and applies +/// its own policy to decide what (if anything) to do. +pub struct SpectralSentinel { + config: SentinelConfig, + + /// Per-campus state, keyed by campus ID. + /// + /// `BTreeMap` (not `HashMap`) so that iteration order is + /// deterministic — critical for reproducible noise injection + /// when the caller supplies a fixed RNG seed. + campuses: BTreeMap, + + /// Per-depth coordination state. Indexed by position in + /// `config.prefix_depths`. + meta: Vec, + + /// Monotonically increasing counter, incremented once per `ingest` call. + batch_counter: u64, + + /// Total real (non-noise) observations across the sentinel's lifetime. + lifetime_observations: u64, +} + +impl SpectralSentinel { + /// Create a new sentinel with the given configuration. + /// + /// Validates the configuration and pre-creates meta-trackers for + /// each configured prefix depth. No per-campus trackers are + /// created until the first [`ingest`](Self::ingest) call. + /// + /// # Errors + /// + /// Returns [`ConfigErrors`](crate::config::ConfigErrors) if the + /// configuration violates any invariant (see + /// [`SentinelConfig::validate`]). + pub fn new(config: SentinelConfig) -> Result { + config.validate()?; + + let meta = config + .prefix_depths + .iter() + .map(|_| MetaState { + tracker: SubspaceTracker::new(4, &config, config.cusum_meta_slow_decay), + running_mean: [0.0; 4], + warm: false, + }) + .collect(); + + Ok(Self { + config, + campuses: BTreeMap::new(), + meta, + batch_counter: 0, + lifetime_observations: 0, + }) + } + + /// Process a batch of raw `u128` observations and return a full + /// statistical report. + /// + /// Each value is converted to a centred bit vector, assigned to a + /// campus bucket, and scored at every configured prefix depth. + /// Cross-campus coordination analysis runs after all per-campus + /// scoring is complete. 
+ /// + /// An empty input slice produces an empty report (no campus + /// reports, all coordination entries `None`). + pub fn ingest(&mut self, values: &[u128]) -> BatchReport { + // ── Early return on empty input ───────────────── + if values.is_empty() { + return self.empty_report(); + } + + // ── Encode ────────────────────────────────────── + let batch = ObservationBatch::from_values(values, self.config.campus_bits); + let num_depths = self.config.prefix_depths.len(); + + // ── Per-campus scoring ────────────────────────── + // For each campus × depth we collect a PrefixReport. + // We also need the per-campus mean scores per depth for + // the coordination tier. + // + // campus_mean_scores[depth_idx] = Vec of [nov, disp, surp, coh] + let mut campus_reports = Vec::with_capacity(batch.num_campuses()); + let mut campus_mean_scores: Vec> = vec![Vec::new(); num_depths]; + + for (campus_id, indices) in batch.groups() { + let sample_count = indices.len(); + + // Get or create the campus state. + let state = self + .campuses + .entry(campus_id) + .or_insert_with(|| Self::new_campus_state(&self.config, self.batch_counter)); + state.last_active_batch = self.batch_counter; + + let mut prefix_reports = Vec::with_capacity(num_depths); + + for (di, &depth) in self.config.prefix_depths.iter().enumerate() { + // Gather prefix slices for this campus × depth. + let slices: Vec<&[f64]> = indices.iter().map(|&i| batch.observations[i].prefix(depth)).collect(); + + let report = state.trackers[di].observe(&slices, depth, false); + + // Collect mean scores for the coordination tier. 
+ campus_mean_scores[di].push([ + report.scores.novelty.mean, + report.scores.displacement.mean, + report.scores.surprise.mean, + report.scores.coherence.mean, + ]); + + prefix_reports.push(report); + } + + campus_reports.push(CampusReport { + campus_id: campus_id.0, + sample_count, + prefix_reports, + }); + } + + // ── Coordination tier ─────────────────────────── + let coordination = self.run_coordination(&campus_mean_scores); + + // ── Bookkeeping ───────────────────────────────── + self.batch_counter += 1; + + #[allow(clippy::cast_possible_truncation)] // batch len ≪ 2^63 + let count = values.len() as u64; + self.lifetime_observations += count; + + // ── Assemble report ───────────────────────────── + BatchReport { + lifetime_observations: self.lifetime_observations, + active_trackers: self.active_tracker_count(), + campus_reports, + coordination, + } + } + + /// Inject synthetic noise to warm baselines before real traffic. + /// + /// Feeds `params.rounds` batches of `params.batch_size` random + /// centred vectors (±0.5) through every existing per-campus + /// tracker, then chains the resulting scores into the + /// meta-trackers (§5.5). After all rounds, every CUSUM + /// accumulator — per-tracker and meta-tracker — is reset to + /// zero (§4.4). + /// + /// This should be called **after** the campuses of interest have + /// been created (e.g. after a first `ingest`) and **before** real + /// traffic scoring begins. Calling on an empty sentinel (no + /// campuses) is a no-op. + pub fn inject_noise(&mut self, params: &NoiseParams) { + if self.campuses.is_empty() { + return; + } + + let mut rng: SmallRng = params + .seed + .map_or_else(|| SmallRng::from_rng(&mut rand::rng()), SmallRng::seed_from_u64); + + let lam = self.config.forgetting_factor; + let num_depths = self.config.prefix_depths.len(); + + // Collect campus IDs up-front so we can call `get_mut` inside the + // loop without holding an immutable borrow on the map. 
Iteration + // order is deterministic (BTreeMap) so the RNG sequence is + // reproducible across runs when a fixed seed is supplied. + let campus_ids: Vec = self.campuses.keys().copied().collect(); + + for _round in 0..params.rounds { + // Per-depth accumulator for meta-tracker input. + // campus_mean_scores[di] = Vec of [nov, disp, surp, coh] + let mut campus_mean_scores: Vec> = vec![Vec::with_capacity(campus_ids.len()); num_depths]; + + for &campus_id in &campus_ids { + let Some(state) = self.campuses.get_mut(&campus_id) else { + continue; + }; + + for (di, &depth) in self.config.prefix_depths.iter().enumerate() { + let dim = usize::from(depth); + let noise = Self::generate_noise_batch(dim, params.batch_size, &mut rng); + let slices: Vec<&[f64]> = noise.iter().map(Vec::as_slice).collect(); + + let report = state.trackers[di].observe(&slices, depth, true); + + campus_mean_scores[di].push([ + report.scores.novelty.mean, + report.scores.displacement.mean, + report.scores.surprise.mean, + report.scores.coherence.mean, + ]); + } + } + + // ── Chained meta-tracker warming (§5.5) ───── + for (di, scores_for_depth) in campus_mean_scores.iter().enumerate() { + if scores_for_depth.len() >= 2 { + let depth = self.config.prefix_depths[di]; + Self::feed_meta_for_depth(&mut self.meta[di], depth, scores_for_depth, lam, true); + } + } + } + + // ── Reset all CUSUMs (§4.4) ──────────────────── + for state in self.campuses.values_mut() { + for tracker in &mut state.trackers { + tracker.reset_cusum(); + } + } + for meta in &mut self.meta { + meta.tracker.reset_cusum(); + } + } + + /// Evict campus buckets that have not received traffic for more + /// than `max_idle_batches` consecutive [`ingest`](Self::ingest) + /// calls. + /// + /// Returns the number of campus buckets removed. Each evicted + /// campus drops all its per-depth trackers, freeing the + /// associated memory. 
+ /// + /// Meta-trackers are **not** affected — they represent + /// cross-campus structure and adapt naturally as evicted campuses + /// stop contributing score vectors. + /// + /// Passing `0` evicts every campus that did not appear in the + /// most recent batch. + pub fn evict_idle(&mut self, max_idle_batches: u64) -> usize { + // batch_counter is incremented at the end of ingest(), so the + // most recent completed batch set last_active_batch to + // (batch_counter - 1). We use saturating_sub to handle the + // edge case where no ingest has been called yet. + let latest = self.batch_counter.saturating_sub(1); + let before = self.campuses.len(); + self.campuses + .retain(|_, state| latest - state.last_active_batch <= max_idle_batches); + before - self.campuses.len() + } + + /// Remove a single campus by ID. + /// + /// Returns `true` if the campus existed (and was removed). + /// + /// This is the targeted complement to [`evict_idle`](Self::evict_idle). + /// The host might use it after inspecting a [`BatchReport`] and + /// deciding a specific campus is adversarial. + pub fn evict_campus(&mut self, campus: CampusId) -> bool { + self.campuses.remove(&campus).is_some() + } + + /// Produce an operational health snapshot of the sentinel. + /// + /// Summarises rank distribution and maturity across all active + /// trackers. Useful for dashboards, alerting on tracker + /// proliferation, and deciding when to call + /// [`evict_idle`](Self::evict_idle). 
+ #[must_use] + pub fn health(&self) -> HealthReport { + let active_trackers = self.active_tracker_count(); + let meta_health = self.meta_tracker_health(); + + if active_trackers == 0 { + return HealthReport { + active_trackers: 0, + lifetime_observations: self.lifetime_observations, + campuses_seen: 0, + rank_distribution: RankDistribution { + min: 0, + max: 0, + mean: 0.0, + }, + maturity_distribution: MaturityDistribution { + max_noise_influence: 0.0, + min_noise_influence: 0.0, + mean_noise_influence: 0.0, + cold_trackers: 0, + }, + geometry_distribution: GeometryDistribution { + novelty_saturated: 0, + novelty_saturable: 0, + coherence_inactive: 0, + }, + meta_tracker_health: meta_health, + }; + } + + let mut rank_min = usize::MAX; + let mut rank_max = 0_usize; + let mut rank_sum = 0_u64; + + let mut ni_min = f64::INFINITY; + let mut ni_max = f64::NEG_INFINITY; + let mut ni_sum = 0.0_f64; + let mut cold = 0_usize; + + let mut geo_saturated = 0_usize; + let mut geo_saturable = 0_usize; + let mut geo_coh_inactive = 0_usize; + + for state in self.campuses.values() { + for tracker in &state.trackers { + let r = tracker.rank(); + rank_min = rank_min.min(r); + rank_max = rank_max.max(r); + rank_sum += r as u64; + + let m = tracker.maturity(); + ni_min = ni_min.min(m.noise_influence); + ni_max = ni_max.max(m.noise_influence); + ni_sum += m.noise_influence; + if m.real_observations == 0 { + cold += 1; + } + + let g = tracker.scoring_geometry(); + if g.is_novelty_saturated() { + geo_saturated += 1; + } + if g.is_novelty_saturable() { + geo_saturable += 1; + } + if r < 2 { + geo_coh_inactive += 1; + } + } + } + + #[allow(clippy::cast_precision_loss)] + let n = active_trackers as f64; + + #[allow(clippy::cast_precision_loss)] // rank_sum ≪ 2^52 in practice + let rank_mean = rank_sum as f64 / n; + + HealthReport { + active_trackers, + lifetime_observations: self.lifetime_observations, + campuses_seen: self.campuses.len(), + rank_distribution: RankDistribution { + min: 
rank_min, + max: rank_max, + mean: rank_mean, + }, + maturity_distribution: MaturityDistribution { + max_noise_influence: ni_max, + min_noise_influence: ni_min, + mean_noise_influence: ni_sum / n, + cold_trackers: cold, + }, + geometry_distribution: GeometryDistribution { + novelty_saturated: geo_saturated, + novelty_saturable: geo_saturable, + coherence_inactive: geo_coh_inactive, + }, + meta_tracker_health: meta_health, + } + } + + /// Number of campus buckets that have received traffic. + #[must_use] + pub fn campuses_seen(&self) -> usize { + self.campuses.len() + } + + /// Total number of active `(campus, depth)` trackers. + #[must_use] + pub fn active_tracker_count(&self) -> usize { + self.campuses.len() * self.config.prefix_depths.len() + } + + /// Total real observations processed across the sentinel's lifetime. + #[must_use] + pub const fn lifetime_observations(&self) -> u64 { + self.lifetime_observations + } + + /// Read-only access to the configuration. + #[must_use] + pub const fn config(&self) -> &SentinelConfig { + &self.config + } + + /// Reset the sentinel to its freshly-constructed state. + /// + /// Drops all per-campus trackers and their learned subspaces, + /// re-initialises meta-trackers, and zeroes all counters. + /// The configuration is preserved. + /// + /// This is useful when the host determines that the learned model + /// is fundamentally wrong (e.g. after a major topology change) + /// and wants a clean slate without re-validating the config. + pub fn reset(&mut self) { + self.campuses.clear(); + self.batch_counter = 0; + self.lifetime_observations = 0; + self.meta = self + .config + .prefix_depths + .iter() + .map(|_| MetaState { + tracker: SubspaceTracker::new(4, &self.config, self.config.cusum_meta_slow_decay), + running_mean: [0.0; 4], + warm: false, + }) + .collect(); + } + + /// List all campus IDs that currently have trackers. + /// + /// Returns IDs in ascending order (`BTreeMap` iteration order). 
+ #[must_use] + pub fn campus_ids(&self) -> Vec { + self.campuses.keys().copied().collect() + } + + /// Inspect a specific campus's tracker state. + /// + /// Returns `None` if the campus has not been seen or was evicted. + /// The snapshot is cheap — it reads existing state without + /// triggering any model updates. + #[must_use] + pub fn inspect_campus(&self, campus: CampusId) -> Option { + let state = self.campuses.get(&campus)?; + + let trackers = state + .trackers + .iter() + .zip(&self.config.prefix_depths) + .map(|(tracker, &depth)| TrackerInspection { + depth, + rank: tracker.rank(), + energy_ratio: tracker.energy_ratio(), + top_singular_value: tracker.top_singular_value(), + maturity: tracker.maturity(), + geometry: tracker.scoring_geometry(), + }) + .collect(); + + Some(CampusInspection { + campus_id: campus.0, + trackers, + last_active_batch: state.last_active_batch, + }) + } + + // ════════════════════════════════════════════════════════ + // Private implementation + // ════════════════════════════════════════════════════════ + + /// Compute meta-tracker health summary. 
+ fn meta_tracker_health(&self) -> MetaTrackerHealth { + let count = self.meta.len(); + + if count == 0 { + return MetaTrackerHealth { + count: 0, + capacity: 0, + rank_distribution: RankDistribution { + min: 0, + max: 0, + mean: 0.0, + }, + maturity_distribution: MaturityDistribution { + max_noise_influence: 0.0, + min_noise_influence: 0.0, + mean_noise_influence: 0.0, + cold_trackers: 0, + }, + dim: 4, + geometry_distribution: GeometryDistribution { + novelty_saturated: 0, + novelty_saturable: 0, + coherence_inactive: 0, + }, + }; + } + + let mut rank_min = usize::MAX; + let mut rank_max = 0_usize; + let mut rank_sum = 0_u64; + + let mut ni_min = f64::INFINITY; + let mut ni_max = f64::NEG_INFINITY; + let mut ni_sum = 0.0_f64; + let mut cold = 0_usize; + + let mut geo_saturated = 0_usize; + let mut geo_saturable = 0_usize; + let mut geo_coh_inactive = 0_usize; + + for meta in &self.meta { + let r = meta.tracker.rank(); + rank_min = rank_min.min(r); + rank_max = rank_max.max(r); + rank_sum += r as u64; + + let m = meta.tracker.maturity(); + ni_min = ni_min.min(m.noise_influence); + ni_max = ni_max.max(m.noise_influence); + ni_sum += m.noise_influence; + if m.real_observations == 0 { + cold += 1; + } + + let g = meta.tracker.scoring_geometry(); + if g.is_novelty_saturated() { + geo_saturated += 1; + } + if g.is_novelty_saturable() { + geo_saturable += 1; + } + if r < 2 { + geo_coh_inactive += 1; + } + } + + #[allow(clippy::cast_precision_loss)] + let n = count as f64; + + #[allow(clippy::cast_precision_loss)] + let rank_mean = rank_sum as f64 / n; + + // All meta-trackers share the same dim and config, so cap is + // identical — read it from the first one. 
+ let capacity = self.meta[0].tracker.cap(); + let dim = self.meta[0].tracker.dim(); + + MetaTrackerHealth { + count, + capacity, + rank_distribution: RankDistribution { + min: rank_min, + max: rank_max, + mean: rank_mean, + }, + maturity_distribution: MaturityDistribution { + max_noise_influence: ni_max, + min_noise_influence: ni_min, + mean_noise_influence: ni_sum / n, + cold_trackers: cold, + }, + dim, + geometry_distribution: GeometryDistribution { + novelty_saturated: geo_saturated, + novelty_saturable: geo_saturable, + coherence_inactive: geo_coh_inactive, + }, + } + } + + /// Create a fresh `CampusState` with one tracker per prefix depth. + fn new_campus_state(config: &SentinelConfig, batch_counter: u64) -> CampusState { + let trackers = config + .prefix_depths + .iter() + .map(|&depth| SubspaceTracker::new(usize::from(depth), config, config.cusum_slow_decay)) + .collect(); + + CampusState { + trackers, + last_active_batch: batch_counter, + } + } + + /// Run the coordination tier: for each prefix depth, assemble + /// campus mean scores into a 4D matrix, centre against the + /// running mean, feed the meta-tracker, and produce a + /// `CoordinationReport`. + fn run_coordination(&mut self, campus_mean_scores: &[Vec<[f64; 4]>]) -> Vec> { + let lam = self.config.forgetting_factor; + + campus_mean_scores + .iter() + .enumerate() + .map(|(di, scores_for_depth)| { + let depth = self.config.prefix_depths[di]; + let n_campuses = scores_for_depth.len(); + + if n_campuses < 2 { + return None; + } + + let report = Self::feed_meta_for_depth(&mut self.meta[di], depth, scores_for_depth, lam, false); + + Some(Self::coordination_from_prefix(depth, n_campuses, report)) + }) + .collect() + } + + /// Centre one depth's campus score vectors against the running mean, + /// feed the meta-tracker, update the running mean, and return the + /// raw `PrefixReport`. 
+ /// + /// This is an associated function (not `&mut self`) so it can be + /// called from both `run_coordination` and `inject_noise` without + /// borrow-checker conflicts. + fn feed_meta_for_depth( + meta: &mut MetaState, + depth: u8, + scores_for_depth: &[[f64; 4]], + lam: f64, + is_noise: bool, + ) -> PrefixReport { + let alpha = 1.0 - lam; + + // ── Running-mean centering (§5.2) ─────────────── + // On the very first batch `running_mean` is still [0; 4], so we + // centre against zero. This is intentional: the first scores + // pass through unshifted, and the `!meta.warm` branch below + // then seeds `running_mean` with the actual column means. + let centred: Vec> = scores_for_depth + .iter() + .map(|row| row.iter().enumerate().map(|(j, &v)| v - meta.running_mean[j]).collect()) + .collect(); + + let slices: Vec<&[f64]> = centred.iter().map(Vec::as_slice).collect(); + + // Feed the meta-tracker (d=4). + let prefix_report = meta.tracker.observe(&slices, depth, is_noise); + + // ── Update running mean ───────────────────────── + let n_campuses = scores_for_depth.len(); + + #[allow(clippy::cast_precision_loss)] + let n_f = n_campuses as f64; + + let mut col_means = [0.0_f64; 4]; + for row in scores_for_depth { + for (j, &v) in row.iter().enumerate() { + col_means[j] += v; + } + } + for m in &mut col_means { + *m /= n_f; + } + + if meta.warm { + for (j, m) in meta.running_mean.iter_mut().enumerate() { + *m = lam.mul_add(*m, alpha * col_means[j]); + } + } else { + meta.running_mean = col_means; + meta.warm = true; + } + + prefix_report + } + + /// Generate a batch of random centred vectors (each entry ±0.5). + /// + /// Matches the `CentredBits` encoding: bit 1 → +0.5, bit 0 → −0.5. + fn generate_noise_batch(dim: usize, batch_size: usize, rng: &mut SmallRng) -> Vec> { + (0..batch_size) + .map(|_| (0..dim).map(|_| if rng.random_bool(0.5) { 0.5 } else { -0.5 }).collect()) + .collect() + } + + /// Map a `PrefixReport` (from the meta-tracker) to a `CoordinationReport`. 
+ fn coordination_from_prefix(depth: u8, campuses_reporting: usize, pr: PrefixReport) -> CoordinationReport { + CoordinationReport { + depth, + campuses_reporting, + rank: pr.rank, + energy_ratio: pr.energy_ratio, + top_singular_value: pr.top_singular_value, + scores: pr.scores, + maturity: pr.maturity, + geometry: pr.geometry, + per_campus: pr.per_sample, + } + } + + /// Produce an empty `BatchReport` (for empty input slices). + fn empty_report(&self) -> BatchReport { + BatchReport { + lifetime_observations: self.lifetime_observations, + active_trackers: self.active_tracker_count(), + campus_reports: Vec::new(), + coordination: self.config.prefix_depths.iter().map(|_| None).collect(), + } + } +} + +// ─── Compile-time safety ──────────────────────────────────── + +/// Static assertion that `SpectralSentinel` is `Send + Sync`. +/// +/// The host will almost certainly want to run the sentinel on a +/// background thread. If a future dependency introduces `!Send` +/// state, this will fail at compile time rather than at integration. +const _: () = { + const fn assert_send_sync() {} + assert_send_sync::(); +}; diff --git a/packages/sentinel/src/sentinel/tracker.rs b/packages/sentinel/src/sentinel/tracker.rs new file mode 100644 index 00000000..befbed7d --- /dev/null +++ b/packages/sentinel/src/sentinel/tracker.rs @@ -0,0 +1,758 @@ +//! Subspace tracker — the core online SVD engine. +//! +//! Maintains a low-rank model of "normal" via streaming thin SVD +//! with exponential forgetting. Scores each new batch along four +//! axes (novelty, displacement, surprise, coherence), then evolves +//! the model to incorporate the new data. +//! +//! See `docs/algorithm.md` §1.4 for the five-phase core loop and +//! §2 for the four scoring axes. +//! +//! This is the only module that depends on `faer`. 
+ +use faer::Mat; + +use crate::config::SentinelConfig; +use crate::ewma::EwmaStats; +use crate::report::{AnomalyScores, PrefixReport, SampleScore, ScoreDistribution, ScoringGeometry, TrackerMaturity}; +use crate::sentinel::cusum::CusumAccumulator; + +// ─── Per-axis baseline ────────────────────────────────────── + +/// Fast EWMA (z-scores) + CUSUM (drift detection) for one scoring axis. +#[derive(Debug, Clone)] +struct AxisBaseline { + fast: EwmaStats, + cusum: CusumAccumulator, +} + +impl AxisBaseline { + const fn new(fast_decay: f64, slow_decay: f64) -> Self { + Self { + fast: EwmaStats::new(fast_decay), + cusum: CusumAccumulator::new(slow_decay), + } + } + + /// Destroy all learned state — return to the freshly-constructed + /// state with cold EWMA and zeroed CUSUM. + fn reset_cold(&mut self) { + self.fast.reset_cold(); + self.cusum.reset_cold(); + } +} + +// ─── SubspaceTracker ──────────────────────────────────────── + +/// Low-rank subspace model with online learning and four-axis scoring. +/// +/// Each `(campus, prefix_depth)` pair gets one of these. The tracker +/// accepts centred observation slices via [`observe`](Self::observe) +/// and returns a [`PrefixReport`]. +/// +/// The tracker has zero knowledge of campus IDs, observation types, +/// or the host's domain. It operates on `&[&[f64]]` — a batch of +/// d-dimensional centred bit slices. +#[derive(Debug, Clone)] +pub struct SubspaceTracker { + /// Prefix depth (dimensionality of the working space). + dim: usize, + + /// Hard ceiling on rank: `min(dim, max_rank)`. + cap: usize, + + /// Current active rank (number of basis vectors in use). + rank: usize, + + /// Observation step counter (for rank adaptation timing). + step: u64, + + // ── Subspace state ────────────────────────────────── + /// Orthonormal basis, shape `(dim, cap)`. Only columns `[:rank]` are active. + basis: Mat, + + /// Singular values, length `cap`. Only `[:rank]` are meaningful. 
+ sigmas: Vec, + + // ── Latent distribution ───────────────────────────── + /// EWMA mean of latent coordinates, length `cap`. + lat_mean: Vec, + + /// EWMA variance of latent coordinates, length `cap`. + lat_var: Vec, + + /// Upper-triangle cross-correlation, flat-packed. + /// `cross_corr[tri_idx(j, l, cap)]` for `j < l` tracks EWMA of `zⱼ · zₗ`. + /// + /// Length: `cap * (cap - 1) / 2`. When rank increases, new entries + /// are already zero (the full triangle is pre-allocated at construction). + /// When rank decreases, outer entries are ignored but preserved (§1.3 Phase 3). + cross_corr: Vec, + + // ── Score baselines (one per axis) ────────────────── + novelty_bl: AxisBaseline, + displacement_bl: AxisBaseline, + surprise_bl: AxisBaseline, + coherence_bl: AxisBaseline, + + // ── Maturity tracking ─────────────────────────────── + real_observations: u64, + noise_observations: u64, + noise_influence: f64, + + // ── Config snapshots ──────────────────────────────── + forgetting_factor: f64, + energy_threshold: f64, + rank_update_interval: u64, + eps: f64, + per_sample_scores: bool, + cusum_allowance_sigmas: f64, +} + +impl SubspaceTracker { + /// Create a new tracker for the given prefix depth. + /// + /// The initial basis is identity-like columns (not random). + /// Noise injection will diversify it before real traffic arrives. + pub fn new(dim: usize, cfg: &SentinelConfig, slow_decay: f64) -> Self { + let cap = dim.min(cfg.max_rank); + + // Identity-like basis: column j has a 1.0 at row j. 
+ let mut basis = Mat::zeros(dim, cap); + for j in 0..cap.min(dim) { + basis[(j, j)] = 1.0; + } + + let fast_decay = cfg.forgetting_factor; + + Self { + dim, + cap, + rank: 1, + step: 0, + basis, + sigmas: vec![0.01; cap], + lat_mean: vec![0.0; cap], + lat_var: vec![1.0; cap], + cross_corr: vec![0.0; cap * (cap.saturating_sub(1)) / 2], + novelty_bl: AxisBaseline::new(fast_decay, slow_decay), + displacement_bl: AxisBaseline::new(fast_decay, slow_decay), + surprise_bl: AxisBaseline::new(fast_decay, slow_decay), + coherence_bl: AxisBaseline::new(fast_decay, slow_decay), + real_observations: 0, + noise_observations: 0, + noise_influence: 1.0, + forgetting_factor: cfg.forgetting_factor, + energy_threshold: cfg.energy_threshold, + rank_update_interval: cfg.rank_update_interval, + eps: cfg.eps, + per_sample_scores: cfg.per_sample_scores, + cusum_allowance_sigmas: cfg.cusum_allowance_sigmas, + } + } + + /// Process a batch of centred observation slices and return a report. + /// + /// Each inner slice in `rows` has length `self.dim`. + /// Scoring happens against the *prior* model, then the model evolves. + /// + /// `is_noise` controls maturity bookkeeping — noise observations + /// don't count as real. + #[allow(clippy::many_single_char_names)] // mathematical notation matching the spec + pub fn observe(&mut self, rows: &[&[f64]], depth: u8, is_noise: bool) -> PrefixReport { + let b = rows.len(); + let d = self.dim; + let k = self.rank; + let eps = self.eps; + + // Build X matrix (b × d). + let x = Self::build_matrix(rows, b, d); + + // ── Phase 1: Score against the prior model ────── + // Capture Z from this phase for Phase 3 (latent evolution uses + // the prior-basis projection, not the post-evolution basis). 
+ let u_k = self.basis.subcols(0, k); + let z = &x * u_k; // (b × k) + let x_hat = &z * u_k.transpose(); // (b × d) + let residual = &x - &x_hat; // (b × d) + + let (nov_scores, disp_scores, surp_scores, coh_scores) = self.compute_scores(&z, &residual, b, d, k, eps); + + // Build per-sample structs only when enabled (avoids 4 z-score + // calls per sample in the common disabled case). + let per_sample = if self.per_sample_scores { + Some(self.build_per_sample(&nov_scores, &disp_scores, &surp_scores, &coh_scores, eps)) + } else { + None + }; + + // ── Phase 2: Evolve subspace (streaming thin SVD) ─ + self.evolve_subspace(&x, k); + + // ── Phase 3: Evolve latent distribution ───────── + // Uses Z from Phase 1 (prior basis), not the updated basis. + self.evolve_latent(&z, k); + + // ── Phase 4: Update score baselines and CUSUM ─── + let allowance = self.cusum_allowance_sigmas; + let novelty_dist = Self::update_axis(&mut self.novelty_bl, &nov_scores, eps, allowance, true); + let displacement_dist = Self::update_axis(&mut self.displacement_bl, &disp_scores, eps, allowance, true); + let surprise_dist = Self::update_axis(&mut self.surprise_bl, &surp_scores, eps, allowance, true); + // Coherence does not exist at k < 2 (no pairs). Scores are + // identically zero, so the baseline must not evolve — it stays + // cold until k reaches 2, where the first real values enter + // through the cold→warm path. If rank later drops back below + // 2, adapt_rank() destroys the coherence baseline entirely. 
+ let coherence_dist = Self::update_axis(&mut self.coherence_bl, &coh_scores, eps, allowance, k >= 2); + + // ── Phase 5: Adapt rank ───────────────────────── + self.step += 1; + if self.step % self.rank_update_interval == 0 { + self.adapt_rank(); + } + + // ── Maturity bookkeeping ──────────────────────── + self.update_maturity(b, is_noise); + + PrefixReport { + depth, + rank: self.rank, + energy_ratio: self.energy_ratio(), + top_singular_value: self.sigmas.first().copied().unwrap_or(0.0), + scores: AnomalyScores { + novelty: novelty_dist, + displacement: displacement_dist, + surprise: surprise_dist, + coherence: coherence_dist, + }, + maturity: self.maturity(), + geometry: self.scoring_geometry(), + per_sample, + } + } + + /// Reset all CUSUM accumulators to zero (post-noise-injection). + pub fn reset_cusum(&mut self) { + self.novelty_bl.cusum.reset(); + self.displacement_bl.cusum.reset(); + self.surprise_bl.cusum.reset(); + self.coherence_bl.cusum.reset(); + } + + /// Current maturity snapshot. + pub const fn maturity(&self) -> TrackerMaturity { + TrackerMaturity { + real_observations: self.real_observations, + noise_observations: self.noise_observations, + noise_influence: self.noise_influence, + } + } + + /// Current rank. + pub const fn rank(&self) -> usize { + self.rank + } + + /// Maximum rank this tracker can reach (`min(dim, max_rank)`). + pub(crate) const fn cap(&self) -> usize { + self.cap + } + + /// Working dimensionality of the tracker's input space. + pub(crate) const fn dim(&self) -> usize { + self.dim + } + + /// Geometric properties of the current scoring state. + pub(crate) const fn scoring_geometry(&self) -> ScoringGeometry { + ScoringGeometry { + dim: self.dim, + cap: self.cap, + residual_dof: self.dim.saturating_sub(self.rank), + } + } + + // ════════════════════════════════════════════════════════ + // Private implementation + // ════════════════════════════════════════════════════════ + + /// Build a `faer::Mat` (b × d) from row slices. 
+ fn build_matrix(rows: &[&[f64]], b: usize, d: usize) -> Mat { + let mut x = Mat::zeros(b, d); + for (i, row) in rows.iter().enumerate() { + for (j, &val) in row.iter().enumerate() { + x[(i, j)] = val; + } + } + x + } + + /// Phase 1: Compute raw per-sample scores from the prior-model projection. + /// + /// Returns the four score vectors. Per-sample `SampleScore` structs + /// (which include z-scores) are only built when `per_sample_scores` + /// is enabled — avoiding 4 z-score calls per sample in the common case. + fn compute_scores( + &self, + z: &Mat, + residual: &Mat, + b: usize, + d: usize, + k: usize, + eps: f64, + ) -> (Vec, Vec, Vec, Vec) { + #[allow(clippy::cast_precision_loss)] // d − k ≤ 128, well within f64 mantissa + let dof = (d - k).max(1) as f64; + + #[allow(clippy::cast_precision_loss)] + let k_f = k as f64; + + let mut nov_scores = Vec::with_capacity(b); + let mut disp_scores = Vec::with_capacity(b); + let mut surp_scores = Vec::with_capacity(b); + let mut coh_scores = Vec::with_capacity(b); + + for i in 0..b { + // ── Novelty: ‖rᵢ‖² / (d − k) ── + let mut resid_sq = 0.0; + for j in 0..d { + resid_sq += residual[(i, j)] * residual[(i, j)]; + } + nov_scores.push(resid_sq / dof); + + // ── Displacement: ‖zᵢ‖² / (k + ‖zᵢ‖²) ── + let mut z_sq = 0.0; + for j in 0..k { + z_sq += z[(i, j)] * z[(i, j)]; + } + disp_scores.push(z_sq / (k_f + z_sq)); + + // ── Surprise: (1/k) Σⱼ (zᵢⱼ − μⱼ)² / (νⱼ + ε) ── + let mut surprise = 0.0; + for j in 0..k { + let dev = z[(i, j)] - self.lat_mean[j]; + surprise += (dev * dev) / (self.lat_var[j] + eps); + } + surp_scores.push(surprise / k_f.max(1.0)); + + // ── Coherence: (2/(k(k−1))) Σⱼ<ₗ (zᵢⱼ·zᵢₗ − Cⱼₗ)² ── + // + // Dividing by pairs = k(k−1)/2 is equivalent to multiplying + // by 2/(k(k−1)), matching the spec (§2.4). 
+ coh_scores.push(if k >= 2 { + let pairs = (k * (k - 1)) / 2; + let mut coh = 0.0; + for j in 0..k { + for l in (j + 1)..k { + let prod = z[(i, j)] * z[(i, l)]; + let dev = prod - self.cross_corr[tri_idx(j, l, self.cap)]; + coh += dev * dev; + } + } + + #[allow(clippy::cast_precision_loss)] + { + coh / pairs as f64 + } + } else { + 0.0 + }); + } + + (nov_scores, disp_scores, surp_scores, coh_scores) + } + + /// Build per-sample `SampleScore` structs (only when `per_sample_scores` is enabled). + /// + /// This is separated from `compute_scores` to avoid 4 z-score calls + /// per sample in the default (disabled) case. + fn build_per_sample(&self, nov: &[f64], disp: &[f64], surp: &[f64], coh: &[f64], eps: f64) -> Vec { + nov.iter() + .zip(disp) + .zip(surp) + .zip(coh) + .map(|(((&n, &d), &s), &c)| SampleScore { + novelty: n, + displacement: d, + surprise: s, + coherence: c, + novelty_z: self.novelty_bl.fast.z_score(n, eps), + displacement_z: self.displacement_bl.fast.z_score(d, eps), + surprise_z: self.surprise_bl.fast.z_score(s, eps), + coherence_z: self.coherence_bl.fast.z_score(c, eps), + }) + .collect() + } + + /// Phase 2: Evolve subspace via streaming thin SVD. + /// + /// `M = [√λ · U_k · diag(σ[:k]) | X^T] ∈ ℝ^(d × (k+b))` + /// + /// Then thin SVD of M, retaining top `min(min(d, k+b), cap)` components. + #[allow(clippy::many_single_char_names)] // mathematical notation matching the spec + fn evolve_subspace(&mut self, x: &Mat, k: usize) { + let d = self.dim; + let b = x.nrows(); + let sqrt_lam = self.forgetting_factor.sqrt(); + + // Build M: (d × (k + b)) + let cols = k + b; + let mut m = Mat::zeros(d, cols); + + // Left block: √λ · U_k · diag(σ[:k]) + for j in 0..k { + let s = sqrt_lam * self.sigmas[j]; + for i in 0..d { + m[(i, j)] = self.basis[(i, j)] * s; + } + } + + // Right block: X^T + for i in 0..b { + for j in 0..d { + m[(j, k + i)] = x[(i, j)]; + } + } + + // Thin SVD of M. If it fails (non-convergence), keep the old subspace. 
+ let Ok(svd) = m.thin_svd() else { + tracing::warn!("thin SVD did not converge — keeping prior subspace"); + return; + }; + + let n = cols.min(d).min(self.cap); + let u_new = svd.U(); + let s_new = svd.S().column_vector(); + + // Update basis and sigmas. + for j in 0..n { + for i in 0..d { + self.basis[(i, j)] = u_new[(i, j)]; + } + self.sigmas[j] = s_new[j]; + } + + // Zero out unused sigmas. + for s in &mut self.sigmas[n..] { + *s = 0.0; + } + } + + /// Phase 3: Evolve latent distribution (mean, variance, cross-correlation). + /// + /// Uses the `Z` matrix from Phase 1 (prior-basis projection). + fn evolve_latent(&mut self, z: &Mat, k: usize) { + let b = z.nrows(); + let lam = self.forgetting_factor; + let alpha = 1.0 - lam; + let eps = self.eps; + + #[allow(clippy::cast_precision_loss)] + let b_f = b as f64; + + // Per-dimension mean and variance. + for j in 0..k { + let mut col_sum = 0.0; + for i in 0..b { + col_sum += z[(i, j)]; + } + let col_mean = col_sum / b_f; + + let mut col_var = 0.0; + for i in 0..b { + let d = z[(i, j)] - col_mean; + col_var += d * d; + } + col_var /= b_f; + + self.lat_mean[j] = lam.mul_add(self.lat_mean[j], alpha * col_mean); + self.lat_var[j] = lam.mul_add(self.lat_var[j], alpha * col_var.max(eps)); + } + + // Pairwise cross-correlation: C[j][l] ← λ·C[j][l] + α·(1/b)·Σᵢ zᵢⱼ·zᵢₗ + // + // Invariant (§1.3 Phase 3): when rank increases, new entries are + // already zero from construction. When rank decreases, outer + // entries are ignored here but preserved in the vector. + for j in 0..k { + for l in (j + 1)..k { + let mut prod_sum = 0.0; + for i in 0..b { + prod_sum += z[(i, j)] * z[(i, l)]; + } + let batch_corr = prod_sum / b_f; + let idx = tri_idx(j, l, self.cap); + self.cross_corr[idx] = lam.mul_add(self.cross_corr[idx], alpha * batch_corr); + } + } + } + + /// Phase 4 helper: score against an axis's baseline and (optionally) + /// evolve it. + /// + /// When `evolve` is `false` the EWMA and CUSUM are left untouched. 
+ /// This is used for the coherence axis at rank < 2, where coherence + /// does not exist (no pairs) and every score is identically zero. + /// The baseline stays cold; when rank reaches 2 the first real + /// values enter through the cold→warm path naturally. If rank + /// later drops below 2, `adapt_rank()` destroys the baseline + /// entirely. + fn update_axis(bl: &mut AxisBaseline, scores: &[f64], eps: f64, cusum_allowance: f64, evolve: bool) -> ScoreDistribution { + let (min, max, sum) = scores + .iter() + .fold((f64::INFINITY, f64::NEG_INFINITY, 0.0_f64), |(mn, mx, s), &v| { + (mn.min(v), mx.max(v), s + v) + }); + + #[allow(clippy::cast_precision_loss)] + let mean = sum / scores.len() as f64; + + // Z-scores computed *before* updating the fast baseline. + let max_z = bl.fast.z_score(max, eps); + let mean_z = bl.fast.z_score(mean, eps); + let baseline = bl.fast.snapshot(); + + if evolve { + // Update fast EWMA. + bl.fast.update(scores); + + // Update CUSUM (slow EWMA + accumulator). + bl.cusum.update(scores, mean, cusum_allowance, eps); + } + let cusum = bl.cusum.snapshot(); + + ScoreDistribution { + min, + max, + mean, + max_z_score: max_z, + mean_z_score: mean_z, + baseline, + cusum, + } + } + + /// Phase 5: Adapt rank based on cumulative energy. + /// + /// Every `rank_update_interval` steps, find the smallest rank + /// capturing `energy_threshold` of total variance. Move by ±1. + fn adapt_rank(&mut self) { + let total_energy: f64 = self.sigmas.iter().map(|s| s * s).sum::() + self.eps; + + let mut cumulative = 0.0; + let mut target = self.cap; + + for (i, s) in self.sigmas.iter().enumerate() { + cumulative += s * s; + if cumulative / total_energy >= self.energy_threshold { + target = (i + 1).min(self.cap); + break; + } + } + target = target.max(1); + + let old_rank = self.rank; + + // Move by at most ±1 to avoid oscillation (§3.6). 
+ if target > self.rank { + self.rank = (self.rank + 1).min(self.cap); + } else if target < self.rank { + self.rank = self.rank.saturating_sub(1).max(1); + } + + // Coherence does not exist at rank < 2. If rank just + // dropped below 2, destroy the coherence baseline so + // stale state from a previous k ≥ 2 epoch cannot leak + // into a future one. When rank reaches 2 again the + // baseline is born fresh via the cold→warm path. + if old_rank >= 2 && self.rank < 2 { + self.coherence_bl.reset_cold(); + } + } + + /// Fraction of total variance captured by the current rank. + pub(crate) fn energy_ratio(&self) -> f64 { + let total: f64 = self.sigmas.iter().map(|s| s * s).sum::() + self.eps; + let active: f64 = self.sigmas[..self.rank].iter().map(|s| s * s).sum(); + active / total + } + + /// Largest singular value of the learned subspace. + pub(crate) fn top_singular_value(&self) -> f64 { + self.sigmas.first().copied().unwrap_or(0.0) + } + + /// Update maturity counters after processing a batch. + /// + /// Noise influence decays as λⁿ for `n` real observations, or + /// converges toward 1.0 under noise (§6.3). Computed via `powi` + /// instead of an `n`-iteration loop. + fn update_maturity(&mut self, batch_size: usize, is_noise: bool) { + #[allow(clippy::cast_possible_truncation)] + let count = batch_size as u64; + + #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] // batch_size ≪ 2^31 + let n = batch_size as i32; + let lam_n = self.forgetting_factor.powi(n); + + if is_noise { + self.noise_observations += count; + // η_{t+n} = λⁿ·η_t + (1 − λⁿ) (geometric series of n EWMA steps toward 1.0) + self.noise_influence = lam_n.mul_add(self.noise_influence, 1.0 - lam_n); + } else { + self.real_observations += count; + // η_{t+n} = λⁿ·η_t + self.noise_influence *= lam_n; + } + } +} + +// ─── Upper-triangle indexing ───────────────────────────────── + +/// Map a pair `(j, l)` with `j < l` to a flat upper-triangle index. 
+/// +/// The triangle for a `cap × cap` matrix stores `cap*(cap−1)/2` +/// elements in row-major order: (0,1), (0,2), …, (0,cap−1), +/// (1,2), …, (cap−2, cap−1). +#[inline] +const fn tri_idx(j: usize, l: usize, cap: usize) -> usize { + // Elements before row j: j*cap − j*(j+1)/2 + // Offset within row j: l − j − 1 + j * cap - (j * (j + 1)) / 2 + l - j - 1 +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_config() -> SentinelConfig { + SentinelConfig { + max_rank: 4, + forgetting_factor: 0.95, + rank_update_interval: 10, + energy_threshold: 0.90, + eps: 1e-6, + per_sample_scores: true, + cusum_allowance_sigmas: 0.5, + ..SentinelConfig::default() + } + } + + /// Generate a batch of centred bit vectors from `u128` values. + fn centred_rows(values: &[u128], depth: usize) -> Vec> { + values + .iter() + .map(|&v| { + (0..depth) + .map(|i| if (v >> (127 - i)) & 1 == 1 { 0.5 } else { -0.5 }) + .collect() + }) + .collect() + } + + fn as_slices(vecs: &[Vec]) -> Vec<&[f64]> { + vecs.iter().map(Vec::as_slice).collect() + } + + #[test] + fn creates_with_rank_one() { + let cfg = test_config(); + let t = SubspaceTracker::new(8, &cfg, 0.999); + assert_eq!(t.rank(), 1); + assert_eq!(t.maturity().real_observations, 0); + assert!((t.maturity().noise_influence - 1.0).abs() < f64::EPSILON); + } + + #[test] + fn observe_returns_report_with_correct_depth() { + let cfg = test_config(); + let mut t = SubspaceTracker::new(8, &cfg, 0.999); + + let rows = centred_rows(&[0x0123_4567_89AB_CDEF_0123_4567_89AB_CDEF], 8); + let report = t.observe(&as_slices(&rows), 8, false); + + assert_eq!(report.depth, 8); + assert_eq!(report.rank, 1); // hasn't adapted yet + assert!(report.per_sample.is_some()); + assert_eq!(report.per_sample.as_ref().unwrap().len(), 1); + } + + #[test] + fn maturity_tracks_real_vs_noise() { + let cfg = test_config(); + let mut t = SubspaceTracker::new(8, &cfg, 0.999); + + let rows = centred_rows(&[1, 2, 3], 8); + let slices = as_slices(&rows); + + 
t.observe(&slices, 8, true); // noise + assert_eq!(t.maturity().noise_observations, 3); + assert_eq!(t.maturity().real_observations, 0); + + t.observe(&slices, 8, false); // real + assert_eq!(t.maturity().real_observations, 3); + assert!(t.maturity().noise_influence < 1.0); + } + + #[test] + fn novelty_is_low_for_repeated_pattern() { + let cfg = test_config(); + let mut t = SubspaceTracker::new(16, &cfg, 0.999); + + // Feed the same pattern many times so the subspace learns it. + let pattern: u128 = 0xAAAA_0000_0000_0000_0000_0000_0000_0000; + let rows = centred_rows(&[pattern; 8], 16); + let slices = as_slices(&rows); + + for _ in 0..20 { + t.observe(&slices, 16, false); + } + + // Now score the same pattern — novelty should be low. + let report = t.observe(&slices, 16, false); + assert!( + report.scores.novelty.mean < 1.0, + "novelty should be low for a learned pattern, got {}", + report.scores.novelty.mean, + ); + } + + #[test] + fn cusum_resets_to_zero() { + let cfg = test_config(); + let mut t = SubspaceTracker::new(8, &cfg, 0.999); + + let rows = centred_rows(&[1, 2, 3, 4], 8); + let slices = as_slices(&rows); + + for _ in 0..5 { + t.observe(&slices, 8, false); + } + + t.reset_cusum(); + + let report = t.observe(&slices, 8, false); + // After reset + one step, steps_since_reset should be 1. 
+ assert_eq!(report.scores.novelty.cusum.steps_since_reset, 1); + } + + #[test] + fn rank_stays_bounded() { + let cfg = SentinelConfig { + max_rank: 3, + rank_update_interval: 1, // adapt every step + ..test_config() + }; + let mut t = SubspaceTracker::new(8, &cfg, 0.999); + + let rows = centred_rows(&[0xFF00_0000_0000_0000_0000_0000_0000_0000; 8], 8); + let slices = as_slices(&rows); + + for _ in 0..50 { + t.observe(&slices, 8, false); + } + + assert!(t.rank() >= 1); + assert!(t.rank() <= 3, "rank should not exceed max_rank, got {}", t.rank()); + } +} diff --git a/packages/sentinel/tests/api.rs b/packages/sentinel/tests/api.rs new file mode 100644 index 00000000..6bd57b76 --- /dev/null +++ b/packages/sentinel/tests/api.rs @@ -0,0 +1,176 @@ +mod common; + +use common::test_config; +use torrust_sentinel::config::SentinelConfig; +use torrust_sentinel::sentinel::SpectralSentinel; + +#[test] +fn new_validates_config() { + let bad = SentinelConfig { + max_rank: 0, + ..test_config() + }; + assert!(SpectralSentinel::new(bad).is_err()); +} + +#[test] +fn new_starts_empty() { + let s = SpectralSentinel::new(test_config()).unwrap(); + assert_eq!(s.campuses_seen(), 0); + assert_eq!(s.active_tracker_count(), 0); + assert_eq!(s.lifetime_observations(), 0); +} + +#[test] +fn empty_ingest_returns_empty_report() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + let report = s.ingest(&[]); + + assert!(report.campus_reports.is_empty()); + assert_eq!(report.coordination.len(), 2); // two depths + assert!(report.coordination.iter().all(Option::is_none)); + assert_eq!(report.lifetime_observations, 0); + assert_eq!(report.active_trackers, 0); +} + +#[test] +fn single_value_produces_one_campus() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + let report = s.ingest(&[0xABCD_0000_0000_0000_0000_0000_0000_0001]); + + assert_eq!(report.campus_reports.len(), 1); + assert_eq!(report.campus_reports[0].sample_count, 1); + 
assert_eq!(report.campus_reports[0].prefix_reports.len(), 2); // 2 depths + + // Only 1 campus — coordination is None for all depths. + assert!(report.coordination.iter().all(Option::is_none)); + + assert_eq!(report.lifetime_observations, 1); + assert_eq!(s.campuses_seen(), 1); +} + +#[test] +fn two_campuses_enable_coordination() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + + // Two values with different top-4 bits → two campuses. + let values = vec![ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + ]; + let report = s.ingest(&values); + + assert_eq!(report.campus_reports.len(), 2); + assert_eq!(s.campuses_seen(), 2); + + // 2 campuses → coordination should be Some for all depths. + assert!( + report.coordination.iter().all(Option::is_some), + "coordination should be Some when >= 2 campuses report" + ); + + for coord in report.coordination.iter().flatten() { + assert_eq!(coord.campuses_reporting, 2); + } +} + +#[test] +fn batch_counter_increments() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + + s.ingest(&[1]); + assert_eq!(s.lifetime_observations(), 1); + + s.ingest(&[2, 3, 4]); + assert_eq!(s.lifetime_observations(), 4); +} + +#[test] +fn active_trackers_matches_campuses_times_depths() { + let cfg = test_config(); // 2 depths + let mut s = SpectralSentinel::new(cfg).unwrap(); + + // 3 values across 2 campuses. + let values = vec![ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0xF000_0000_0000_0000_0000_0000_0000_0002, + 0x1000_0000_0000_0000_0000_0000_0000_0003, + ]; + let report = s.ingest(&values); + + // 2 campuses × 2 depths = 4 trackers. + assert_eq!(report.active_trackers, 4); + assert_eq!(s.active_tracker_count(), 4); +} + +#[test] +fn campus_reports_are_in_insertion_order() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + + // Campus 0xA first, then 0x3, then 0x7. 
+ let values = vec![ + 0xA000_0000_0000_0000_0000_0000_0000_0001, + 0x3000_0000_0000_0000_0000_0000_0000_0002, + 0x7000_0000_0000_0000_0000_0000_0000_0003, + ]; + let report = s.ingest(&values); + + assert_eq!(report.campus_reports.len(), 3); + assert_eq!(report.campus_reports[0].campus_id, 0xA); + assert_eq!(report.campus_reports[1].campus_id, 0x3); + assert_eq!(report.campus_reports[2].campus_id, 0x7); +} + +#[test] +fn coordination_depth_matches_config() { + let cfg = SentinelConfig { + prefix_depths: vec![8, 16, 32], + ..test_config() + }; + let mut s = SpectralSentinel::new(cfg).unwrap(); + + let values = vec![ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + ]; + let report = s.ingest(&values); + + assert_eq!(report.coordination.len(), 3); + let depths: Vec = report.coordination.iter().flatten().map(|c| c.depth).collect(); + assert_eq!(depths, vec![8, 16, 32]); +} + +#[test] +fn repeated_ingest_does_not_panic() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + let values: Vec = (0_u128..20).map(|i| i << 120 | i).collect(); + + for _ in 0..10 { + drop(s.ingest(&values)); + } +} + +#[test] +fn per_sample_scores_present_when_enabled() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); // per_sample_scores = true + + let report = s.ingest(&[0xF000_0000_0000_0000_0000_0000_0000_0001]); + let pr = &report.campus_reports[0].prefix_reports[0]; + + assert!(pr.per_sample.is_some()); + assert_eq!(pr.per_sample.as_ref().unwrap().len(), 1); +} + +#[test] +fn per_sample_scores_absent_when_disabled() { + let cfg = SentinelConfig { + per_sample_scores: false, + ..test_config() + }; + let mut s = SpectralSentinel::new(cfg).unwrap(); + + let report = s.ingest(&[0xF000_0000_0000_0000_0000_0000_0000_0001]); + let pr = &report.campus_reports[0].prefix_reports[0]; + + assert!(pr.per_sample.is_none()); +} diff --git a/packages/sentinel/tests/common/mod.rs b/packages/sentinel/tests/common/mod.rs new file mode 
100644 index 00000000..cb12c4f1 --- /dev/null +++ b/packages/sentinel/tests/common/mod.rs @@ -0,0 +1,76 @@ +// Each integration test file includes this module independently, so +// not every test file uses every helper. +#![allow(dead_code)] + +use torrust_sentinel::config::SentinelConfig; +use torrust_sentinel::sentinel::{NoiseParams, SpectralSentinel}; + +pub fn test_config() -> SentinelConfig { + SentinelConfig { + max_rank: 4, + forgetting_factor: 0.95, + rank_update_interval: 10, + campus_bits: 4, // 16 campus buckets + prefix_depths: vec![8, 16], + energy_threshold: 0.90, + eps: 1e-6, + per_sample_scores: true, + cusum_allowance_sigmas: 0.5, + cusum_slow_decay: 0.999, + cusum_meta_slow_decay: 0.999, + } +} + +pub const fn small_noise_params() -> NoiseParams { + NoiseParams { + rounds: 5, + batch_size: 4, + seed: Some(42), + } +} + +/// Pre-populate a sentinel with two campuses so noise injection +/// has trackers to warm and enough campuses for coordination. +pub fn seeded_sentinel() -> SpectralSentinel { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + // Two values with different top-4 bits → two campuses. + s.ingest(&[ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + ]); + s +} + +/// Config tuned for integration tests: tight rank so novelty +/// signals are clearer, deterministic noise, fast rank adaptation. +pub fn integration_config() -> SentinelConfig { + SentinelConfig { + max_rank: 2, + forgetting_factor: 0.95, + rank_update_interval: 5, + campus_bits: 4, + prefix_depths: vec![8, 16], + energy_threshold: 0.90, + eps: 1e-6, + per_sample_scores: false, + cusum_allowance_sigmas: 0.5, + cusum_slow_decay: 0.999, + cusum_meta_slow_decay: 0.999, + } +} + +/// Generate `count` values in the given campus (top nibble) +/// with sequential low bits. 
+pub fn campus_values(campus_nibble: u128, count: usize) -> Vec { + (0..count).map(|i| (campus_nibble << 124) | (i as u128 + 1)).collect() +} + +/// Generate values with all bits set in a specific range to +/// create structurally novel data. +pub fn anomalous_values(campus_nibble: u128, count: usize) -> Vec { + // Set a dense block of high bits in the middle — structurally + // very different from the sparse sequential values above. + (0..count) + .map(|i| (campus_nibble << 124) | 0x0FFF_FFFF_FFFF_FFFF_FFFF_FFFF_0000_0000 | (i as u128)) + .collect() +} diff --git a/packages/sentinel/tests/eviction.rs b/packages/sentinel/tests/eviction.rs new file mode 100644 index 00000000..95da2445 --- /dev/null +++ b/packages/sentinel/tests/eviction.rs @@ -0,0 +1,60 @@ +mod common; + +use common::{seeded_sentinel, test_config}; +use torrust_sentinel::observation::CampusId; +use torrust_sentinel::sentinel::SpectralSentinel; + +#[test] +fn evict_idle_removes_stale_campuses() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + + // Batch 0: three campuses appear. + s.ingest(&[ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + 0xA000_0000_0000_0000_0000_0000_0000_0003, + ]); + assert_eq!(s.campuses_seen(), 3); + + // Batch 1: only the first campus appears. + s.ingest(&[0xF000_0000_0000_0000_0000_0000_0000_0004]); + assert_eq!(s.campuses_seen(), 3); // still 3 — no auto-eviction + + // Evict campuses idle for more than 0 batches. 
+ let evicted = s.evict_idle(0); + assert_eq!(evicted, 2); + assert_eq!(s.campuses_seen(), 1); +} + +#[test] +fn evict_idle_with_large_window_removes_nothing() { + let mut s = seeded_sentinel(); + let evicted = s.evict_idle(u64::MAX); + assert_eq!(evicted, 0); + assert_eq!(s.campuses_seen(), 2); +} + +#[test] +fn evict_idle_on_empty_sentinel() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + let evicted = s.evict_idle(0); + assert_eq!(evicted, 0); +} + +#[test] +fn evict_campus_removes_known() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + s.ingest(&[0xF000_0000_0000_0000_0000_0000_0000_0001]); + assert_eq!(s.campuses_seen(), 1); + + let campus = CampusId::from_value(0xF000_0000_0000_0000_0000_0000_0000_0001, 4); + assert!(s.evict_campus(campus)); + assert_eq!(s.campuses_seen(), 0); +} + +#[test] +fn evict_campus_unknown_returns_false() { + let mut s = seeded_sentinel(); + assert!(!s.evict_campus(CampusId(0xFF))); + assert_eq!(s.campuses_seen(), 2); // unchanged +} diff --git a/packages/sentinel/tests/health.rs b/packages/sentinel/tests/health.rs new file mode 100644 index 00000000..96e6497e --- /dev/null +++ b/packages/sentinel/tests/health.rs @@ -0,0 +1,173 @@ +mod common; + +use common::{seeded_sentinel, small_noise_params, test_config}; +use torrust_sentinel::observation::CampusId; +use torrust_sentinel::sentinel::SpectralSentinel; + +// ── Health tests ──────────────────────────────────────────── + +#[test] +fn health_on_empty_sentinel() { + let s = SpectralSentinel::new(test_config()).unwrap(); + let h = s.health(); + + assert_eq!(h.active_trackers, 0); + assert_eq!(h.campuses_seen, 0); + assert_eq!(h.lifetime_observations, 0); + assert_eq!(h.rank_distribution.min, 0); + assert_eq!(h.rank_distribution.max, 0); + assert!((h.rank_distribution.mean).abs() < f64::EPSILON); + assert_eq!(h.maturity_distribution.cold_trackers, 0); +} + +#[test] +fn health_reflects_tracker_population() { + let mut s = 
SpectralSentinel::new(test_config()).unwrap(); + + // 2 campuses × 2 depths = 4 trackers. + s.ingest(&[ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + ]); + + let h = s.health(); + assert_eq!(h.active_trackers, 4); + assert_eq!(h.campuses_seen, 2); + assert_eq!(h.lifetime_observations, 2); + + // All trackers start at rank 1. + assert_eq!(h.rank_distribution.min, 1); + assert_eq!(h.rank_distribution.max, 1); + assert!((h.rank_distribution.mean - 1.0).abs() < f64::EPSILON); + + // All trackers have seen real observations, so none are cold. + assert_eq!(h.maturity_distribution.cold_trackers, 0); + + // noise_influence should be < 1.0 after real observations. + assert!(h.maturity_distribution.max_noise_influence < 1.0); +} + +#[test] +fn health_after_noise_shows_reduced_noise_influence() { + let mut s = seeded_sentinel(); + s.inject_noise(&small_noise_params()); + + let h = s.health(); + assert!( + h.maturity_distribution.mean_noise_influence < 1.0, + "noise injection should reduce noise_influence" + ); + // After noise but no further real data, trackers have zero + // real observations → still cold. + // + // Actually, seeded_sentinel() calls ingest() once first, so + // trackers have real_observations > 0. No cold trackers. + assert_eq!(h.maturity_distribution.cold_trackers, 0); +} + +#[test] +fn health_updates_after_eviction() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + + s.ingest(&[ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + ]); + assert_eq!(s.health().active_trackers, 4); + + // Evict one campus. 
+ let campus = CampusId::from_value(0xF000_0000_0000_0000_0000_0000_0000_0001, 4); + s.evict_campus(campus); + + let h = s.health(); + assert_eq!(h.active_trackers, 2); // 1 campus × 2 depths + assert_eq!(h.campuses_seen, 1); +} + +// ── Inspector API tests ───────────────────────────────────── + +#[test] +fn campus_ids_returns_active_campuses() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + assert!(s.campus_ids().is_empty()); + + s.ingest(&[ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + ]); + + let ids = s.campus_ids(); + assert_eq!(ids.len(), 2); + // BTreeMap order → sorted ascending. + assert_eq!(ids[0], CampusId(0x1)); + assert_eq!(ids[1], CampusId(0xF)); +} + +#[test] +fn inspect_campus_returns_none_for_unknown() { + let s = SpectralSentinel::new(test_config()).unwrap(); + assert!(s.inspect_campus(CampusId(0xFF)).is_none()); +} + +#[test] +fn inspect_campus_returns_tracker_state() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + s.ingest(&[0xF000_0000_0000_0000_0000_0000_0000_0001]); + + let campus = CampusId::from_value(0xF000_0000_0000_0000_0000_0000_0000_0001, 4); + let inspection = s.inspect_campus(campus).expect("campus should exist"); + + assert_eq!(inspection.campus_id, 0xF); + assert_eq!(inspection.trackers.len(), 2); // 2 depths in test_config + assert_eq!(inspection.trackers[0].depth, 8); + assert_eq!(inspection.trackers[1].depth, 16); + + for ti in &inspection.trackers { + assert!(ti.rank >= 1); + assert!(ti.energy_ratio > 0.0); + assert!(ti.top_singular_value > 0.0); + assert_eq!(ti.maturity.real_observations, 1); + } +} + +#[test] +fn inspect_campus_reflects_eviction() { + let mut s = seeded_sentinel(); + let campus = CampusId(0xF); + + assert!(s.inspect_campus(campus).is_some()); + s.evict_campus(campus); + assert!(s.inspect_campus(campus).is_none()); +} + +// ── Meta-tracker health tests ─────────────────────────────── + +#[test] +fn 
health_includes_meta_tracker_stats() { + let s = SpectralSentinel::new(test_config()).unwrap(); + let h = s.health(); + + // 2 depths → 2 meta-trackers. + assert_eq!(h.meta_tracker_health.count, 2); + // Meta-trackers operate at d=4 with test max_rank=4 → cap = 4. + assert_eq!(h.meta_tracker_health.capacity, 4); + // All start at rank 1. + assert_eq!(h.meta_tracker_health.rank_distribution.min, 1); + assert_eq!(h.meta_tracker_health.rank_distribution.max, 1); + // All cold (no observations yet). + assert_eq!(h.meta_tracker_health.maturity_distribution.cold_trackers, 2); +} + +#[test] +fn meta_tracker_health_after_noise() { + let mut s = seeded_sentinel(); + s.inject_noise(&small_noise_params()); + + let mh = s.health().meta_tracker_health; + assert_eq!(mh.count, 2); + // Meta-trackers should have received noise observations. + assert!( + mh.maturity_distribution.max_noise_influence < 1.0, + "meta-trackers should have warmed after noise" + ); +} diff --git a/packages/sentinel/tests/integration.rs b/packages/sentinel/tests/integration.rs new file mode 100644 index 00000000..3dc9ff74 --- /dev/null +++ b/packages/sentinel/tests/integration.rs @@ -0,0 +1,328 @@ +mod common; + +use common::{anomalous_values, campus_values, integration_config, seeded_sentinel, small_noise_params, test_config}; +use torrust_sentinel::config::SentinelConfig; +use torrust_sentinel::observation::CampusId; +use torrust_sentinel::sentinel::{NoiseParams, SpectralSentinel}; + +// ── Full lifecycle ────────────────────────────────────────── + +#[test] +fn full_lifecycle() { + // 1. Construct + let mut s = SpectralSentinel::new(integration_config()).unwrap(); + assert_eq!(s.campuses_seen(), 0); + + // 2. Seed campuses + let seed: Vec = [campus_values(0xF, 4), campus_values(0x1, 4)].concat(); + s.ingest(&seed); + assert_eq!(s.campuses_seen(), 2); + + // 3. Noise injection + s.inject_noise(&NoiseParams { + rounds: 20, + batch_size: 8, + seed: Some(123), + }); + + // 4. 
Steady-state ingestion + for _ in 0..15 { + let batch: Vec = [campus_values(0xF, 8), campus_values(0x1, 8)].concat(); + drop(s.ingest(&batch)); + } + + // 5. Anomalous batch + let anomaly: Vec = [anomalous_values(0xF, 8), campus_values(0x1, 8)].concat(); + let report = s.ingest(&anomaly); + + assert_eq!(report.campus_reports.len(), 2); + assert!(report.coordination.iter().all(Option::is_some)); + + // 6. Eviction — add a third campus, then evict idle + s.ingest(&campus_values(0xA, 4)); + assert_eq!(s.campuses_seen(), 3); + + // Only campus 0xA appeared in the last batch. + let evicted = s.evict_idle(0); + assert_eq!(evicted, 2); + assert_eq!(s.campuses_seen(), 1); + + // 7. Health check + let h = s.health(); + assert_eq!(h.active_trackers, 2); // 1 campus × 2 depths + assert_eq!(h.campuses_seen, 1); + assert!(h.rank_distribution.min >= 1); +} + +// ── Anomaly detection ─────────────────────────────────────── + +#[test] +fn novelty_responds_to_out_of_subspace_data() { + let mut s = SpectralSentinel::new(integration_config()).unwrap(); + + // Seed + warm + s.ingest(&campus_values(0xF, 8)); + s.inject_noise(&NoiseParams { + rounds: 30, + batch_size: 8, + seed: Some(42), + }); + + // Establish steady state with structurally consistent traffic. + let mut steady_novelties = Vec::new(); + for _ in 0..20 { + let report = s.ingest(&campus_values(0xF, 8)); + steady_novelties.push(report.campus_reports[0].prefix_reports[0].scores.novelty.mean); + } + + // Now inject structurally novel data. + let report = s.ingest(&anomalous_values(0xF, 8)); + let anomaly_novelty = report.campus_reports[0].prefix_reports[0].scores.novelty.mean; + + // The anomaly novelty should exceed the steady-state average. 
+ #[allow(clippy::cast_precision_loss)] // test vector length ≪ 2^52 + let steady_avg: f64 = steady_novelties.iter().sum::() / steady_novelties.len() as f64; + assert!( + anomaly_novelty > steady_avg, + "novelty for structurally novel data ({anomaly_novelty:.6}) should exceed \ + steady-state average ({steady_avg:.6})" + ); +} + +#[test] +fn cusum_accumulates_under_sustained_elevation() { + // Use a faster slow decay so the slow baseline can actually + // adapt within a reasonable number of test batches. + // At λ_s = 0.96 the half-life is ~17 steps. + let cfg = SentinelConfig { + cusum_slow_decay: 0.96, + cusum_meta_slow_decay: 0.96, + ..integration_config() + }; + let mut s = SpectralSentinel::new(cfg).unwrap(); + + // Establish baseline with normal traffic (no noise injection + // so the slow EWMA reflects only real scores). + for _ in 0..30 { + drop(s.ingest(&campus_values(0xF, 8))); + } + + let cusum_before = s.ingest(&campus_values(0xF, 8)).campus_reports[0].prefix_reports[0] + .scores + .novelty + .cusum + .accumulator; + + // Now feed sustained anomalous traffic. 
+ let mut cusum_values = Vec::new(); + for _ in 0..15 { + let report = s.ingest(&anomalous_values(0xF, 8)); + cusum_values.push(report.campus_reports[0].prefix_reports[0].scores.novelty.cusum.accumulator); + } + + let cusum_after = *cusum_values.last().unwrap(); + assert!( + cusum_after > cusum_before, + "CUSUM should grow under sustained anomalous traffic: before={cusum_before:.6}, after={cusum_after:.6}" + ); +} + +// ── Coordination ──────────────────────────────────────────── + +#[test] +fn coordination_detects_cross_campus_pattern() { + let cfg = SentinelConfig { + prefix_depths: vec![8], + ..integration_config() + }; + let mut s = SpectralSentinel::new(cfg).unwrap(); + + // Seed 3 campuses + let seed: Vec = [campus_values(0xF, 4), campus_values(0x1, 4), campus_values(0xA, 4)].concat(); + s.ingest(&seed); + s.inject_noise(&NoiseParams { + rounds: 30, + batch_size: 8, + seed: Some(42), + }); + + // Normal independent traffic across all 3 campuses. + let mut normal_coord_novelties = Vec::new(); + for _ in 0..20 { + let batch: Vec = [campus_values(0xF, 8), campus_values(0x1, 8), campus_values(0xA, 8)].concat(); + let report = s.ingest(&batch); + if let Some(coord) = &report.coordination[0] { + normal_coord_novelties.push(coord.scores.novelty.mean); + } + } + + // Now make 2 of 3 campuses simultaneously anomalous. + let anomaly_batch: Vec = [anomalous_values(0xF, 8), anomalous_values(0x1, 8), campus_values(0xA, 8)].concat(); + let anomaly_report = s.ingest(&anomaly_batch); + + let coord = anomaly_report.coordination[0] + .as_ref() + .expect("should have coordination with 3 campuses"); + assert_eq!(coord.campuses_reporting, 3); + + // The coordination novelty should be elevated relative to + // the normal period. We check against the max of the + // normal period to be conservative. 
+ if !normal_coord_novelties.is_empty() { + let normal_max = normal_coord_novelties.iter().copied().fold(f64::NEG_INFINITY, f64::max); + + // The meta-tracker may need more data to produce a clear + // signal, so we use a soft check: the anomaly should be + // at least in the upper range. + assert!( + coord.scores.novelty.mean >= normal_max * 0.5, + "coordination novelty ({:.6}) should be in the upper range \ + (normal max = {normal_max:.6})", + coord.scores.novelty.mean + ); + } +} + +// ── Warm vs. cold ─────────────────────────────────────────── + +#[test] +fn warm_vs_cold_scoring_differs() { + let cfg = integration_config(); + let noise = NoiseParams { + rounds: 20, + batch_size: 8, + seed: Some(42), + }; + + // Warm sentinel: seed → noise → ingest + let mut warm = SpectralSentinel::new(cfg.clone()).unwrap(); + warm.ingest(&campus_values(0xF, 4)); + warm.inject_noise(&noise); + let warm_report = warm.ingest(&campus_values(0xF, 8)); + + // Cold sentinel: seed → ingest (no noise) + let mut cold = SpectralSentinel::new(cfg).unwrap(); + cold.ingest(&campus_values(0xF, 4)); + let cold_report = cold.ingest(&campus_values(0xF, 8)); + + let warm_pr = &warm_report.campus_reports[0].prefix_reports[0]; + let cold_pr = &cold_report.campus_reports[0].prefix_reports[0]; + + // Warm sentinel had CUSUM reset after noise. + assert_eq!( + warm_pr.scores.novelty.cusum.steps_since_reset, 1, + "warm sentinel CUSUM should have 1 step since reset" + ); + + // Cold sentinel never had a reset — steps equals total batches. + assert_eq!( + cold_pr.scores.novelty.cusum.steps_since_reset, 2, + "cold sentinel CUSUM should count all batches (seed + this one)" + ); + + // Warm sentinel has higher noise influence — noise injection + // pushed η toward 1.0 and only one real batch has decayed it. 
+ assert!( + warm_pr.maturity.noise_influence > cold_pr.maturity.noise_influence, + "warm sentinel should have higher noise_influence due to injected noise ({:.4} vs {:.4})", + warm_pr.maturity.noise_influence, + cold_pr.maturity.noise_influence + ); + + // Warm sentinel should have more noise observations. + assert!(warm_pr.maturity.noise_observations > 0); + assert_eq!(cold_pr.maturity.noise_observations, 0); +} + +// ── Eviction + reingest ───────────────────────────────────── + +#[test] +fn eviction_then_reingest_creates_fresh_trackers() { + let mut s = SpectralSentinel::new(integration_config()).unwrap(); + + // Phase 1: ingest to campus A, inject noise. + s.ingest(&campus_values(0xF, 4)); + s.inject_noise(&NoiseParams { + rounds: 10, + batch_size: 4, + seed: Some(42), + }); + for _ in 0..5 { + drop(s.ingest(&campus_values(0xF, 8))); + } + + // Verify campus has matured. + let h1 = s.health(); + assert!(h1.maturity_distribution.max_noise_influence < 1.0); + + // Phase 2: evict. + let campus_f = CampusId::from_value(0xF000_0000_0000_0000_0000_0000_0000_0001, 4); + assert!(s.evict_campus(campus_f)); + assert_eq!(s.campuses_seen(), 0); + + // Phase 3: re-ingest to the same campus. + let report = s.ingest(&campus_values(0xF, 4)); + let pr = &report.campus_reports[0].prefix_reports[0]; + + // Fresh tracker starts at η = 1.0 but the 4 real observations + // in this batch decay it to λ^4. 
+ let expected_ni = 0.95_f64.powi(4); + assert!( + (pr.maturity.noise_influence - expected_ni).abs() < 1e-10, + "re-created tracker noise_influence should be λ^4 ≈ {expected_ni:.6}, got {:.6}", + pr.maturity.noise_influence, + ); + assert_eq!(pr.maturity.noise_observations, 0); + assert_eq!(pr.maturity.real_observations, 4); // just this batch +} + +// ── reset() tests ─────────────────────────────────────────── + +#[test] +fn reset_clears_all_state() { + let mut s = seeded_sentinel(); + s.inject_noise(&small_noise_params()); + + assert!(s.campuses_seen() > 0); + assert!(s.lifetime_observations() > 0); + + s.reset(); + + assert_eq!(s.campuses_seen(), 0); + assert_eq!(s.active_tracker_count(), 0); + assert_eq!(s.lifetime_observations(), 0); + assert!(s.campus_ids().is_empty()); +} + +#[test] +fn reset_preserves_config() { + let cfg = test_config(); + let mut s = SpectralSentinel::new(cfg).unwrap(); + s.ingest(&[0xF000_0000_0000_0000_0000_0000_0000_0001]); + + s.reset(); + + // Config is intact — can immediately ingest again. + let report = s.ingest(&[0xF000_0000_0000_0000_0000_0000_0000_0001]); + assert_eq!(report.campus_reports.len(), 1); + assert_eq!(report.lifetime_observations, 1); +} + +#[test] +fn reset_reinitialises_meta_trackers() { + let mut s = seeded_sentinel(); + s.inject_noise(&small_noise_params()); + + // Meta-trackers should have seen noise. + let h_before = s.health(); + assert!(h_before.meta_tracker_health.maturity_distribution.max_noise_influence < 1.0); + + s.reset(); + + // After reset, meta-trackers are cold again. 
+ let h_after = s.health(); + assert!( + (h_after.meta_tracker_health.maturity_distribution.max_noise_influence - 1.0).abs() < f64::EPSILON, + "meta-trackers should be cold after reset" + ); +} diff --git a/packages/sentinel/tests/noise.rs b/packages/sentinel/tests/noise.rs new file mode 100644 index 00000000..9005bacf --- /dev/null +++ b/packages/sentinel/tests/noise.rs @@ -0,0 +1,149 @@ +mod common; + +use common::{seeded_sentinel, small_noise_params, test_config}; +use torrust_sentinel::sentinel::{NoiseParams, SpectralSentinel}; + +#[test] +fn inject_noise_on_empty_sentinel_is_noop() { + let mut s = SpectralSentinel::new(test_config()).unwrap(); + s.inject_noise(&small_noise_params()); // should not panic + assert_eq!(s.campuses_seen(), 0); + assert_eq!(s.lifetime_observations(), 0); +} + +#[test] +fn inject_noise_warms_baselines() { + let mut s = seeded_sentinel(); + s.inject_noise(&small_noise_params()); + + for campus_id in s.campus_ids() { + let inspection = s.inspect_campus(campus_id).expect("campus should exist"); + for ti in &inspection.trackers { + assert!( + ti.maturity.noise_observations > 0, + "tracker should have received noise observations" + ); + assert!( + ti.maturity.noise_influence < 1.0, + "noise_influence should have decayed below 1.0" + ); + } + } +} + +#[test] +fn inject_noise_resets_cusum() { + let mut s = seeded_sentinel(); + s.inject_noise(&small_noise_params()); + + // Ingest one real batch to produce a report with CUSUM snapshots. + let report = s.ingest(&[ + 0xF000_0000_0000_0000_0000_0000_0000_AAAA, + 0x1000_0000_0000_0000_0000_0000_0000_BBBB, + ]); + + for cr in &report.campus_reports { + for pr in &cr.prefix_reports { + // steps_since_reset should be exactly 1 (the one real batch + // we just ingested after the reset). 
+ assert_eq!( + pr.scores.novelty.cusum.steps_since_reset, 1, + "CUSUM should have been reset after noise injection" + ); + } + } +} + +#[test] +fn inject_noise_warms_meta_trackers() { + let mut s = seeded_sentinel(); + s.inject_noise(&small_noise_params()); + + let mh = s.health().meta_tracker_health; + assert!( + mh.maturity_distribution.max_noise_influence < 1.0, + "meta-trackers should have received noise observations" + ); +} + +#[test] +fn inject_noise_is_deterministic_with_same_seed() { + let mut s1 = seeded_sentinel(); + let mut s2 = seeded_sentinel(); + + let params = small_noise_params(); // seed = Some(42) + s1.inject_noise(¶ms); + s2.inject_noise(¶ms); + + // After identical noise, maturity should match across all campuses. + let ids1 = s1.campus_ids(); + let ids2 = s2.campus_ids(); + assert_eq!(ids1, ids2); + + for campus_id in ids1 { + let insp1 = s1.inspect_campus(campus_id).unwrap(); + let insp2 = s2.inspect_campus(campus_id).unwrap(); + for (t1, t2) in insp1.trackers.iter().zip(insp2.trackers.iter()) { + assert_eq!(t1.maturity.noise_observations, t2.maturity.noise_observations); + assert!( + (t1.maturity.noise_influence - t2.maturity.noise_influence).abs() < f64::EPSILON, + "noise influence should be identical with same seed" + ); + } + } +} + +#[test] +fn inject_noise_differs_with_different_seed() { + let mut s1 = seeded_sentinel(); + let mut s2 = seeded_sentinel(); + + s1.inject_noise(&NoiseParams { + seed: Some(42), + ..small_noise_params() + }); + s2.inject_noise(&NoiseParams { + seed: Some(999), + ..small_noise_params() + }); + + // After different noise, at least one tracker should report + // a different post-injection score pattern. We check by + // ingesting the same real traffic and comparing reports. 
+ let probe = vec![ + 0xF000_0000_0000_0000_0000_0000_0000_0001, + 0x1000_0000_0000_0000_0000_0000_0000_0002, + ]; + let r1 = s1.ingest(&probe); + let r2 = s2.ingest(&probe); + + // The novelty means should differ because the warmed baselines + // diverged. (It's theoretically possible for them to match, + // but astronomically unlikely with different seeds.) + let means1: Vec = r1 + .campus_reports + .iter() + .flat_map(|cr| cr.prefix_reports.iter().map(|pr| pr.scores.novelty.mean)) + .collect(); + let means2: Vec = r2 + .campus_reports + .iter() + .flat_map(|cr| cr.prefix_reports.iter().map(|pr| pr.scores.novelty.mean)) + .collect(); + + assert_ne!(means1, means2, "different seeds should produce different baselines"); +} + +#[test] +fn inject_noise_does_not_count_as_real_observations() { + let mut s = seeded_sentinel(); + let obs_before = s.lifetime_observations(); + + s.inject_noise(&small_noise_params()); + + assert_eq!( + s.lifetime_observations(), + obs_before, + "noise injection should not increment lifetime_observations" + ); +} diff --git a/project-words.txt b/project-words.txt index 01908fce..fe4c4487 100644 --- a/project-words.txt +++ b/project-words.txt @@ -1,11 +1,17 @@ actix addrs alekitto +analysing +artefacts +asym AUTOINCREMENT +behaviour bencode bencoded Benoit +bijective binascii +bmatrix btih buildx camino @@ -19,6 +25,7 @@ Containerfile creativecommons creds Culqt +CUSUM Cyberneering datetime DATETIME @@ -28,6 +35,9 @@ dotless dtolnay elif Eray +EWMA +faer +granularities grcov Grünwald hasher @@ -40,6 +50,10 @@ imdl indexadmin indexmap infohash +infty +initialisation +initialised +initialises Intermodal jsonwebtoken Karatay @@ -51,6 +65,8 @@ libsqlite luckythelab mailcatcher mandelbrotset +mathbb +mathbf metainfo Mgmt migth @@ -59,15 +75,25 @@ NCCA nextest nilm nocapture +normalisation +nrows Oberhachingerstr oneshot openbittorrent opentrackr +operatorname +organised ppassword programatik proxied rapppid +recognise +redund +reingest 
+reinitialises reqwest +resid +rngs Roadmap ROADMAP rowid @@ -82,9 +108,11 @@ singlepart sqlx strftime struct +subcols sublicensable sublist subpoints +surp Swatinem taiki tempdir @@ -99,7 +127,9 @@ upgrader Uragqm urlencoding uroot +usize uuidgen +vecs Verstappen waivable webseeding