diff --git a/Cargo.lock b/Cargo.lock index 55df44e76..ba4a07867 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -400,6 +400,52 @@ dependencies = [ "tower-service", ] +[[package]] +name = "barkus-antlr" +version = "0.1.0" +source = "git+https://github.com/DataDog/barkus?rev=45e307e87900a709398fc878940f33ea04507aa2#45e307e87900a709398fc878940f33ea04507aa2" +dependencies = [ + "barkus-core", + "barkus-parser-common", +] + +[[package]] +name = "barkus-core" +version = "0.1.0" +source = "git+https://github.com/DataDog/barkus?rev=45e307e87900a709398fc878940f33ea04507aa2#45e307e87900a709398fc878940f33ea04507aa2" +dependencies = [ + "rand 0.10.0", + "serde", + "serde_json", +] + +[[package]] +name = "barkus-ebnf" +version = "0.1.0" +source = "git+https://github.com/DataDog/barkus?rev=45e307e87900a709398fc878940f33ea04507aa2#45e307e87900a709398fc878940f33ea04507aa2" +dependencies = [ + "barkus-core", + "barkus-parser-common", +] + +[[package]] +name = "barkus-parser-common" +version = "0.1.0" +source = "git+https://github.com/DataDog/barkus?rev=45e307e87900a709398fc878940f33ea04507aa2#45e307e87900a709398fc878940f33ea04507aa2" +dependencies = [ + "barkus-core", + "rand 0.10.0", +] + +[[package]] +name = "barkus-peg" +version = "0.1.0" +source = "git+https://github.com/DataDog/barkus?rev=45e307e87900a709398fc878940f33ea04507aa2#45e307e87900a709398fc878940f33ea04507aa2" +dependencies = [ + "barkus-core", + "barkus-parser-common", +] + [[package]] name = "base64" version = "0.22.1" @@ -598,6 +644,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.0", +] + [[package]] name = "chrono" version = "0.4.42" @@ 
-709,7 +766,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3bb320cac8a0750d7f25280aa97b09c26edfe161164238ecbbb31092b079e735" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "proptest", "serde_core", ] @@ -769,6 +826,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1081,6 +1147,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foldhash" version = "0.2.0" @@ -1276,8 +1348,22 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.0", "wasip2", + "wasip3", ] [[package]] @@ -1320,13 +1406,22 @@ dependencies = [ "ahash 0.7.8", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + [[package]] name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" 
dependencies = [ - "foldhash", + "foldhash 0.2.0", ] [[package]] @@ -1656,6 +1751,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "idna" version = "1.1.0" @@ -1975,6 +2076,10 @@ name = "lading-payload" version = "0.1.0" dependencies = [ "arbitrary", + "barkus-antlr", + "barkus-core", + "barkus-ebnf", + "barkus-peg", "byte-unit", "bytes", "criterion", @@ -1983,6 +2088,7 @@ dependencies = [ "proptest", "proptest-derive", "prost", + "rand 0.10.0", "rand 0.9.2", "rmp-serde", "rustc-hash", @@ -1990,6 +2096,7 @@ dependencies = [ "serde_json", "serde_tuple", "serde_yaml", + "tempfile", "thiserror 2.0.17", "time", "tokio", @@ -2023,6 +2130,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "lexical-core" version = "1.0.6" @@ -3026,6 +3139,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "radium" version = "0.7.0" @@ -3063,6 +3182,17 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + 
"rand_core 0.10.0", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -3101,6 +3231,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -3704,7 +3840,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -4395,6 +4531,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -4513,7 +4655,16 @@ version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.46.0", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", ] [[package]] @@ -4574,6 +4725,40 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.12.1", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap 2.12.1", + "semver", +] + [[package]] name = "web-sys" version = "0.3.83" @@ -4924,6 +5109,94 @@ version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.12.1", + "prettyplease", + "syn 2.0.113", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.113", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] 
+name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.12.1", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.12.1", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "writeable" version = "0.6.2" diff --git a/Cargo.toml b/Cargo.toml index dbc185a33..79bda140e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,10 @@ ddsketch-agent = { git = "https://github.com/DataDog/saluki", rev = "f47a7ef588c datadog-protos = { git = "https://github.com/DataDog/saluki", rev = "f47a7ef588c53aa1da35dcfd93808595ebeb1291" } protobuf = { version = "3.7" } enum_dispatch = { version = "0.3" } +barkus-core = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" } +barkus-ebnf = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" } +barkus-peg = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" } +barkus-antlr = { git = "https://github.com/DataDog/barkus", rev = "45e307e87900a709398fc878940f33ea04507aa2" } [workspace.lints.clippy] all = "deny" diff --git a/deny.toml b/deny.toml index 3a19e3ec6..b52238328 100644 --- a/deny.toml +++ b/deny.toml @@ -16,9 +16,36 @@ allow = [ ] unused-allowed-license = "allow" +# barkus is a DataDog-internal project whose crates do not yet declare a +# license field. The repo itself is MIT-licensed (see its LICENSE file). 
+[[licenses.clarify]] +name = "barkus-core" +expression = "MIT" +license-files = [] + +[[licenses.clarify]] +name = "barkus-ebnf" +expression = "MIT" +license-files = [] + +[[licenses.clarify]] +name = "barkus-peg" +expression = "MIT" +license-files = [] + +[[licenses.clarify]] +name = "barkus-antlr" +expression = "MIT" +license-files = [] + +[[licenses.clarify]] +name = "barkus-parser-common" +expression = "MIT" +license-files = [] + [sources] unknown-git = "deny" -allow-git = ["https://github.com/DataDog/saluki"] +allow-git = ["https://github.com/DataDog/saluki", "https://github.com/DataDog/barkus"] [advisories] version = 2 diff --git a/examples/grammars/json.ebnf b/examples/grammars/json.ebnf new file mode 100644 index 000000000..acd2482a6 --- /dev/null +++ b/examples/grammars/json.ebnf @@ -0,0 +1,15 @@ +start = value , "\n" ; + +value = object | array | string | number | "true" | "false" | "null" ; + +object = "{" [ pair { "," pair } ] "}" ; +pair = string ":" value ; + +array = "[" [ value { "," value } ] "]" ; + +string = '"' { character } '"' ; +character = "a" | "b" | "c" | "d" | "e" | "f" | "x" | "y" | "z" + | "0" | "1" | "2" | "3" ; + +number = [ "-" ] digit { digit } [ "." 
digit { digit } ] ; +digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; diff --git a/examples/lading-grammar.yaml b/examples/lading-grammar.yaml new file mode 100644 index 000000000..0208b3dde --- /dev/null +++ b/examples/lading-grammar.yaml @@ -0,0 +1,17 @@ +generator: + - tcp: + seed: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, + 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131] + addr: "127.0.0.1:8282" + bytes_per_second: "10 MiB" + maximum_prebuild_cache_size_bytes: "64 MiB" + variant: + grammar: + grammar_path: "examples/grammars/json.ebnf" + format: ebnf + max_depth: 20 + max_total_nodes: 5000 + +blackhole: + - tcp: + binding_addr: "0.0.0.0:8282" diff --git a/lading_payload/Cargo.toml b/lading_payload/Cargo.toml index ef9be7d54..6719058a7 100644 --- a/lading_payload/Cargo.toml +++ b/lading_payload/Cargo.toml @@ -30,12 +30,18 @@ tracing = { workspace = true } tokio = { workspace = true } arbitrary = { version = "1", optional = true, features = ["derive"] } enum_dispatch = { workspace = true } +barkus-core = { workspace = true } +barkus-ebnf = { workspace = true } +barkus-peg = { workspace = true } +barkus-antlr = { workspace = true } +rand_0_10 = { package = "rand", version = "0.10", default-features = false } [dev-dependencies] proptest = { workspace = true } proptest-derive = { workspace = true } criterion = { version = "0.8", features = ["html_reports"] } rustc-hash = { workspace = true } +tempfile = { workspace = true } [features] default = [] diff --git a/lading_payload/README.grammar.md b/lading_payload/README.grammar.md new file mode 100644 index 000000000..55bf00722 --- /dev/null +++ b/lading_payload/README.grammar.md @@ -0,0 +1,106 @@ +# Grammar payload + +Generates structured data from EBNF, PEG, or ANTLR v4 grammar files using +[barkus](https://github.com/DataDog/barkus). 
Samples are concatenated directly +with no injected delimiters — if you need newline-delimited output, include a +trailing `\n` in your grammar's start production. + +## Using it in a lading config + +Reference the variant by name with a `grammar_path` pointing at the grammar +file and a `format` field indicating the grammar type. + +**TCP generator** (variant is a top-level field): + +```yaml +generator: + - tcp: + seed: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32] + addr: "127.0.0.1:8080" + bytes_per_second: "10 MiB" + maximum_prebuild_cache_size_bytes: "64 MiB" + variant: + grammar: + grammar_path: "/path/to/json.ebnf" + format: ebnf + max_depth: 20 + max_total_nodes: 5000 +``` + +**HTTP generator** (variant is nested under `method.post`): + +```yaml +generator: + - http: + seed: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32] + headers: {} + target_uri: "http://127.0.0.1:8080/" + bytes_per_second: "1 MiB" + parallel_connections: 1 + method: + post: + maximum_prebuild_cache_size_bytes: "10 MiB" + variant: + grammar: + grammar_path: "/path/to/sql.g4" + format: antlr +``` + +## Configuration fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `grammar_path` | path | (required) | Path to the grammar file. Can be absolute or relative to the lading working directory. | +| `format` | string | (required) | Grammar format: `ebnf`, `peg`, or `antlr`. | +| `max_depth` | integer | 30 | Maximum recursion depth for the grammar expansion. Lower values produce smaller, simpler output. | +| `max_total_nodes` | integer | 20000 | Maximum AST nodes per generated sample. Limits the total size of each output. | + +## Supported grammar formats + +- **EBNF** (`format: ebnf`): ISO/IEC 14977 Extended Backus-Naur Form. Rules + use `=` and terminate with `;`. 
Repetition with `{ }`, optional with `[ ]`. +- **PEG** (`format: peg`): Parsing Expression Grammars. Rules use `<-` or `=`. + Ordered choice with `/`, quantifiers `?`, `*`, `+`. +- **ANTLR** (`format: antlr`): ANTLR v4 combined or parser grammars (`.g4` + files). Rules use `:` and terminate with `;`. Supports `grammar Name;` + headers and `fragment` rules. + +## Example EBNF grammar (simplified JSON) + +```ebnf +(* Wrap the start production with a trailing newline for line-delimited output. *) +start = value , "\n" ; + +value = object | array | string | number | "true" | "false" | "null" ; + +object = "{" [ pair { "," pair } ] "}" ; +pair = string ":" value ; + +array = "[" [ value { "," value } ] "]" ; + +string = '"' { character } '"' ; +character = "a" | "b" | "c" | "x" | "y" | "z" | "0" | "1" | "2" ; + +number = [ "-" ] digit { digit } [ "." digit { digit } ] ; +digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; +``` + +This produces output like: + +``` +{"a1":true,"xz":[false,42,-7.3]} +"abc" +[null,{"y":0}] +``` + +## Tuning tips + +- Start with the defaults (`max_depth: 30`, `max_total_nodes: 20000`). Lower + `max_depth` if you want shallower output or faster generation. +- Increase `max_total_nodes` for grammars with many terminals per sample (e.g., + large SQL statements). +- If the grammar's start rule requires deep recursion, ensure `max_depth` is at + least as large as the start rule's minimum depth. Lading validates this at + startup and will report an error if the depth budget is too small. diff --git a/lading_payload/src/block.rs b/lading_payload/src/block.rs index 4acb0d4e5..186635dcc 100644 --- a/lading_payload/src/block.rs +++ b/lading_payload/src/block.rs @@ -401,6 +401,17 @@ impl Cache { construct_block_cache_inner(rng, &mut pyld, maximum_block_bytes, total_bytes.get())? 
} + crate::Config::Grammar(config) => { + let mut serializer = crate::Grammar::new(config)?; + let span = span!(Level::INFO, "fixed", payload = "grammar"); + let _guard = span.enter(); + construct_block_cache_inner( + &mut rng, + &mut serializer, + maximum_block_bytes, + total_bytes.get(), + )? + } }; let total_cycle_size = blocks diff --git a/lading_payload/src/grammar.rs b/lading_payload/src/grammar.rs new file mode 100644 index 000000000..6f89341d9 --- /dev/null +++ b/lading_payload/src/grammar.rs @@ -0,0 +1,382 @@ +//! Grammar-based payload generation via [barkus](https://github.com/DataDog/barkus). +//! +//! This module wraps barkus — a structure-aware grammar-based data generator — +//! as a lading payload type. It accepts EBNF, PEG, or ANTLR v4 grammar files +//! and generates conforming structured output. +//! +//! Unlike other lading payload types, the grammar generator does **not** inject +//! delimiters (e.g. newlines) between samples. If you need newline-delimited +//! output, include the newline in your grammar's start production. + +use std::{io::Write, path::PathBuf}; + +use barkus_core::error::GenerateError; +use barkus_core::generate; +use barkus_core::profile::Profile; +use rand::Rng; +use rand_0_10::SeedableRng; +use serde::Deserialize; +use tracing::warn; + +use crate::Error; + +/// Rand 0.10 `SmallRng`, bridging lading's rand 0.9 to barkus's rand 0.10. +type BarkusRng = rand_0_10::rngs::SmallRng; + +/// Maximum number of consecutive generation failures (empty output or budget +/// exhaustion) before we stop retrying and return what we have so far. +const MAX_CONSECUTIVE_FAILURES: u32 = 1_000; + +/// Default max recursion depth for grammar generation. +const DEFAULT_MAX_DEPTH: u32 = 30; + +/// Default max total AST nodes per generated sample. +const DEFAULT_MAX_TOTAL_NODES: u32 = 20_000; + +/// Grammar format understood by the parser. 
+#[derive(Debug, Deserialize, serde::Serialize, Clone, Copy, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum GrammarFormat { + /// ISO/IEC 14977 Extended Backus-Naur Form + Ebnf, + /// Parsing Expression Grammar + Peg, + /// ANTLR v4 combined or parser grammar + Antlr, +} + +/// Configuration for the grammar-based payload generator. +#[derive(Debug, Deserialize, serde::Serialize, Clone, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Config { + /// Path to the grammar file (e.g. .ebnf, .peg, .g4); parsed per `format`. + pub grammar_path: PathBuf, + /// Grammar format; selects the barkus compiler used on the file. + pub format: GrammarFormat, + /// Max recursion depth; `None` means the default of 30. + #[serde(default)] + pub max_depth: Option, + /// Max total AST nodes per sample; `None` means the default of 20000. + #[serde(default)] + pub max_total_nodes: Option, +} + +/// Grammar-based payload generator backed by barkus. +pub struct Grammar { + grammar_ir: barkus_core::ir::GrammarIr, + profile: Profile, +} + +impl std::fmt::Debug for Grammar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Grammar") + .field("start", &self.grammar_ir.start) + .field("num_productions", &self.grammar_ir.productions.len()) + .finish_non_exhaustive() + } +} + +impl Grammar { + /// Construct a new grammar payload generator from configuration. + /// + /// Reads and compiles the grammar file, then builds a generation profile + /// with any config overrides applied. Validates that the grammar's start + /// production is reachable within the configured depth budget. + /// + /// # Errors + /// + /// Returns an error if the grammar file cannot be read, parsed, or if + /// the start production's minimum depth exceeds the configured max depth. 
+ pub fn new(config: &Config) -> Result { + let source = std::fs::read_to_string(&config.grammar_path)?; + let grammar_ir = match config.format { + GrammarFormat::Ebnf => { + barkus_ebnf::compile(&source).map_err(|e| Error::Grammar(e.to_string()))? + } + GrammarFormat::Peg => { + barkus_peg::compile(&source).map_err(|e| Error::Grammar(e.to_string()))? + } + GrammarFormat::Antlr => { + barkus_antlr::compile(&source).map_err(|e| Error::Grammar(e.to_string()))? + } + }; + + let mut builder = Profile::builder(); + builder = builder.max_depth(config.max_depth.unwrap_or(DEFAULT_MAX_DEPTH)); + builder = + builder.max_total_nodes(config.max_total_nodes.unwrap_or(DEFAULT_MAX_TOTAL_NODES)); + let profile = builder.build(); + + // Validate that the start production can be reached within the depth + // budget. Without this check a grammar whose min_depth exceeds + // max_depth would cause every generation attempt to fail with + // BudgetExhausted, wasting time during block cache construction. + // Barkus guarantees that `start` is a valid index into `productions` + // (validated during grammar compilation). See barkus ADR-0010. + let start_min_depth = grammar_ir.productions[grammar_ir.start].attrs.min_depth; + if start_min_depth > profile.max_depth { + return Err(Error::Grammar(format!( + "start production requires minimum depth {start_min_depth} \ + but max_depth is {}", + profile.max_depth, + ))); + } + + Ok(Self { + grammar_ir, + profile, + }) + } +} + +impl crate::Serialize for Grammar { + fn to_bytes(&mut self, mut rng: R, max_bytes: usize, writer: &mut W) -> Result<(), Error> + where + R: Rng + Sized, + W: Write, + { + if max_bytes == 0 { + return Ok(()); + } + + // Bridge rand 0.9 (lading) -> rand 0.10 (barkus) by seeding a local + // SmallRng from the lading RNG. Each call draws a fresh seed from it, + // so output stays deterministic when the outer RNG is seeded. + // NOTE: retry seeds also come from `rng`, so the number of retries + // affects the outer RNG state. 
Determinism is preserved as long as + // both the seed and the grammar behaviour are identical. + let seed: u64 = rng.random(); + let mut barkus_rng = BarkusRng::seed_from_u64(seed); + + let mut bytes_remaining = max_bytes; + let mut consecutive_failures: u32 = 0; + + loop { + match generate::generate(&self.grammar_ir, &self.profile, &mut barkus_rng) { + Ok((ast, _tape, _tape_map)) => { + let sample = ast.serialize(); + if sample.is_empty() { + consecutive_failures += 1; + if consecutive_failures >= MAX_CONSECUTIVE_FAILURES { + break; + } + continue; + } + consecutive_failures = 0; + let needed = sample.len(); + let Some(remainder) = bytes_remaining.checked_sub(needed) else { + break; + }; + writer.write_all(&sample)?; + bytes_remaining = remainder; + } + Err(GenerateError::BudgetExhausted { .. }) => { + // Budget exhaustion is expected for complex grammars — + // retry with a fresh seed so we don't get stuck. + consecutive_failures += 1; + if consecutive_failures >= MAX_CONSECUTIVE_FAILURES { + break; + } + let retry_seed: u64 = rng.random(); + barkus_rng = BarkusRng::seed_from_u64(retry_seed); + } + } + } + if consecutive_failures >= MAX_CONSECUTIVE_FAILURES && bytes_remaining == max_bytes { + warn!( + "grammar generator hit {MAX_CONSECUTIVE_FAILURES} consecutive failures \ + without producing any output" + ); + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + use proptest::prelude::*; + use rand::SeedableRng; + use rand::rngs::SmallRng; + + /// Helper: write `source` to a temp file with the given extension and + /// return the `TempDir` together with a `Config` pointing to the file. 
+ fn config_from_source( + source: &str, + format: GrammarFormat, + ext: &str, + ) -> (tempfile::TempDir, Config) { + let dir = tempfile::tempdir().expect("failed to create temp dir"); + let path = dir.path().join(format!("grammar.{ext}")); + std::fs::write(&path, source).expect("failed to write grammar file"); + let config = Config { + grammar_path: path, + format, + max_depth: Some(10), + max_total_nodes: Some(1000), + }; + (dir, config) + } + + /// Call `to_bytes` on a `Grammar` instance via the fully qualified + /// `crate::Serialize::to_bytes` path, so the trait needs no `use`. + fn generate_bytes(grammar: &mut Grammar, rng: SmallRng, max_bytes: usize) -> Vec { + let mut buf = Vec::new(); + crate::Serialize::to_bytes(grammar, rng, max_bytes, &mut buf) + .expect("to_bytes should not fail"); + buf + } + + // ── EBNF ──────────────────────────────────────────────────────────── + + #[test] + fn ebnf_generates_expected_output() { + // Grammar includes a trailing newline so each sample is its own line. + let source = "greeting = ( \"hello\" | \"world\" ) , \"\\n\" ;"; + let (_dir, config) = config_from_source(source, GrammarFormat::Ebnf, "ebnf"); + + let mut grammar = Grammar::new(&config).unwrap(); + let buf = generate_bytes(&mut grammar, SmallRng::seed_from_u64(42), 1024); + + assert!(!buf.is_empty(), "grammar should produce non-empty output"); + let output = String::from_utf8_lossy(&buf); + assert!( + output.lines().count() > 1, + "expected multiple lines in 1024 byte budget" + ); + for line in output.lines() { + assert!( + line == "hello" || line == "world", + "unexpected output line: {line:?}" + ); + } + } + + // ── PEG ───────────────────────────────────────────────────────────── + + #[test] + fn peg_generates_expected_output() { + let source = "greeting <- (\"hello\" / \"world\") \"\\n\""; + let (_dir, config) = config_from_source(source, GrammarFormat::Peg, "peg"); + + let mut grammar = Grammar::new(&config).unwrap(); + let buf = generate_bytes(&mut grammar, 
SmallRng::seed_from_u64(42), 1024); + + assert!( + !buf.is_empty(), + "PEG grammar should produce non-empty output" + ); + for line in String::from_utf8_lossy(&buf).lines() { + assert!( + line == "hello" || line == "world", + "unexpected PEG output: {line:?}" + ); + } + } + + // ── ANTLR ─────────────────────────────────────────────────────────── + + #[test] + fn antlr_generates_expected_output() { + let source = "grammar Test;\ngreeting : ('hello' | 'world') '\\n' ;"; + let (_dir, config) = config_from_source(source, GrammarFormat::Antlr, "g4"); + + let mut grammar = Grammar::new(&config).unwrap(); + let buf = generate_bytes(&mut grammar, SmallRng::seed_from_u64(42), 1024); + + assert!( + !buf.is_empty(), + "ANTLR grammar should produce non-empty output" + ); + for line in String::from_utf8_lossy(&buf).lines() { + assert!( + line == "hello" || line == "world", + "unexpected ANTLR output: {line:?}" + ); + } + } + + // ── Error paths ───────────────────────────────────────────────────── + + #[test] + fn missing_grammar_file_returns_error() { + let config = Config { + grammar_path: PathBuf::from("/nonexistent/path/grammar.ebnf"), + format: GrammarFormat::Ebnf, + max_depth: Some(10), + max_total_nodes: Some(1000), + }; + assert!(Grammar::new(&config).is_err()); + } + + #[test] + fn invalid_grammar_source_returns_error() { + let source = "this is not valid EBNF @@@ !!!"; + let (_dir, config) = config_from_source(source, GrammarFormat::Ebnf, "ebnf"); + assert!(Grammar::new(&config).is_err()); + } + + #[test] + fn depth_budget_too_small_returns_error() { + // A non-recursive rule chain that needs more depth than we allow. 
+ let source = "a = b ; b = c ; c = d ; d = e ; e = \"x\" ;"; + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("deep.ebnf"); + std::fs::write(&path, source).unwrap(); + let config = Config { + grammar_path: path, + format: GrammarFormat::Ebnf, + max_depth: Some(1), // too shallow for the chain a->b->c->d->e + max_total_nodes: Some(1000), + }; + assert!(Grammar::new(&config).is_err()); + } + + // ── Determinism ───────────────────────────────────────────────────── + + #[test] + fn same_seed_produces_same_output() { + let source = "item = \"a\" | \"b\" | \"c\" | \"d\" ;"; + let (_dir, config) = config_from_source(source, GrammarFormat::Ebnf, "ebnf"); + + let mut grammar1 = Grammar::new(&config).unwrap(); + let mut grammar2 = Grammar::new(&config).unwrap(); + + let buf1 = generate_bytes(&mut grammar1, SmallRng::seed_from_u64(123), 2048); + let buf2 = generate_bytes(&mut grammar2, SmallRng::seed_from_u64(123), 2048); + + assert_eq!(buf1, buf2, "same seed must produce identical output"); + } + + // ── Property tests ────────────────────────────────────────────────── + + proptest! 
{ + #[test] + fn payload_not_exceed_max_bytes(seed: u64, max_bytes in 0..4096u16) { + let max_bytes = max_bytes as usize; + let source = "item = \"abcdefghij\" ;"; + let (_dir, config) = config_from_source(source, GrammarFormat::Ebnf, "ebnf"); + let mut grammar = Grammar::new(&config).unwrap(); + let buf = generate_bytes(&mut grammar, SmallRng::seed_from_u64(seed), max_bytes); + prop_assert!( + buf.len() <= max_bytes, + "output {} bytes exceeds budget of {max_bytes}", + buf.len() + ); + } + } + + // ── Zero budget ───────────────────────────────────────────────────── + + #[test] + fn zero_budget_produces_empty_output() { + let source = "item = \"hello\" ;"; + let (_dir, config) = config_from_source(source, GrammarFormat::Ebnf, "ebnf"); + let mut grammar = Grammar::new(&config).unwrap(); + let buf = generate_bytes(&mut grammar, SmallRng::seed_from_u64(1), 0); + assert!(buf.is_empty()); + } +} diff --git a/lading_payload/src/lib.rs b/lading_payload/src/lib.rs index 7da716c56..6b37ee5f8 100644 --- a/lading_payload/src/lib.rs +++ b/lading_payload/src/lib.rs @@ -21,6 +21,7 @@ pub use ascii::Ascii; pub use datadog_logs::DatadogLog; pub use dogstatsd::DogStatsD; pub use fluent::Fluent; +pub use grammar::Grammar; pub use json::Json; pub use opentelemetry::log::OpentelemetryLogs; pub use opentelemetry::metric::OpentelemetryMetrics; @@ -37,6 +38,7 @@ pub mod common; pub mod datadog_logs; pub mod dogstatsd; pub mod fluent; +pub mod grammar; pub mod json; pub mod opentelemetry; pub mod procfs; @@ -88,6 +90,9 @@ pub enum Error { /// Template generation error (invalid references, unbound variables, etc.) #[error("Template generation error: {0}")] TemplateError(String), + /// Grammar parse or compilation error + #[error("Grammar error: {0}")] + Grammar(String), } /// To serialize into bytes @@ -171,6 +176,8 @@ pub enum Config { /// startup and can be shared across multiple lading configs. 
template_path: PathBuf, }, + /// Generates payloads from a grammar file via barkus + Grammar(crate::grammar::Config), } /// Unified payload type for all serializers @@ -207,6 +214,8 @@ pub enum Payload { TraceAgent(crate::trace_agent::v04::V04), /// JSON generated from a user-supplied template file TemplatedJson(TemplatedJson), + /// Grammar-based payload via barkus + Grammar(Grammar), } impl Serialize for Payload { @@ -231,6 +240,7 @@ impl Serialize for Payload { Payload::DogStatsdD(ser) => ser.to_bytes(rng, max_bytes, writer), Payload::TraceAgent(ser) => ser.to_bytes(rng, max_bytes, writer), Payload::TemplatedJson(ser) => ser.to_bytes(rng, max_bytes, writer), + Payload::Grammar(ser) => ser.to_bytes(rng, max_bytes, writer), } }