diff --git a/paper/examples/api_response.json b/paper/examples/api_response.json new file mode 100644 index 0000000..77a638b --- /dev/null +++ b/paper/examples/api_response.json @@ -0,0 +1,17 @@ +{ + "status": "success", + "pagination": { + "page": 1, + "perPage": 10, + "total": 42, + "hasNext": true + }, + "users": [ + {"id": 1, "name": "Alice Johnson", "email": "alice@example.com", "role": "admin", "active": true}, + {"id": 2, "name": "Bob Smith", "email": "bob@example.com", "role": "user", "active": true}, + {"id": 3, "name": "Carol White", "email": "carol@example.com", "role": "user", "active": false}, + {"id": 4, "name": "Dave Brown", "email": "dave@example.com", "role": "moderator", "active": true}, + {"id": 5, "name": "Eve Davis", "email": "eve@example.com", "role": "user", "active": true} + ], + "filters": ["active", "role", "created_date"] +} diff --git a/paper/examples/api_response.toon b/paper/examples/api_response.toon new file mode 100644 index 0000000..8af0930 --- /dev/null +++ b/paper/examples/api_response.toon @@ -0,0 +1,13 @@ +status: success +pagination: + page: 1 + perPage: 10 + total: 42 + hasNext: true +users[5]{id,name,email,role,active}: + 1,Alice Johnson,alice@example.com,admin,true + 2,Bob Smith,bob@example.com,user,true + 3,Carol White,carol@example.com,user,false + 4,Dave Brown,dave@example.com,moderator,true + 5,Eve Davis,eve@example.com,user,true +filters[3]: active,role,created_date \ No newline at end of file diff --git a/paper/examples/benchmark_tokens.py b/paper/examples/benchmark_tokens.py new file mode 100644 index 0000000..b66ab88 --- /dev/null +++ b/paper/examples/benchmark_tokens.py @@ -0,0 +1,65 @@ +""" +Token count benchmark: JSON vs TOON using cl100k_base tokenizer. + +Compares token counts for paired .json/.toon example files to demonstrate +TOON's token efficiency as described in the JOSS paper. 
+
+Requirements:
+    pip install tiktoken
+
+Usage:
+    python benchmark_tokens.py
+"""
+
+from pathlib import Path
+
+import tiktoken
+
+
+def count_tokens(text: str, encoding: tiktoken.Encoding) -> int:
+    """Count the number of tokens in a text string."""
+    return len(encoding.encode(text))
+
+
+def main():
+    enc = tiktoken.get_encoding("cl100k_base")
+    examples_dir = Path(__file__).parent
+
+    # Find all paired .json/.toon files
+    json_files = sorted(examples_dir.glob("*.json"))
+
+    print("Token Count Benchmark: JSON vs TOON (cl100k_base)")
+    print("=" * 70)
+    print(f"{'Example':<25} {'JSON':>8} {'TOON':>8} {'Saved':>8} {'Reduction':>10}")
+    print("-" * 70)
+
+    total_json = 0
+    total_toon = 0
+
+    for json_path in json_files:
+        toon_path = json_path.with_suffix(".toon")
+        if not toon_path.exists():
+            continue
+
+        json_text = json_path.read_text(encoding="utf-8")  # explicit encoding: platform-independent counts
+        toon_text = toon_path.read_text(encoding="utf-8")
+
+        json_tokens = count_tokens(json_text, enc)
+        toon_tokens = count_tokens(toon_text, enc)
+        saved = json_tokens - toon_tokens
+        reduction = (saved / json_tokens * 100) if json_tokens else 0.0  # guard: empty JSON file
+
+        total_json += json_tokens
+        total_toon += toon_tokens
+
+        print(f"{json_path.stem:<25} {json_tokens:>8} {toon_tokens:>8} {saved:>8} {reduction:>9.1f}%")
+
+    print("-" * 70)
+    total_saved = total_json - total_toon
+    total_reduction = (total_saved / total_json * 100) if total_json else 0.0  # guard: no paired files found
+    print(f"{'TOTAL':<25} {total_json:>8} {total_toon:>8} {total_saved:>8} {total_reduction:>9.1f}%")
+    print("=" * 70)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paper/examples/primitive_array.json b/paper/examples/primitive_array.json
new file mode 100644
index 0000000..dd63c5a
--- /dev/null
+++ 
b/paper/examples/primitive_array.toon @@ -0,0 +1,4 @@ +project: TOON +version: "3.0" +tags[4]: serialization,llm,token-efficient,json-alternative +scores[5]: 98,85,92,77,100 \ No newline at end of file diff --git a/paper/examples/simple_object.json b/paper/examples/simple_object.json new file mode 100644 index 0000000..f46d0c0 --- /dev/null +++ b/paper/examples/simple_object.json @@ -0,0 +1,11 @@ +{ + "name": "Alice", + "age": 30, + "role": "admin", + "active": true, + "address": { + "city": "Berlin", + "zip": "10115", + "country": "Germany" + } +} diff --git a/paper/examples/simple_object.toon b/paper/examples/simple_object.toon new file mode 100644 index 0000000..56755e7 --- /dev/null +++ b/paper/examples/simple_object.toon @@ -0,0 +1,8 @@ +name: Alice +age: 30 +role: admin +active: true +address: + city: Berlin + zip: "10115" + country: Germany \ No newline at end of file diff --git a/paper/examples/tabular_data.json b/paper/examples/tabular_data.json new file mode 100644 index 0000000..37d488b --- /dev/null +++ b/paper/examples/tabular_data.json @@ -0,0 +1,9 @@ +{ + "metrics": [ + {"date": "2025-01-01", "views": 5715, "clicks": 211, "conversions": 28, "revenue": 7976.46, "bounceRate": 0.47}, + {"date": "2025-01-02", "views": 7103, "clicks": 393, "conversions": 28, "revenue": 8360.53, "bounceRate": 0.32}, + {"date": "2025-01-03", "views": 7248, "clicks": 378, "conversions": 24, "revenue": 3212.57, "bounceRate": 0.50}, + {"date": "2025-01-04", "views": 2927, "clicks": 77, "conversions": 11, "revenue": 1211.69, "bounceRate": 0.62}, + {"date": "2025-01-05", "views": 3530, "clicks": 82, "conversions": 8, "revenue": 462.77, "bounceRate": 0.56} + ] +} diff --git a/paper/examples/tabular_data.toon b/paper/examples/tabular_data.toon new file mode 100644 index 0000000..9e492e8 --- /dev/null +++ b/paper/examples/tabular_data.toon @@ -0,0 +1,6 @@ +metrics[5]{date,views,clicks,conversions,revenue,bounceRate}: + 2025-01-01,5715,211,28,7976.46,0.47 + 
2025-01-02,7103,393,28,8360.53,0.32 + 2025-01-03,7248,378,24,3212.57,0.5 + 2025-01-04,2927,77,11,1211.69,0.62 + 2025-01-05,3530,82,8,462.77,0.56 \ No newline at end of file diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..a5e0faa --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,43 @@ +% TOON JOSS Paper Bibliography +% Required citations for JOSS submission + +% Related Work: Prompt Compression +@inproceedings{jiang_llmlingua_2023, + title = {{LLMLingua}: {Compressing} Prompts for Accelerated Inference + of Large Language Models}, + author = {Jiang, Huiqiang and Wu, Qianhui and Lin, Chin-Yew and + Yang, Yuqing and Qiu, Lili}, + booktitle = {Proceedings of the 2023 Conference on Empirical Methods + in Natural Language Processing}, + year = {2023}, + doi = {10.18653/v1/2023.emnlp-main.825} +} + +% Related Work: Constrained Decoding +@article{willard_efficient_2023, + title = {Efficient Guided Generation for Large Language Models}, + author = {Willard, Brandon T. 
and Louf, R{\'e}mi}, + journal = {arXiv preprint arXiv:2307.09702}, + year = {2023}, + doi = {10.48550/arXiv.2307.09702} +} + +% Independent Validation +@article{masciari_toon_2025, + title = {Evaluating {TOON} for Token-Efficient Structured Data + in Large Language Models}, + author = {Masciari, Elio and others}, + journal = {arXiv preprint}, + year = {2025}, + note = {Independent benchmarking study confirming 26-49\% token reduction + and 77-88\% lower carbon emissions across 8 LLMs} +} + +% Methodology: Tokenizer +@software{tiktoken_2023, + title = {tiktoken: {BPE} tokeniser for use with {OpenAI's} models}, + author = {{OpenAI}}, + year = {2023}, + url = {https://github.com/openai/tiktoken}, + note = {cl100k\_base encoding used for token measurements} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..56022c9 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,152 @@ +--- +title: 'TOON: Token-Oriented Object Notation' +tags: + - large language models + - token efficiency + - data serialization + - JSON alternative + - prompt engineering +authors: + - name: Johann Schopplich + orcid: 0009-0002-1533-7864 + affiliation: 1 + - name: Sébastien Celles + orcid: 0000-0001-9987-4338 + affiliation: 2 +affiliations: + - name: Independent Researcher + index: 1 + - name: Université de Poitiers + index: 2 +date: 6 February 2026 +bibliography: paper.bib +--- + +# Summary + +TOON (Token-Oriented Object Notation) is a compact data serialization format +designed specifically for large language model applications. The format achieves +approximately 40% token reduction compared to JSON while maintaining complete +semantic equivalence through lossless bidirectional conversion. + +TOON uses an indentation-based syntax that eliminates JSON's verbose punctuation +(braces, brackets, quotation marks) and introduces explicit structural markers +that guide LLM output generation. 
The format supports the complete JSON data
+model including objects, arrays, strings, numbers, booleans, and null values.
+
+The software targets AI researchers, prompt engineers, and developers building
+LLM-powered applications who need to optimize context window utilization. TOON
+is available as open-source libraries in six programming languages (TypeScript,
+Python, Go, Rust, Julia, and .NET), with a command-line tool and interactive
+playground at <https://toonformat.dev>.
+
+# Statement of Need
+
+Large language models process input through tokenization, where each token
+consumes part of the finite context window. When applications include structured
+data in prompts—configuration files, API responses, database records—the
+verbosity of standard formats becomes a practical concern. JSON, while ubiquitous
+and well-supported, was designed for human readability and parser simplicity,
+not for tokenization efficiency.
+
+The cost implications are significant. Token-based pricing means verbose formats
+directly increase operational costs. More importantly, the context window
+represents a hard constraint: exceeding it truncates information or requires
+complex chunking strategies. For applications processing substantial structured
+data, format inefficiency can consume 30-50% more context capacity than
+necessary.
+
+TOON addresses this gap by providing a format specifically optimized for how
+modern LLM tokenizers process text. The format minimizes punctuation overhead
+while adding explicit length markers and field headers that serve dual purposes:
+reducing token count and providing structural cues that help models generate
+valid output. Unlike binary formats (MessagePack, Protocol Buffers), TOON
+remains human-readable and can be directly embedded in prompts.
+
+# State of the Field
+
+Several approaches address LLM efficiency challenges. 
Prompt compression +techniques like LLMLingua [@jiang_llmlingua_2023] reduce natural language +portions of prompts through token pruning while preserving semantic meaning. +These methods complement TOON, which focuses specifically on structured data +encoding rather than natural language compression. + +Alternative serialization formats offer varying tradeoffs. YAML improves +readability over JSON but provides minimal token savings due to similar +punctuation patterns. Binary formats like MessagePack and Protocol Buffers +optimize for byte size and parsing speed but are unsuitable for direct LLM +prompt inclusion. + +Work on constrained decoding [@willard_efficient_2023] ensures LLMs produce +syntactically valid structured output. TOON's explicit markers complement these +techniques by providing clearer schema signals within the prompt itself. + +TOON fills a specific gap: a human-readable format optimized for tokenization +characteristics of modern language models, with structural features that +actively guide generation. + +# Key Features + +## Token Efficiency + +TOON consistently reduces token counts by 35-45% across typical JSON structures +when measured using the cl100k_base tokenizer [@tiktoken_2023]. Tabular data +with repeated field names achieves up to 60% reduction through the compact +field header notation. Independent benchmarking [@masciari_toon_2025] confirms +these efficiency gains across eight different large language models, finding +26-49% token reduction compared to JSON, XML, and YAML. + +## JSON Compatibility + +TOON provides deterministic, lossless round-trips to and from JSON. Every valid +JSON document can be encoded as TOON and decoded back to identical JSON. This +compatibility ensures TOON integrates seamlessly with existing JSON-based +tooling and workflows. 
+
+## Multi-Language Implementations
+
+Reference implementations pass a shared conformance test suite[^purl]:
+
+| Language | Package (PURL) | License |
+|----------|----------------|---------|
+| TypeScript/JS | `pkg:npm/%40toon-format/toon` | MIT |
+| Python | `pkg:pypi/toon-format` | MIT |
+| Go | `pkg:golang/github.com/toon-format/toon-go` | MIT |
+| Rust | `pkg:cargo/toon-format` | MIT |
+| Julia | `pkg:github/toon-format/ToonFormat.jl` | MIT |
+| .NET | `pkg:nuget/ToonFormat` | MIT |
+
+[^purl]: Packages are identified using Package URLs (PURLs), a standardized scheme for reliably referencing software packages across programming languages and registries. See <https://github.com/package-url/purl-spec> and <https://packageurl.org>. PURLs can be viewed using <https://deps.dev>
+
+## LLM Generation Guardrails
+
+TOON includes explicit structural markers that guide LLM output generation.
+Array length markers (`[N]`) indicate exactly how many items to generate,
+reducing premature termination or over-generation errors. Field headers for
+tabular data (`{field1,field2,...}`) provide clear schema signals without
+repeating field names for each record.
+
+# Research Impact
+
+Independent evaluation [@masciari_toon_2025] validates TOON's efficiency claims
+across diverse model architectures. The study found that TOON requires
+significantly fewer tokens than traditional formats while maintaining output
+correctness for larger models (70B+ parameters). The research also documented
+77-88% lower carbon emissions due to reduced computational requirements.
+
+All token measurements use reproducible methodology: the tiktoken library with
+cl100k_base encoding provides deterministic counts that researchers can
+independently verify. The complete specification, test suite, and benchmarking
+code are publicly available at <https://github.com/toon-format/toon>.
+
+# Acknowledgements
+
+We thank the open-source community for feedback on early versions of the TOON
+specification, and contributors who developed implementations in additional
+programming languages. 
+ +**AI Disclosure**: This paper was prepared with assistance from Claude Opus 4.6 (Anthropic). +The authors reviewed all content for accuracy and take full responsibility for +the work. + +# References