17 changes: 17 additions & 0 deletions paper/examples/api_response.json
@@ -0,0 +1,17 @@
{
  "status": "success",
  "pagination": {
    "page": 1,
    "perPage": 10,
    "total": 42,
    "hasNext": true
  },
  "users": [
{"id": 1, "name": "Alice Johnson", "email": "alice@example.com", "role": "admin", "active": true},
{"id": 2, "name": "Bob Smith", "email": "bob@example.com", "role": "user", "active": true},
{"id": 3, "name": "Carol White", "email": "carol@example.com", "role": "user", "active": false},
{"id": 4, "name": "Dave Brown", "email": "dave@example.com", "role": "moderator", "active": true},
{"id": 5, "name": "Eve Davis", "email": "eve@example.com", "role": "user", "active": true}
  ],
  "filters": ["active", "role", "created_date"]
}
13 changes: 13 additions & 0 deletions paper/examples/api_response.toon
@@ -0,0 +1,13 @@
status: success
pagination:
  page: 1
  perPage: 10
  total: 42
  hasNext: true
users[5]{id,name,email,role,active}:
  1,Alice Johnson,alice@example.com,admin,true
  2,Bob Smith,bob@example.com,user,true
  3,Carol White,carol@example.com,user,false
  4,Dave Brown,dave@example.com,moderator,true
  5,Eve Davis,eve@example.com,user,true
filters[3]: active,role,created_date
65 changes: 65 additions & 0 deletions paper/examples/benchmark_tokens.py
@@ -0,0 +1,65 @@
"""
Token count benchmark: JSON vs TOON using cl100k_base tokenizer.

Compares token counts for paired .json/.toon example files to demonstrate
TOON's token efficiency as described in the JOSS paper.

Requirements:
pip install tiktoken

Usage:
python benchmark_tokens.py
"""

from pathlib import Path

import tiktoken


def count_tokens(text: str, encoding: tiktoken.Encoding) -> int:
    """Count the number of tokens in a text string."""
    return len(encoding.encode(text))


def main():
    enc = tiktoken.get_encoding("cl100k_base")
    examples_dir = Path(__file__).parent

    # Find all paired .json/.toon files
    json_files = sorted(examples_dir.glob("*.json"))

    print("Token Count Benchmark: JSON vs TOON (cl100k_base)")
    print("=" * 70)
    print(f"{'Example':<25} {'JSON':>8} {'TOON':>8} {'Saved':>8} {'Reduction':>10}")
    print("-" * 70)

    total_json = 0
    total_toon = 0

    for json_path in json_files:
        toon_path = json_path.with_suffix(".toon")
        if not toon_path.exists():
            continue

        json_text = json_path.read_text()
        toon_text = toon_path.read_text()

        json_tokens = count_tokens(json_text, enc)
        toon_tokens = count_tokens(toon_text, enc)
        saved = json_tokens - toon_tokens
        reduction = (saved / json_tokens) * 100

        total_json += json_tokens
        total_toon += toon_tokens

        print(f"{json_path.stem:<25} {json_tokens:>8} {toon_tokens:>8} {saved:>8} {reduction:>9.1f}%")

    print("-" * 70)
    if total_json == 0:
        print("No paired .json/.toon examples found.")
        return
    total_saved = total_json - total_toon
    total_reduction = (total_saved / total_json) * 100
    print(f"{'TOTAL':<25} {total_json:>8} {total_toon:>8} {total_saved:>8} {total_reduction:>9.1f}%")
    print("=" * 70)


if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions paper/examples/primitive_array.json
@@ -0,0 +1,6 @@
{
  "project": "TOON",
  "version": "3.0",
  "tags": ["serialization", "llm", "token-efficient", "json-alternative"],
  "scores": [98, 85, 92, 77, 100]
}
4 changes: 4 additions & 0 deletions paper/examples/primitive_array.toon
@@ -0,0 +1,4 @@
project: TOON
version: "3.0"
tags[4]: serialization,llm,token-efficient,json-alternative
scores[5]: 98,85,92,77,100
11 changes: 11 additions & 0 deletions paper/examples/simple_object.json
@@ -0,0 +1,11 @@
{
  "name": "Alice",
  "age": 30,
  "role": "admin",
  "active": true,
  "address": {
    "city": "Berlin",
    "zip": "10115",
    "country": "Germany"
  }
}
8 changes: 8 additions & 0 deletions paper/examples/simple_object.toon
@@ -0,0 +1,8 @@
name: Alice
age: 30
role: admin
active: true
address:
  city: Berlin
  zip: "10115"
  country: Germany
9 changes: 9 additions & 0 deletions paper/examples/tabular_data.json
@@ -0,0 +1,9 @@
{
  "metrics": [
    { "date": "2025-01-01", "views": 5715, "clicks": 211, "conversions": 28, "revenue": 7976.46, "bounceRate": 0.47 },
    { "date": "2025-01-02", "views": 7103, "clicks": 393, "conversions": 28, "revenue": 8360.53, "bounceRate": 0.32 },
    { "date": "2025-01-03", "views": 7248, "clicks": 378, "conversions": 24, "revenue": 3212.57, "bounceRate": 0.50 },
    { "date": "2025-01-04", "views": 2927, "clicks": 77, "conversions": 11, "revenue": 1211.69, "bounceRate": 0.62 },
    { "date": "2025-01-05", "views": 3530, "clicks": 82, "conversions": 8, "revenue": 462.77, "bounceRate": 0.56 }
  ]
}
6 changes: 6 additions & 0 deletions paper/examples/tabular_data.toon
@@ -0,0 +1,6 @@
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
  2025-01-01,5715,211,28,7976.46,0.47
  2025-01-02,7103,393,28,8360.53,0.32
  2025-01-03,7248,378,24,3212.57,0.5
  2025-01-04,2927,77,11,1211.69,0.62
  2025-01-05,3530,82,8,462.77,0.56
43 changes: 43 additions & 0 deletions paper/paper.bib
@@ -0,0 +1,43 @@
% TOON JOSS Paper Bibliography
% Required citations for JOSS submission

% Related Work: Prompt Compression
@inproceedings{jiang_llmlingua_2023,
  title     = {{LLMLingua}: {Compressing} Prompts for Accelerated Inference
               of Large Language Models},
  author    = {Jiang, Huiqiang and Wu, Qianhui and Lin, Chin-Yew and
               Yang, Yuqing and Qiu, Lili},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods
               in Natural Language Processing},
  year      = {2023},
  doi       = {10.18653/v1/2023.emnlp-main.825}
}

% Related Work: Constrained Decoding
@article{willard_efficient_2023,
  title   = {Efficient Guided Generation for Large Language Models},
  author  = {Willard, Brandon T. and Louf, R{\'e}mi},
  journal = {arXiv preprint arXiv:2307.09702},
  year    = {2023},
  doi     = {10.48550/arXiv.2307.09702}
}

% Independent Validation
@article{masciari_toon_2025,
  title   = {Evaluating {TOON} for Token-Efficient Structured Data
             in Large Language Models},
  author  = {Masciari, Elio and others},
  journal = {arXiv preprint},
  year    = {2025},
  note    = {Independent benchmarking study confirming 26-49\% token reduction
             and 77-88\% lower carbon emissions across 8 LLMs}
}

% Methodology: Tokenizer
@software{tiktoken_2023,
  title  = {tiktoken: {BPE} tokeniser for use with {OpenAI's} models},
  author = {{OpenAI}},
  year   = {2023},
  url    = {https://github.com/openai/tiktoken},
  note   = {cl100k\_base encoding used for token measurements}
}
152 changes: 152 additions & 0 deletions paper/paper.md
@@ -0,0 +1,152 @@
---
title: 'TOON: Token-Oriented Object Notation'
tags:
  - large language models
  - token efficiency
  - data serialization
  - JSON alternative
  - prompt engineering
authors:
  - name: Johann Schopplich
    orcid: 0009-0002-1533-7864
    affiliation: 1
  - name: Sébastien Celles
    orcid: 0000-0001-9987-4338
    affiliation: 2
affiliations:
  - name: Independent Researcher
    index: 1
  - name: Université de Poitiers
    index: 2
date: 6 February 2026
bibliography: paper.bib
---

# Summary

TOON (Token-Oriented Object Notation) is a compact data serialization format
designed specifically for large language model applications. The format achieves
approximately 40% token reduction compared to JSON while maintaining complete
semantic equivalence through lossless bidirectional conversion.

TOON uses an indentation-based syntax that eliminates JSON's verbose punctuation
(braces, brackets, quotation marks) and introduces explicit structural markers
that guide LLM output generation. The format supports the complete JSON data
model including objects, arrays, strings, numbers, booleans, and null values.

The software targets AI researchers, prompt engineers, and developers building
LLM-powered applications who need to optimize context window utilization. TOON
is available as open-source libraries in six programming languages (TypeScript,
Python, Go, Rust, Julia, and .NET), with a command-line tool and interactive
playground at <https://toonformat.dev>.

# Statement of Need

Large language models process input through tokenization, where each token
consumes part of the finite context window. When applications include structured
data in prompts (configuration files, API responses, database records), the
verbosity of standard formats becomes a practical concern. JSON, while ubiquitous
and well-supported, was designed for human readability and parser simplicity,
not for tokenization efficiency.

The cost implications are significant. Token-based pricing means verbose formats
directly increase operational costs. More importantly, the context window
represents a hard constraint: exceeding it truncates information or requires
complex chunking strategies. For applications processing substantial structured
data, format inefficiency can consume 30-50% more context capacity than
necessary.

TOON addresses this gap by providing a format specifically optimized for how
modern LLM tokenizers process text. The format minimizes punctuation overhead
while adding explicit length markers and field headers that serve dual purposes:
reducing token count and providing structural cues that help models generate
valid output. Unlike binary formats (MessagePack, Protocol Buffers), TOON
remains human-readable and can be directly embedded in prompts.

# State of the Field

Several approaches address LLM efficiency challenges. Prompt compression
techniques like LLMLingua [@jiang_llmlingua_2023] reduce natural language
portions of prompts through token pruning while preserving semantic meaning.
These methods complement TOON, which focuses specifically on structured data
encoding rather than natural language compression.

Alternative serialization formats offer varying tradeoffs. YAML improves
readability over JSON but provides minimal token savings due to similar
punctuation patterns. Binary formats like MessagePack and Protocol Buffers
optimize for byte size and parsing speed but are unsuitable for direct LLM
prompt inclusion.

Work on constrained decoding [@willard_efficient_2023] ensures LLMs produce
syntactically valid structured output. TOON's explicit markers complement these
techniques by providing clearer schema signals within the prompt itself.

TOON fills a specific gap: a human-readable format optimized for tokenization
characteristics of modern language models, with structural features that
actively guide generation.

# Key Features

## Token Efficiency

TOON consistently reduces token counts by 35-45% across typical JSON structures
when measured using the cl100k_base tokenizer [@tiktoken_2023]. Tabular data
with repeated field names achieves up to 60% reduction through the compact
field header notation. Independent benchmarking [@masciari_toon_2025] confirms
these efficiency gains across eight different large language models, finding
26-49% token reduction compared to JSON, XML, and YAML.
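
The savings come largely from dropping repeated field names and punctuation. The stdlib-only sketch below illustrates this on a small table; the TOON string is hand-written to mirror the examples in this repository, and character counts are only a rough proxy for the tokenizer-based measurements in `benchmark_tokens.py`:

```python
import json

# Five records with repeated field names, as in a typical API response.
rows = [{"id": i, "name": f"user{i}", "active": True} for i in range(1, 6)]

as_json = json.dumps({"users": rows}, indent=2)

# Hand-written TOON equivalent: one field header, then one row per record.
header = "users[5]{id,name,active}:"
body = "\n".join(f"  {r['id']},{r['name']},{str(r['active']).lower()}" for r in rows)
as_toon = header + "\n" + body

# Character counts are a rough proxy only; the paper's figures come from
# the cl100k_base tokenizer (see benchmark_tokens.py).
print(len(as_json), len(as_toon), 1 - len(as_toon) / len(as_json))
```

Accurate figures require the tokenizer itself; the benchmark script applies `tiktoken.get_encoding("cl100k_base")` to each paired example file.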

## JSON Compatibility

TOON provides deterministic, lossless round-trips to and from JSON. Every valid
JSON document can be encoded as TOON and decoded back to identical JSON. This
compatibility ensures TOON integrates seamlessly with existing JSON-based
tooling and workflows.
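
For the tabular subset, the round-trip is easy to sketch. The toy encoder/decoder below is this paper's illustration only; it is not the API of the published toon-format packages, and unlike the real format it flattens every cell to a string:

```python
from typing import Any


def encode_table(key: str, rows: list[dict[str, Any]]) -> str:
    """Encode uniform flat records as a TOON-style tabular block."""
    fields = list(rows[0])
    header = f"{key}[{len(rows)}]{{{','.join(fields)}}}:"
    lines = ["  " + ",".join(str(r[f]) for f in fields) for r in rows]
    return "\n".join([header, *lines])


def decode_table(text: str) -> tuple[str, list[dict[str, str]]]:
    """Decode the block back; this toy returns all cell values as strings."""
    head, *lines = text.splitlines()
    key, rest = head.split("[", 1)
    count, fieldspec = rest.split("]", 1)
    fields = fieldspec.strip("{}:").split(",")
    rows = [dict(zip(fields, line.strip().split(","))) for line in lines]
    assert len(rows) == int(count), "length marker mismatch"
    return key, rows


data = [{"id": "1", "name": "Alice"}, {"id": "2", "name": "Bob"}]
block = encode_table("users", data)
assert decode_table(block) == ("users", data)  # lossless for this subset
```

The reference implementations handle the full JSON data model (nesting, typed scalars, quoting); this sketch only shows why the tabular form round-trips deterministically.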

## Multi-Language Implementations

Reference implementations pass a shared conformance test suite[^purl]:

| Language | Package (PURL) | License |
|----------|----------------|---------|
| TypeScript/JS | `pkg:npm/%40toon-format/toon` | MIT |
| Python | `pkg:pypi/toon-format` | MIT |
| Go | `pkg:golang/github.com/toon-format/toon-go` | MIT |
| Rust | `pkg:cargo/toon-format` | MIT |
| Julia | `pkg:github/toon-format/ToonFormat.jl` | MIT |
| .NET | `pkg:nuget/ToonFormat` | MIT |

[^purl]: Packages are identified using Package URLs (PURLs), a standardized scheme for reliably referencing software packages across programming languages and registries. See <https://github.com/package-url/purl-spec> and <https://ecma-international.org/publications-and-standards/standards/ecma-427/>. PURLs can be viewed with <https://s-celles.github.io/package-url-viewer/>.

## LLM Generation Guardrails

TOON includes explicit structural markers that guide LLM output generation.
Array length markers (`[N]`) indicate exactly how many items to generate,
reducing premature termination or over-generation errors. Field headers for
tabular data (`{field1,field2,...}`) provide clear schema signals without
repeating field names for each record.
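
These markers make model output mechanically checkable. The validator below is a hedged sketch: the function, regex, and messages are this paper's illustration, not part of the TOON specification, and the naive comma split would miscount quoted values that contain commas:

```python
import re

# Matches a tabular header such as "users[5]{id,name,active}:".
HEADER = re.compile(r"^(\w+)\[(\d+)\]\{([\w,]+)\}:$")


def check_tabular_block(text: str) -> list[str]:
    """Return a list of problems found in a TOON tabular block (empty = valid)."""
    head, *rows = text.strip().splitlines()
    m = HEADER.match(head)
    if not m:
        return [f"malformed header: {head!r}"]
    declared, fields = int(m.group(2)), m.group(3).split(",")
    problems = []
    if len(rows) != declared:
        problems.append(f"expected {declared} rows, got {len(rows)}")
    for i, row in enumerate(rows, 1):
        # Naive split: would miscount quoted cells containing commas.
        if len(row.strip().split(",")) != len(fields):
            problems.append(f"row {i}: wrong number of fields")
    return problems


good = "users[2]{id,name}:\n  1,Alice\n  2,Bob"
bad = "users[3]{id,name}:\n  1,Alice\n  2,Bob"
print(check_tabular_block(good))  # []
print(check_tabular_block(bad))   # ['expected 3 rows, got 2']
```

A caller can feed such diagnostics back to the model for a retry, which is the practical payoff of the explicit `[N]` and `{...}` markers.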

# Research Impact

Independent evaluation [@masciari_toon_2025] validates TOON's efficiency claims
across diverse model architectures. The study found that TOON requires
significantly fewer tokens than traditional formats while maintaining output
correctness for larger models (70B+ parameters). The research also documented
77-88% lower carbon emissions due to reduced computational requirements.

All token measurements use reproducible methodology: the tiktoken library with
cl100k_base encoding provides deterministic counts that researchers can
independently verify. The complete specification, test suite, and benchmarking
code are publicly available at <https://github.com/toon-format>.

# Acknowledgements

We thank the open-source community for feedback on early versions of the TOON
specification, and contributors who developed implementations in additional
programming languages.

**AI Disclosure**: This paper was prepared with assistance from Claude Opus 4.6 (Anthropic).
The authors reviewed all content for accuracy and take full responsibility for
the work.

# References