huggingface · xenova · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/README.md b/README.md
@@ -451,3 +451,5 @@ To find compatible models on the Hub, select the "transformers.js" library tag i
 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://huggingface.co/papers/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://huggingface.co/papers/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
 1. **[Youtu-LLM](https://huggingface.co/docs/transformers/model_doc/youtu)** (from the Tencent Youtu Team) released with the paper [Youtu-LLM: Unlocking the Native Agentic Potential for Lightweight Large Language Models](https://huggingface.co/papers/2512.24618) by Junru Lu, Jiarui Qin, Lingfeng Qiao, Yinghui Li, Xinyi Dai, Bo Ke, Jianfeng He, Ruizhi Qiao, Di Yin, Xing Sun, Yunsheng Wu, Yinsong Liu, Shuangyin Liu, Mingkong Tang, Haodong Lin, Jiayi Kuang, Fanxu Meng, Xiaojuan Tang, Yunjia Xi, Junjie Huang, Haotong Yang, Zhenyi Shen, Yangning Li, Qianwen Zhang, Yifei Yu, Siyu An, Junnan Dong, Qiufeng Wang, Jie Wang, Keyu Chen, Wei Wen, Taian Guo, Zhifeng Shen, Daohai Yu, Jiahao Li, Ke Li, Zongyi Li, Xiaoyu Tan.
+1. **Zaya** (from Zyphra) released with the paper [ZAYA1-8B Technical Report](https://huggingface.co/papers/2605.05365) by Robert Washbourne, Rishi Iyer, Tomas Figliolia, Henry Zheng, Ryan Lorig-Roach, Sungyeon Yang, Pritish Yuvraj, Quentin Anthony, Yury Tokpanov, Xiao Yang, Ganesh Nanduru, Stephen Ebert, Praneeth Medepalli, Skyler Szot, Srivatsan Rajagopal, Alex Ong, Bhavana Mehta, Beren Millidge.
+
diff --git a/packages/transformers/docs/snippets/5_supported-models.snippet b/packages/transformers/docs/snippets/5_supported-models.snippet
@@ -211,4 +211,5 @@
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://huggingface.co/papers/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://huggingface.co/papers/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://huggingface.co/papers/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-1. **[Youtu-LLM](https://huggingface.co/docs/transformers/model_doc/youtu)** (from the Tencent Youtu Team) released with the paper [Youtu-LLM: Unlocking the Native Agentic Potential for Lightweight Large Language Models](https://huggingface.co/papers/2512.24618) by Junru Lu, Jiarui Qin, Lingfeng Qiao, Yinghui Li, Xinyi Dai, Bo Ke, Jianfeng He, Ruizhi Qiao, Di Yin, Xing Sun, Yunsheng Wu, Yinsong Liu, Shuangyin Liu, Mingkong Tang, Haodong Lin, Jiayi Kuang, Fanxu Meng, Xiaojuan Tang, Yunjia Xi, Junjie Huang, Haotong Yang, Zhenyi Shen, Yangning Li, Qianwen Zhang, Yifei Yu, Siyu An, Junnan Dong, Qiufeng Wang, Jie Wang, Keyu Chen, Wei Wen, Taian Guo, Zhifeng Shen, Daohai Yu, Jiahao Li, Ke Li, Zongyi Li, Xiaoyu Tan.
+1. **[Youtu-LLM](https://huggingface.co/docs/transformers/model_doc/youtu)** (from the Tencent Youtu Team) released with the paper [Youtu-LLM: Unlocking the Native Agentic Potential for Lightweight Large Language Models](https://huggingface.co/papers/2512.24618) by Junru Lu, Jiarui Qin, Lingfeng Qiao, Yinghui Li, Xinyi Dai, Bo Ke, Jianfeng He, Ruizhi Qiao, Di Yin, Xing Sun, Yunsheng Wu, Yinsong Liu, Shuangyin Liu, Mingkong Tang, Haodong Lin, Jiayi Kuang, Fanxu Meng, Xiaojuan Tang, Yunjia Xi, Junjie Huang, Haotong Yang, Zhenyi Shen, Yangning Li, Qianwen Zhang, Yifei Yu, Siyu An, Junnan Dong, Qiufeng Wang, Jie Wang, Keyu Chen, Wei Wen, Taian Guo, Zhifeng Shen, Daohai Yu, Jiahao Li, Ke Li, Zongyi Li, Xiaoyu Tan.
+1. **Zaya** (from Zyphra) released with the paper [ZAYA1-8B Technical Report](https://huggingface.co/papers/2605.05365) by Robert Washbourne, Rishi Iyer, Tomas Figliolia, Henry Zheng, Ryan Lorig-Roach, Sungyeon Yang, Pritish Yuvraj, Quentin Anthony, Yury Tokpanov, Xiao Yang, Ganesh Nanduru, Stephen Ebert, Praneeth Medepalli, Skyler Szot, Srivatsan Rajagopal, Alex Ong, Bhavana Mehta, Beren Millidge.
diff --git a/packages/transformers/src/configs.js b/packages/transformers/src/configs.js
@@ -216,6 +216,13 @@ function getNormalizedConfig(config) {
             mapping['dim_kv'] = config.model_type === 'deepseek_v4' ? 'head_dim' : 'qk_head_dim';
             mapping['num_attention_heads'] = 'num_attention_heads';
             break;
+        case 'zaya':
+            mapping['num_heads'] = 'num_key_value_heads';
+            mapping['num_layers'] = 'num_hidden_layers';
+            mapping['hidden_size'] = 'hidden_size';
+            mapping['dim_kv'] = 'head_dim';
+            mapping['num_attention_heads'] = 'num_attention_heads';
+            break;
 
         // Encoder-decoder models
         case 't5':
@@ -420,6 +427,16 @@ export function getCacheNames(config, options) {
             }
         }
         return names;
+    } else if (config.model_type === 'zaya') {
+        const { num_hidden_layers, cca_time1 } = /** @type {any} */ (config);
+        const stride = cca_time1 ?? 1;
+        for (let i = 0; i < num_hidden_layers; i += stride) {
+            names.add(`${pkv_prefix}.${i}.key`);
+            names.add(`${pkv_prefix}.${i}.value`);
+            names.add(`${pkv_prefix}.${i}.conv_state`);
+            names.add(`${pkv_prefix}.${i}.shift_state`);
+        }
+        return names;
     } else if (['lfm2_vl', 'qwen3_5', 'qwen3_5_moe', 'voxtral_realtime'].includes(config.model_type)) {
         let subConfig;
         if (config.model_type === 'voxtral_realtime' && options?.session_name === 'audio_encoder') {

diff --git a/packages/transformers/src/models/models.js b/packages/transformers/src/models/models.js
@@ -195,5 +195,6 @@ export * from './xlm/modeling_xlm.js';
 export * from './xlm_roberta/modeling_xlm_roberta.js';
 export * from './yolos/modeling_yolos.js';
 export * from './youtu/modeling_youtu.js';
+export * from './zaya/modeling_zaya.js';
 
 export { PreTrainedModel } from './modeling_utils.js';
diff --git a/packages/transformers/src/models/registry.js b/packages/transformers/src/models/registry.js
@@ -181,6 +181,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
     ['modernbert-decoder', 'ModernBertDecoderModel'],
     ['hunyuan_v1_dense', 'HunYuanDenseV1Model'],
     ['youtu', 'YoutuModel'],
+    ['zaya', 'ZayaModel'],
 ]);
 
 export const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
@@ -329,6 +330,7 @@ export const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
     ['modernbert-decoder', 'ModernBertDecoderForCausalLM'],
     ['hunyuan_v1_dense', 'HunYuanDenseV1ForCausalLM'],
     ['youtu', 'YoutuForCausalLM'],
+    ['zaya', 'ZayaForCausalLM'],
 
     // Also image-text-to-text
     ['phi3_v', 'Phi3VForCausalLM'],

diff --git a/packages/transformers/src/models/zaya/modeling_zaya.js b/packages/transformers/src/models/zaya/modeling_zaya.js
@@ -0,0 +1,5 @@
+import { PreTrainedModel } from '../modeling_utils.js';
+
+export class ZayaPreTrainedModel extends PreTrainedModel {} // Common base for the Zaya classes below; declares no members of its own.
+export class ZayaModel extends ZayaPreTrainedModel {} // Name referenced by the 'zaya' entry in MODEL_MAPPING_NAMES_DECODER_ONLY (registry.js).
+export class ZayaForCausalLM extends ZayaPreTrainedModel {} // Name referenced by the 'zaya' entry in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES (registry.js).
diff --git a/packages/transformers/src/utils/hub.js b/packages/transformers/src/utils/hub.js
@@ -8,7 +8,14 @@ import { apis, env } from '../env.js';
 import { DefaultProgressCallback, dispatchCallback } from './core.js';
 import { FileResponse } from './hub/FileResponse.js';
 import { FileCache } from './cache/FileCache.js';
-import { handleError, isValidUrl, pathJoin, isValidHfModelId, makePretrainedOptionsKey, readResponse } from './hub/utils.js';
+import {
+    handleError,
+    isValidUrl,
+    pathJoin,
+    isValidHfModelId,
+    makePretrainedOptionsKey,
+    readResponse,
+} from './hub/utils.js';
 import { getCache, tryCache } from './cache.js';
 import { get_file_metadata } from './model_registry/get_file_metadata.js';
 import { logger } from './logger.js';

diff --git a/packages/transformers/tests/models/zaya/test_modeling_zaya.js b/packages/transformers/tests/models/zaya/test_modeling_zaya.js
@@ -0,0 +1,51 @@
+import { PreTrainedTokenizer, ZayaForCausalLM } from "../../../src/transformers.js";
+
+import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js";
+
+export default () => { // Registers the ZayaForCausalLM generation suite; describe/it/expect/beforeAll/afterAll are not imported here, so they are assumed to be test-runner globals.
+  describe("ZayaForCausalLM", () => {
+    const model_id = "onnx-internal-testing/tiny-random-ZayaForCausalLM";
+    /** @type {ZayaForCausalLM} */
+    let model;
+    /** @type {PreTrainedTokenizer} */
+    let tokenizer;
+    beforeAll(async () => { // Loads the model and tokenizer once; both are shared by every test in this suite.
+      model = await ZayaForCausalLM.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS);
+      tokenizer = await PreTrainedTokenizer.from_pretrained(model_id);
+      tokenizer.padding_side = "left"; // The batched expectation below starts the shorter prompt's row with 0n, presumably the pad id - confirm against the tokenizer config.
+    }, MAX_MODEL_LOAD_TIME);
+
+    it(
+      "batch_size=1",
+      async () => {
+        const inputs = tokenizer("hello");
+        const outputs = await model.generate({
+          ...inputs,
+          max_length: 10, // The expected row below holds exactly 10 token ids.
+        });
+        expect(outputs.tolist()).toEqual([[2n, 23391n, 106n, 155331n, 155331n, 155331n, 155331n, 155331n, 155331n, 155331n]]);
+      },
+      MAX_TEST_EXECUTION_TIME,
+    );
+
+    it(
+      "batch_size>1",
+      async () => {
+        const inputs = tokenizer(["hello", "hello world"], { padding: true }); // Two prompts passed with padding enabled.
+        const outputs = await model.generate({
+          ...inputs,
+          max_length: 10, // Each expected row below holds exactly 10 token ids.
+        });
+        expect(outputs.tolist()).toEqual([
+          [0n, 2n, 23391n, 106n, 155331n, 155331n, 155331n, 155331n, 155331n, 155331n],
+          [2n, 23391n, 1902n, 106n, 155331n, 155331n, 155331n, 155331n, 155331n, 155331n],
+        ]);
+      },
+      MAX_TEST_EXECUTION_TIME,
+    );
+
+    afterAll(async () => {
+      await model?.dispose(); // Optional chaining: `model` is still undefined if from_pretrained rejected in beforeAll.
+    }, MAX_MODEL_DISPOSE_TIME);
+  });
+};