diff --git a/examples/aquila/conf/train/7b.yaml b/examples/aquila/conf/train/7b.yaml index a534b9e135..8d9942cc81 100644 --- a/examples/aquila/conf/train/7b.yaml +++ b/examples/aquila/conf/train/7b.yaml @@ -57,6 +57,7 @@ data: data_path: ${data_path:??} split: 1 tokenizer: + legacy_tokenizer: true tokenizer_type: AquilaTokenizerFS vocab_file: ./examples/aquila/tokenizer/vocab.json merge_file: ./examples/aquila/tokenizer/merges.txt diff --git a/examples/deepseek_v3/conf/train/16b_a3b.yaml b/examples/deepseek_v3/conf/train/16b_a3b.yaml index 532d962117..7de4cd28de 100644 --- a/examples/deepseek_v3/conf/train/16b_a3b.yaml +++ b/examples/deepseek_v3/conf/train/16b_a3b.yaml @@ -111,6 +111,7 @@ data: split: 1 no_mmap_bin_files: true tokenizer: + legacy_tokenizer: true tokenizer_type: QwenTokenizerFS tokenizer_path: examples/aquila/qwentokenizer vocab_size: 151851 diff --git a/examples/llama2/conf/train/7b.yaml b/examples/llama2/conf/train/7b.yaml index bee99d65cf..3b35f3ee45 100644 --- a/examples/llama2/conf/train/7b.yaml +++ b/examples/llama2/conf/train/7b.yaml @@ -62,6 +62,7 @@ data: data_path: ${data_path:??} split: 1 tokenizer: + legacy_tokenizer: true tokenizer_type: Llama2Tokenizer tokenizer_model: examples/llama/tokenizer.model vocab_size: 32000 diff --git a/examples/llama3/conf/train/70b.yaml b/examples/llama3/conf/train/70b.yaml index 8e4a06be51..e076b7ecd3 100644 --- a/examples/llama3/conf/train/70b.yaml +++ b/examples/llama3/conf/train/70b.yaml @@ -67,6 +67,7 @@ data: data_path: ${data_path:??} split: 1 tokenizer: + legacy_tokenizer: true tokenizer_type: Llama3TokenizerFS tokenizer_path: ${tokenizer_path:??} vocab_size: 128256 diff --git a/examples/llava1_5/conf/train/7b.yaml b/examples/llava1_5/conf/train/7b.yaml index fc13c683f5..d38f1e5792 100644 --- a/examples/llava1_5/conf/train/7b.yaml +++ b/examples/llava1_5/conf/train/7b.yaml @@ -78,6 +78,7 @@ data: dataloader_type: external split: 100,0,0 tokenizer: + legacy_tokenizer: true tokenizer_type: Llama2Tokenizer tokenizer_model: ${tokenizer_model_path:??} vocab_size: 32000 diff --git a/examples/llava_onevision/conf/train/1_5b.yaml b/examples/llava_onevision/conf/train/1_5b.yaml index 7ea0d901b6..b5d7272b8c 100644 --- a/examples/llava_onevision/conf/train/1_5b.yaml +++ b/examples/llava_onevision/conf/train/1_5b.yaml @@ -88,6 +88,7 @@ data: dataloader_type: external split: 100,0,0 tokenizer: + legacy_tokenizer: true tokenizer_type: Qwen2TokenizerFS tokenizer_path: xxxx # vocab_size: 152064 # 7b diff --git a/examples/mixtral/conf/train/8x7b.yaml b/examples/mixtral/conf/train/8x7b.yaml index 9332c5e980..fc254e4e7f 100644 --- a/examples/mixtral/conf/train/8x7b.yaml +++ b/examples/mixtral/conf/train/8x7b.yaml @@ -68,6 +68,7 @@ data: data_path: split: 1 tokenizer: + legacy_tokenizer: true tokenizer_type: QwenTokenizerFS tokenizer_path: make_vocab_size_divisible_by: 64 diff --git a/examples/qwen2_5/conf/train/1_5b.yaml b/examples/qwen2_5/conf/train/1_5b.yaml index b24b53be33..f5bf7d684f 100644 --- a/examples/qwen2_5/conf/train/1_5b.yaml +++ b/examples/qwen2_5/conf/train/1_5b.yaml @@ -75,6 +75,7 @@ data: split: 1 apply_sft_dataset_separated_loss_mask_if_existed: true tokenizer: + legacy_tokenizer: true tokenizer_type: HFTokenizerFS tokenizer_path: ${HF_model_path:??} vocab_size: 151665 diff --git a/examples/qwen2_5_vl/conf/train/7b.yaml b/examples/qwen2_5_vl/conf/train/7b.yaml index 62b2e07206..28530255f7 100644 --- a/examples/qwen2_5_vl/conf/train/7b.yaml +++ b/examples/qwen2_5_vl/conf/train/7b.yaml @@ -100,6 +100,7 @@ data: dataloader_type: external split: 100,0,0 tokenizer: + legacy_tokenizer: true tokenizer_type: Qwen2VLTokenizer tokenizer_path: xxxx vocab_size: 152064 # 7b diff --git a/examples/qwen3/conf/train/32b.yaml b/examples/qwen3/conf/train/32b.yaml index 757d8f3164..4cfaa30e6f 100644 --- a/examples/qwen3/conf/train/32b.yaml +++ b/examples/qwen3/conf/train/32b.yaml @@ -82,6 +82,7 @@ data: split: 1 no_mmap_bin_files: true tokenizer: + legacy_tokenizer: true tokenizer_type: QwenTokenizerFS tokenizer_path: examples/aquila/qwentokenizer vocab_size: 151851 diff --git a/examples/qwq/conf/train/32b.yaml b/examples/qwq/conf/train/32b.yaml index 5d9d9903b6..6ed182119a 100644 --- a/examples/qwq/conf/train/32b.yaml +++ b/examples/qwq/conf/train/32b.yaml @@ -80,6 +80,7 @@ data: split: 1 no_mmap_bin_files: true tokenizer: + legacy_tokenizer: true tokenizer_type: Qwen2TokenizerFS tokenizer_path: /tokenizer_path vocab_size: 151851 diff --git a/examples/rwkv/conf/train/3b.yaml b/examples/rwkv/conf/train/3b.yaml index 701b253934..1d829a969c 100644 --- a/examples/rwkv/conf/train/3b.yaml +++ b/examples/rwkv/conf/train/3b.yaml @@ -45,4 +45,5 @@ data: data_path: ${data_path:??} split: "1" tokenizer: + legacy_tokenizer: true tokenizer_path: ${tokenizer_path:??} # The vocab file can be found at https://github.com/RWKV-Vibe/RWKV-LM-V7/tree/main/data/tokenizer