diff --git a/.ci/docker/requirements-dev.txt b/.ci/docker/requirements-dev.txt
index bd6112228..6d53b2f81 100644
--- a/.ci/docker/requirements-dev.txt
+++ b/.ci/docker/requirements-dev.txt
@@ -3,3 +3,4 @@ pytest==7.3.2
 pytest-cov
 pre-commit
 tomli-w >= 1.1.0
+transformers
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 11eae863f..c33bfe4d8 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -2,8 +2,6 @@ torchdata >= 0.8.0
 datasets >= 3.6.0
 tomli >= 1.1.0 ; python_version < "3.11"
 tensorboard
-tiktoken
-blobfile
 tabulate
 wandb
 fsspec
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d317f0bfe..d92f532bb 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -14,7 +14,7 @@ We actively welcome your pull requests.
 2. If you've added code that should be tested, add tests.
 3. If you've changed APIs, update the documentation.
 4. Ensure the test suite passes.
-5. Make sure your code lints (`pre-commit run --all-files`).
+5. Make sure your code lints (`pre-commit run --files $(git diff --name-only HEAD~1)`).
 6. If you haven't already, complete the Contributor License Agreement ("CLA").

 ### Contributor License Agreement ("CLA")
diff --git a/pyproject.toml b/pyproject.toml
index fc810b981..f153465d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,8 +17,7 @@ dependencies = [
     "datasets>=2.21.0",

     # Tokenization
-    "blobfile",
-    "tiktoken",
+    "tokenizers",

     # Miscellaneous
     "tomli>=1.1.0",
diff --git a/scripts/download_tokenizer.py b/scripts/download_tokenizer.py
index a28dc4992..664bd369b 100644
--- a/scripts/download_tokenizer.py
+++ b/scripts/download_tokenizer.py
@@ -108,7 +108,7 @@ def is_tokenizer_file(filename: str) -> bool:
                 print(f"Successfully downloaded {filename} to {file_path}")
                 downloaded_files.append(filename)
             except HTTPError as e:
-                if e.response.status_code == 404:
+                if e.response and e.response.status_code == 404:
                     print(f"File {filename} not found, skipping...")
                     continue
                 else:
@@ -122,7 +122,7 @@ def is_tokenizer_file(filename: str) -> bool:
             print(f"Warning: No tokenizer files could be downloaded from {repo_id}")

     except HTTPError as e:
-        if e.response.status_code == 401:
+        if e.response and e.response.status_code == 401:
             print(
                 "You need to pass a valid `--hf_token=...` to download private checkpoints."
) diff --git a/scripts/generate/test_generate.py b/scripts/generate/test_generate.py index 157d000be..ef31c1850 100644 --- a/scripts/generate/test_generate.py +++ b/scripts/generate/test_generate.py @@ -165,7 +165,7 @@ def test_generate( input_ids = ( ( torch.tensor( - tokenizer.encode(prompt, bos=True, eos=False), dtype=torch.long + tokenizer.encode(prompt, add_bos=True, add_eos=False), dtype=torch.long ) .view(1, -1) .repeat(batch_size, 1) diff --git a/tests/assets/test_tiktoken.model b/tests/assets/test_tiktoken.model deleted file mode 100644 index 4bfad6254..000000000 --- a/tests/assets/test_tiktoken.model +++ /dev/null @@ -1,2000 +0,0 @@ -AA== 0 -AQ== 1 -Ag== 2 -Aw== 3 -BA== 4 -BQ== 5 -Bg== 6 -Bw== 7 -CA== 8 -CQ== 9 -Cg== 10 -Cw== 11 -DA== 12 -DQ== 13 -Dg== 14 -Dw== 15 -EA== 16 -EQ== 17 -Eg== 18 -Ew== 19 -FA== 20 -FQ== 21 -Fg== 22 -Fw== 23 -GA== 24 -GQ== 25 -Gg== 26 -Gw== 27 -HA== 28 -HQ== 29 -Hg== 30 -Hw== 31 -IA== 32 -IQ== 33 -Ig== 34 -Iw== 35 -JA== 36 -JQ== 37 -Jg== 38 -Jw== 39 -KA== 40 -KQ== 41 -Kg== 42 -Kw== 43 -LA== 44 -LQ== 45 -Lg== 46 -Lw== 47 -MA== 48 -MQ== 49 -Mg== 50 -Mw== 51 -NA== 52 -NQ== 53 -Ng== 54 -Nw== 55 -OA== 56 -OQ== 57 -Og== 58 -Ow== 59 -PA== 60 -PQ== 61 -Pg== 62 -Pw== 63 -QA== 64 -QQ== 65 -Qg== 66 -Qw== 67 -RA== 68 -RQ== 69 -Rg== 70 -Rw== 71 -SA== 72 -SQ== 73 -Sg== 74 -Sw== 75 -TA== 76 -TQ== 77 -Tg== 78 -Tw== 79 -UA== 80 -UQ== 81 -Ug== 82 -Uw== 83 -VA== 84 -VQ== 85 -Vg== 86 -Vw== 87 -WA== 88 -WQ== 89 -Wg== 90 -Ww== 91 -XA== 92 -XQ== 93 -Xg== 94 -Xw== 95 -YA== 96 -YQ== 97 -Yg== 98 -Yw== 99 -ZA== 100 -ZQ== 101 -Zg== 102 -Zw== 103 -aA== 104 -aQ== 105 -ag== 106 -aw== 107 -bA== 108 -bQ== 109 -bg== 110 -bw== 111 -cA== 112 -cQ== 113 -cg== 114 -cw== 115 -dA== 116 -dQ== 117 -dg== 118 -dw== 119 -eA== 120 -eQ== 121 -eg== 122 -ew== 123 -fA== 124 -fQ== 125 -fg== 126 -fw== 127 -gA== 128 -gQ== 129 -gg== 130 -gw== 131 -hA== 132 -hQ== 133 -hg== 134 -hw== 135 -iA== 136 -iQ== 137 -ig== 138 -iw== 139 -jA== 140 -jQ== 141 -jg== 142 -jw== 143 -kA== 144 -kQ== 145 -kg== 146 -kw== 147 -lA== 148 -lQ== 149 -lg== 150 -lw== 151 -mA== 152 -mQ== 153 -mg== 154 -mw== 155 -nA== 156 -nQ== 157 -ng== 158 -nw== 159 -oA== 160 -oQ== 161 -og== 162 -ow== 163 -pA== 164 -pQ== 165 -pg== 166 -pw== 167 -qA== 168 -qQ== 169 -qg== 170 -qw== 171 -rA== 172 -rQ== 173 -rg== 174 -rw== 175 -sA== 176 -sQ== 177 -sg== 178 -sw== 179 -tA== 180 -tQ== 181 -tg== 182 -tw== 183 -uA== 184 -uQ== 185 -ug== 186 -uw== 187 -vA== 188 -vQ== 189 -vg== 190 -vw== 191 -wA== 192 -wQ== 193 -wg== 194 -ww== 195 -xA== 196 -xQ== 197 -xg== 198 -xw== 199 -yA== 200 -yQ== 201 -yg== 202 -yw== 203 -zA== 204 -zQ== 205 -zg== 206 -zw== 207 -0A== 208 -0Q== 209 -0g== 210 -0w== 211 -1A== 212 -1Q== 213 -1g== 214 -1w== 215 -2A== 216 -2Q== 217 -2g== 218 -2w== 219 -3A== 220 -3Q== 221 -3g== 222 -3w== 223 -4A== 224 -4Q== 225 -4g== 226 -4w== 227 -5A== 228 -5Q== 229 -5g== 230 -5w== 231 -6A== 232 -6Q== 233 -6g== 234 -6w== 235 -7A== 236 -7Q== 237 -7g== 238 -7w== 239 -8A== 240 -8Q== 241 -8g== 242 -8w== 243 -9A== 244 -9Q== 245 -9g== 246 -9w== 247 -+A== 248 -+Q== 249 -+g== 250 -+w== 251 -/A== 252 -/Q== 253 -/g== 254 -/w== 255 -IHQ= 256 -aGU= 257 -IGE= 258 -aW4= 259 -IHM= 260 -IHc= 261 -IHRoZQ== 262 -IG8= 263 -cmU= 264 -IGI= 265 -b3U= 266 -ZWQ= 267 -IG0= 268 -bmQ= 269 -IEk= 270 -aGE= 271 -aXQ= 272 -ZXI= 273 -aW5n 274 -IGY= 275 -aXM= 276 -IHRv 277 -ZW4= 278 -b24= 279 -b3I= 280 -YXM= 281 -IGM= 282 -IG9m 283 -IGFuZA== 284 -IGQ= 285 -bGw= 286 -YXQ= 287 -YW4= 288 -YXI= 289 -IHA= 290 -IG4= 291 -IGlu 292 -bGU= 293 -b20= 294 -b3Q= 295 -IGJl 296 -IGg= 297 -dXQ= 298 -b3c= 299 -ZXM= 
300 -aGF0 301 -IGc= 302 -IGhl 303 -IGhh 304 -IGw= 305 -IHdhcw== 306 -bGQ= 307 -Z2g= 308 -aWQ= 309 -Y2g= 310 -IHRo 311 -IGl0 312 -YXk= 313 -IG9u 314 -Y2U= 315 -c2U= 316 -ZW50 317 -IHN0 318 -bHk= 319 -dmU= 320 -ZXQ= 321 -c3Q= 322 -IFQ= 323 -IGU= 324 -IHk= 325 -Z2h0 326 -aXI= 327 -IG1l 328 -b28= 329 -YWw= 330 -aXRo 331 -IHJl 332 -aW0= 333 -IHRoYXQ= 334 -IGFz 335 -b3VsZA== 336 -cm8= 337 -YWQ= 338 -aW9u 339 -Lgo= 340 -aGVy 341 -IG15 342 -Y3Q= 343 -IG5vdA== 344 -IHdpdGg= 345 -IGZvcg== 346 -IHU= 347 -a2U= 348 -IHlvdQ== 349 -IFM= 350 -IGlz 351 -aWdodA== 352 -Igo= 353 -YW0= 354 -aWM= 355 -dXI= 356 -IGF0 357 -Li4= 358 -YWM= 359 -dGVy 360 -IHdo 361 -IGFu 362 -IHdl 363 -IFRoZQ== 364 -aWY= 365 -IG9y 366 -IGJ1dA== 367 -dmVy 368 -ICI= 369 -IHI= 370 -b3V0 371 -b21l 372 -IGhhZA== 373 -cHA= 374 -cXU= 375 -IHN1 376 -IHRoaXM= 377 -cmVk 378 -YXJk 379 -IHNv 380 -ZWxs 381 -IHdvdWxk 382 -IGhpcw== 383 -IHNo 384 -aW5l 385 -cmE= 386 -IHNl 387 -IGJ5 388 -LiIK 389 -IFA= 390 -aGVu 391 -IEE= 392 -IGhhdmU= 393 -IGZy 394 -IHNh 395 -IEg= 396 -IG9uZQ== 397 -ZW0= 398 -a2Vk 399 -aXJ0 400 -ZWN0 401 -IGhpbQ== 402 -IGxp 403 -IGFi 404 -YXRpb24= 405 -aGluZw== 406 -dGhl 407 -IFI= 408 -IGxl 409 -c3M= 410 -IFc= 411 -Y3U= 412 -aWxs 413 -J3Q= 414 -YXJ0 415 -YWxs 416 -LAo= 417 -b3du 418 -b3Jl 419 -IGFsbA== 420 -IGs= 421 -IGdv 422 -aGlydA== 423 -YW5k 424 -IG91dA== 425 -YW1l 426 -YWlu 427 -IGlm 428 -IG5v 429 -IGRv 430 -IHRoZXk= 431 -b29s 432 -dW4= 433 -dG8= 434 -IHVw 435 -IFJlZA== 436 -IG5l 437 -IEs= 438 -IGZyb20= 439 -IFNoaXJ0 440 -IHdvcg== 441 -b25n 442 -IHRoZXJl 443 -IHNhaWQ= 444 -cmk= 445 -YW50 446 -IEI= 447 -IGFueQ== 448 -dWQ= 449 -aW5k 450 -IHdoaQ== 451 -YWI= 452 -b3VuZA== 453 -IGFib3V0 454 -IHRoZW0= 455 -Y3Vw 456 -YWs= 457 -IGRl 458 -IHRl 459 -IE0= 460 -YWtl 461 -Y3VwaW5l 462 -aWc= 463 -IHdlcmU= 464 -b3JjdXBpbmU= 465 -aWw= 466 -Y2hvb2w= 467 -IHJv 468 -b29k 469 -IGFyZQ== 470 -aXZl 471 -IGxpa2U= 472 -eW8= 473 -IGhvdQ== 474 -J3M= 475 -b25l 476 -dXM= 477 -ZWw= 478 -dWw= 479 -YWNr 480 -b3A= 481 -LCI= 482 -dGg= 483 -YWNoZXI= 484 -dW0= 485 -YW5n 486 -IGZh 487 -YWc= 488 -IHNjaG9vbA== 489 -IGo= 490 -dGU= 491 -b2s= 492 -ZXNz 493 -dXN0 494 -ZXJz 495 -Li4uLg== 496 -IEM= 497 -dGhlcg== 498 -aGFu 499 -IHdoZW4= 500 -IHNw 501 -IG1hbg== 502 -IGNhbg== 503 -b3VnaA== 504 -IHdobw== 505 -IGdldA== 506 -IGRpZA== 507 -IHBv 508 -Y2k= 509 -IGFs 510 -aXN0 511 -IGNvbQ== 512 -bGY= 513 -YXU= 514 -IFBvcmN1cGluZQ== 515 -IHdoaWNo 516 -dmVu 517 -IGFm 518 -d24= 519 -YXNz 520 -YmVy 521 -IGV4 522 -b3Vz 523 -ZXN0 524 -bG8= 525 -IHRy 526 -ZWxsb3c= 527 -IHNheQ== 528 -b3VnaHQ= 529 -IHJvb20= 530 -IHNvbWU= 531 -LS0= 532 -IE8= 533 -YXRl 534 -IHY= 535 -aGVk 536 -YXA= 537 -IHR3 538 -IGJlYw== 539 -cmVl 540 -amVjdA== 541 -a3M= 542 -IGNvbg== 543 -IGJlZW4= 544 -ZW50cw== 545 -aWRl 546 -IGNvdWxk 547 -IEc= 548 -ZXA= 549 -IHBybw== 550 -bnQ= 551 -IGhvdXNl 552 -IGFn 553 -IElm 554 -IGtu 555 -IGZlbGxvdw== 556 -IHdoYXQ= 557 -d2F5 558 -aXNo 559 -IGFt 560 -aXRl 561 -bmRlcg== 562 -aW1l 563 -IHBy 564 -IHRlYWNoZXI= 565 -YXJl 566 -IGJv 567 -IHNoZQ== 568 -IE4= 569 -aWNl 570 -YXN0 571 -dXJl 572 -aWU= 573 -IHN1Y2g= 574 -dXRlbg== 575 -dXRlbmJlcg== 576 -dXRlbmJlcmc= 577 -IHF1 578 -bG93bg== 579 -IHdy 580 -cHQ= 581 -IEhl 582 -IHN0dWQ= 583 -aGVyZQ== 584 -IG1vcmU= 585 -cnk= 586 -dHRlcg== 587 -IFk= 588 -IG1heQ== 589 -aXR5 590 -IGxvbw== 591 -IG90aGVy 592 -aGlz 593 -IFBybw== 594 -IHdpbGw= 595 -IEl0 596 -b3J0 597 -IHNob3VsZA== 598 -dmVyeQ== 599 -d2U= 600 -IHBs 601 -YXNo 602 -LiI= 603 -IGFwcA== 604 -IGRheQ== 605 -dXJu 606 -cG8= 607 -IGhlcg== 608 -ICA= 609 -bm90 610 -Y2s= 611 -IHVu 612 -aGk= 613 -dmluZw== 614 
-IG9sZA== 615 -IHRpbWU= 616 -IlQ= 617 -IHdheQ== 618 -YWJsZQ== 619 -PyIK 620 -IENsb3du 621 -IG9ubHk= 622 -dWI= 623 -YWNo 624 -IG9mZg== 625 -IHRoYW4= 626 -YWxseQ== 627 -IHRoZWly 628 -YmU= 629 -a2luZw== 630 -b3RoZXI= 631 -YXJ5 632 -YW5z 633 -YXRlZA== 634 -c2VsZg== 635 -IGdvaW5n 636 -dWNo 637 -b2xs 638 -IGJhY2s= 639 -aXlv 640 -LXQ= 641 -YW5jZQ== 642 -YWRl 643 -IFByb2plY3Q= 644 -c3A= 645 -IHR3bw== 646 -IHRob3VnaHQ= 647 -c28= 648 -IHJpZ2h0 649 -IGhlYWQ= 650 -dmVk 651 -IEQ= 652 -IHByZQ== 653 -IHNlZQ== 654 -IHVz 655 -IHN0dWRlbnRz 656 -Y2lw 657 -IGRvbg== 658 -IG5pZ2h0 659 -aW5jaXA= 660 -IEtpeW8= 661 -cGw= 662 -YXJlZA== 663 -IEd1dGVuYmVyZw== 664 -IGNv 665 -IGhvdw== 666 -b21ldA== 667 -ZmY= 668 -Ikk= 669 -LC0t 670 -IGFza2Vk 671 -aW5jaXBhbA== 672 -ZXZlcg== 673 -IGFj 674 -IEY= 675 -IG1ha2U= 676 -aXR0 677 -IG1pZ2h0 678 -Z2U= 679 -bGVk 680 -IGFmdGVy 681 -aWdu 682 -IGdy 683 -IG1hZGU= 684 -ZGQ= 685 -IGtub3c= 686 -IGNvbWU= 687 -IGJy 688 -dGhpbmc= 689 -IEJ1dA== 690 -IG1hdA== 691 -IE9u 692 -b3J5 693 -Y2w= 694 -IEU= 695 -Ymxl 696 -b2c= 697 -IHlvdXI= 698 -dWxs 699 -IHdvcms= 700 -ZWFy 701 -IHRocmVl 702 -aWVk 703 -YnV0 704 -VGhl 705 -cGU= 706 -YWNl 707 -IHN0YXJ0 708 -aWNr 709 -IG92ZXI= 710 -b3Vy 711 -IG11Y2g= 712 -IHdhbnQ= 713 -aW1w 714 -IHBhcnQ= 715 -aG8= 716 -aW5r 717 -ZW5jZQ== 718 -IGRvd24= 719 -IGV2ZW4= 720 -IHByaW5jaXBhbA== 721 -bGluZw== 722 -b3VudA== 723 -YXVzZQ== 724 -IGNs 725 -IGJs 726 -LXRt 727 -b21ldGhpbmc= 728 -IGludG8= 729 -b3Jt 730 -b2t5bw== 731 -IGRpcw== 732 -IGZl 733 -IGZhY2U= 734 -Li4uLi4u 735 -cmVzcw== 736 -bWVudA== 737 -aXJl 738 -IGFy 739 -dHk= 740 -IG1v 741 -cmVhdA== 742 -IGZpcg== 743 -cGVy 744 -IG91cg== 745 -Y28= 746 -IHRoZW4= 747 -IHRvbGQ= 748 -aW5ncw== 749 -IHRha2U= 750 -IGJlZw== 751 -bmVy 752 -aXRpb24= 753 -b3Nl 754 -IG93bg== 755 -IGFnYWlu 756 -IHNlZW0= 757 -aXNl 758 -IHdhdA== 759 -Ilc= 760 -IGZhcg== 761 -YWtpbmc= 762 -Zm9yZQ== 763 -YWR5 764 -LXM= 765 -bGVzcw== 766 -IHJldA== 767 -IHNoYQ== 768 -IGNhbWU= 769 -Z2Vy 770 -IGdvb2Q= 771 -YXRoZXI= 772 -YXJr 773 -cm93 774 -IGtl 775 -J20= 776 -IGhhcw== 777 -YXRo 778 -cHBlZA== 779 -IHdlbnQ= 780 -IHRlbGw= 781 -cXVhc2g= 782 -IGVu 783 -IGZpcnN0 784 -IGhvdA== 785 -aXo= 786 -IGF3YXk= 787 -IHNvbWV0aGluZw== 788 -IHJlbQ== 789 -IHRvd24= 790 -IHNt 791 -IFRoaXM= 792 -IGJldHRlcg== 793 -IFRoZW4= 794 -d2Fz 795 -b2Y= 796 -YmFyZA== 797 -IEw= 798 -bGk= 799 -ZmU= 800 -IFRva3lv 801 -IGxvbmc= 802 -aWx5 803 -IHN1cmU= 804 -IGxvb2tlZA== 805 -dWJiYXJk 806 -Y3Rpb24= 807 -b3Jk 808 -IG1hbnk= 809 -aW91cw== 810 -IHRvbw== 811 -IGhlcmU= 812 -b3M= 813 -IHVuZGVy 814 -YXNl 815 -bmc= 816 -cGVk 817 -b2Q= 818 -bWU= 819 -IGp1c3Q= 820 -IG5vdw== 821 -aW5jZQ== 822 -IGhlYXJk 823 -IGtpbmQ= 824 -IFRoZXk= 825 -IGJlZm9yZQ== 826 -aHk= 827 -IElu 828 -IGVudA== 829 -IGJvYXJk 830 -ISI= 831 -d2FyZA== 832 -IGJlaW5n 833 -IHdlbGw= 834 -ZXJt 835 -cmllZA== 836 -IHdyb25n 837 -YWlk 838 -eHQ= 839 -IHJldHVybg== 840 -aXRlZA== 841 -IHllbg== 842 -IG1hdHRlcg== 843 -IGNhbGw= 844 -IHRhbA== 845 -IFlvdQ== 846 -Y2Vk 847 -aXNlZA== 848 -IGNoYQ== 849 -b25z 850 -IHNhbWU= 851 -IG9uY2U= 852 -ZGF5 853 -ZnQ= 854 -IHN3 855 -IGJlY2F1c2U= 856 -IHRoaW5r 857 -IHdoZXJl 858 -IE5v 859 -IEh1YmJhcmQ= 860 -IFNxdWFzaA== 861 -IGNvcA== 862 -d2l0aA== 863 -ZXJlZA== 864 -b2xsb3c= 865 -IHBsYWNl 866 -aWRk 867 -Y2Vzcw== 868 -IHNob3c= 869 -aXNoYQ== 870 -IHJh 871 -IGxldHRlcg== 872 -bmU= 873 -dmVz 874 -YXRpbmc= 875 -cmFuZw== 876 -IGFmZg== 877 -IGhhbmQ= 878 -IHNj 879 -IHBlcnM= 880 -aW50 881 -cHI= 882 -c2lkZQ== 883 -ZnRlcg== 884 -IHNheWluZw== 885 -IGxhdQ== 886 -dGhhdA== 887 -IHdpdGhvdXQ= 888 -cm9u 889 -YWly 890 -bGVjdA== 891 -IFdoYXQ= 892 -ZWx0 893 
-IHdoaWxl 894 -b2dh 895 -YXBlcg== 896 -IHBl 897 -b3k= 898 -IHNhdA== 899 -aWVz 900 -IGFkZA== 901 -IGRheXM= 902 -IHNwZQ== 903 -IGhv 904 -IGFucw== 905 -IGhhcg== 906 -IFdoZW4= 907 -IGFueXRoaW5n 908 -cGVu 909 -XQo= 910 -dGFpbg== 911 -IG11c3Q= 912 -IG5ldw== 913 -bGlj 914 -IHZv 915 -aGlsZQ== 916 -Z2V0 917 -IEFz 918 -IHZlcnk= 919 -J3Jl 920 -IGV2ZXJ5 921 -YXZl 922 -PyI= 923 -YWRnZXI= 924 -IEtvZ2E= 925 -IE1y 926 -cm91Z2g= 927 -dWx0 928 -IGZvbGxvdw== 929 -dGluZw== 930 -aWZl 931 -aWRkbGU= 932 -ZnVs 933 -YW5r 934 -IFNv 935 -IHNlZW1lZA== 936 -IEFuZA== 937 -aXg= 938 -IHNldA== 939 -IGNhcmU= 940 -IHJlcw== 941 -IG5ldmVy 942 -IGZvdW5k 943 -IGxv 944 -Y2lk 945 -aW5lZA== 946 -IGNsYXNz 947 -IG15c2VsZg== 948 -YXc= 949 -IHdvbQ== 950 -YXRpb25z 951 -IGxlZnQ= 952 -IFdl 953 -IHRlYWNoZXJz 954 -Ilk= 955 -bmE= 956 -b250 957 -IGRlcw== 958 -IHRob3Nl 959 -aXJlZA== 960 -IHNlbg== 961 -eWluZw== 962 -IHRoZXNl 963 -YXo= 964 -IFRoZXJl 965 -Y2VwdA== 966 -IGRhbmc= 967 -IFU= 968 -Ikg= 969 -Ym9k 970 -Ym9keQ== 971 -IGhhdmluZw== 972 -YWxhcnk= 973 -IHdhdGNo 974 -IGdpdmU= 975 -YWdl 976 -IGl0cw== 977 -IGFwcGU= 978 -dWU= 979 -IGNvdW50 980 -IGhhcmQ= 981 -IGJlbA== 982 -b3R0 983 -IGRpc3Q= 984 -IlM= 985 -IE1hZA== 986 -LW4= 987 -cmlidXQ= 988 -Z2Vk 989 -IGF0dA== 990 -ZmVyZQ== 991 -aXRoZXI= 992 -IHVwb24= 993 -IHRlbQ== 994 -IHBlcnNvbg== 995 -bmluZw== 996 -IGNoZQ== 997 -YXJseQ== 998 -b25leQ== 999 -IHNvb24= 1000 -ZW1lbnQ= 1001 -ICg= 1002 -IHRyYW5z 1003 -IGV4cA== 1004 -IHNlcg== 1005 -IHJlZw== 1006 -YXNvbg== 1007 -IHNhdw== 1008 -IG5leHQ= 1009 -b290 1010 -IGhhbGY= 1011 -IHRvb2s= 1012 -IGJhZA== 1013 -IGhvdXI= 1014 -IHNhbGFyeQ== 1015 -IGJlZ2Fu 1016 -cmlnaHQ= 1017 -b25uYQ== 1018 -LXNhbg== 1019 -IHdvcmtz 1020 -IEo= 1021 -Zm9ybQ== 1022 -aWNhbA== 1023 -IHRyYQ== 1024 -bWFu 1025 -IG5vdGhpbmc= 1026 -IHN0aWxs 1027 -ZWFycw== 1028 -IHN1cHA= 1029 -IHR1cm4= 1030 -IGZlbHQ= 1031 -IHdvbWFu 1032 -IHN0YXJ0ZWQ= 1033 -b3VibGU= 1034 -dXJh 1035 -aXNoaW5n 1036 -Ogo= 1037 -bGVjdHJvbg== 1038 -bGVjdHJvbmlj 1039 -b29r 1040 -IGNvcHk= 1041 -IGZ1bGw= 1042 -Y29uZA== 1043 -bWF0 1044 -IG1pZGRsZQ== 1045 -IGxvb2s= 1046 -IGNvbW0= 1047 -d2VyZWQ= 1048 -IGJlY2FtZQ== 1049 -IGZlbGxvd3M= 1050 -d291bGQ= 1051 -IGdvdA== 1052 -IGds 1053 -IGd1 1054 -IGtlZXA= 1055 -IGdl 1056 -IE1hZG9ubmE= 1057 -aXRlcg== 1058 -aXNoZWQ= 1059 -IHVuZGVyc3Q= 1060 -IHN0cmE= 1061 -c2lk 1062 -IGNvdW50cnk= 1063 -b3BsZQ== 1064 -IHByb3Y= 1065 -IHB1dA== 1066 -bm8= 1067 -J2xs 1068 -IHNsZQ== 1069 -cmFuZ2U= 1070 -IFNoZQ== 1071 -cG9z 1072 -IG1pbmQ= 1073 -IHBhc3M= 1074 -IHRocm91Z2g= 1075 -IHF1aXRl 1076 -IGluZA== 1077 -IGJvYXJkaW5n 1078 -dGVhY2hlcg== 1079 -cGxl 1080 -UG9yY3VwaW5l 1081 -IHBsZQ== 1082 -IGdlaXNoYQ== 1083 -ICAgIA== 1084 -b3N0 1085 -ZW5zZQ== 1086 -Tm8= 1087 -aWJsZQ== 1088 -IHJlYWQ= 1089 -IHJlZA== 1090 -ZW50aW9u 1091 -ZW5lZA== 1092 -ISIK 1093 -IHJlZg== 1094 -IGFk 1095 -IGZs 1096 -IHN0YXk= 1097 -dXA= 1098 -IHJvdW5k 1099 -IGNsZQ== 1100 -IG9wZW4= 1101 -IG9i 1102 -dGVuZA== 1103 -IGZpbmQ= 1104 -IHBlcg== 1105 -IGNhbGxlZA== 1106 -IHN1cg== 1107 -cmV3 1108 -IHBhcGVy 1109 -IEJhZGdlcg== 1110 -IG1lZXQ= 1111 -aXNz 1112 -IlRoYXQ= 1113 -ZXJtcw== 1114 -VEU= 1115 -aXR0ZW4= 1116 -YWJseQ== 1117 -bmVzcw== 1118 -IGNhbm5vdA== 1119 -IHNpbXA= 1120 -Y29u 1121 -IHJlYXNvbg== 1122 -eW91 1123 -IGhvbWU= 1124 -Ynk= 1125 -IGZpZ2h0 1126 -aXR0bGU= 1127 -IHRoaW5ncw== 1128 -IGVhcw== 1129 -IGltcA== 1130 -cmVzc2Vk 1131 -IG1lYW4= 1132 -IGFwcGVhcmVk 1133 -IG5hdA== 1134 -IGhlbA== 1135 -cmV0 1136 -YWtlbg== 1137 -IHN0cmFpZ2h0 1138 -IGFmZmFpcg== 1139 -aXRpbmc= 1140 -IGVk 1141 -IHNpbmNl 1142 -bG9n 1143 -IHBheQ== 1144 -IGZyb250 1145 -bXk= 1146 -IHZvaWNl 1147 -cmVhZHk= 
1148 -IGZvb2w= 1149 -b3VuZGF0aW9u 1150 -IGVsZWN0cm9uaWM= 1151 -IHRlcm1z 1152 -IG1hcg== 1153 -YXBhbg== 1154 -YW55 1155 -IHJlc3A= 1156 -IGVuZA== 1157 -YXBw 1158 -d2hhdA== 1159 -c3Ry 1160 -cmFw 1161 -aWFs 1162 -aWN1bA== 1163 -IGFjYw== 1164 -b3Ro 1165 -IHNlY29uZA== 1166 -IGZsbw== 1167 -IHNpeA== 1168 -IGZlZXQ= 1169 -YnI= 1170 -aWV0 1171 -IGxpdHRsZQ== 1172 -bGVz 1173 -IG1vbmV5 1174 -IGRlY2w= 1175 -IGV5 1176 -IGNvbXA= 1177 -YXJpbmc= 1178 -IGFncmU= 1179 -d2hlcmU= 1180 -IFN0 1181 -IHN0cmU= 1182 -ZXg= 1183 -cmFjdA== 1184 -IGludA== 1185 -IGRpcmU= 1186 -IGJlY29tZQ== 1187 -IGhvbg== 1188 -IGNvbnNpZA== 1189 -ZXJ0YWlu 1190 -bm93 1191 -IHNs 1192 -aXRvcg== 1193 -Z2c= 1194 -IGp1bQ== 1195 -IGJ1 1196 -IHRoaW5n 1197 -IGFuc3dlcmVk 1198 -b2Vz 1199 -eWE= 1200 -IFRoYXQ= 1201 -aXpl 1202 -b25k 1203 -YWN0 1204 -IGVmZg== 1205 -IGJhbmc= 1206 -YWJvdXQ= 1207 -IGJlZA== 1208 -b3Jyb3c= 1209 -dW5n 1210 -IFRv 1211 -IGtlcHQ= 1212 -IHdhbA== 1213 -IGJhdGg= 1214 -IGRyYQ== 1215 -IkE= 1216 -cmluZ3M= 1217 -aG9wcA== 1218 -IHJlc2lnbg== 1219 -IGRpbg== 1220 -IGxhZHk= 1221 -LkU= 1222 -IHVzZQ== 1223 -bGlzaA== 1224 -b3Jz 1225 -IHdyaXR0ZW4= 1226 -ZW5l 1227 -aXY= 1228 -IGRpZg== 1229 -IHN0ZQ== 1230 -IHN0b3J5 1231 -Y29t 1232 -cmVz 1233 -ZW50bHk= 1234 -IGZhY3Q= 1235 -aGVz 1236 -d2F5cw== 1237 -IHdoeQ== 1238 -IHRob3VnaA== 1239 -IHN0cg== 1240 -b25kZXI= 1241 -aGVhZA== 1242 -IGNvdXI= 1243 -IG1vbg== 1244 -IHNr 1245 -IGJlbGll 1246 -IGxldA== 1247 -ZmVy 1248 -IHJlcXU= 1249 -IGxpbmU= 1250 -cm9vbQ== 1251 -LWRheQ== 1252 -IGRvbmU= 1253 -IGRvZXM= 1254 -IE9uZQ== 1255 -IGRhbmdv 1256 -YXNzaG9wcA== 1257 -IGNvbnNpZGVy 1258 -IGRpbm5lcg== 1259 -IEZvdW5kYXRpb24= 1260 -Kio= 1261 -ZW1wdA== 1262 -ZXNl 1263 -IHdvcmQ= 1264 -cmVzdA== 1265 -IGVub3VnaA== 1266 -IGdyZWF0 1267 -IG5hbWU= 1268 -IHB1Yg== 1269 -IG1hbm5lcg== 1270 -d2Vy 1271 -aWN0 1272 -aW5lc3M= 1273 -IGhpbXNlbGY= 1274 -IHBlb3BsZQ== 1275 -ZXc= 1276 -IGNvcg== 1277 -ZXN0aW9u 1278 -IGJpZw== 1279 -ZWU= 1280 -IHJp 1281 -aWRlcw== 1282 -IGJyb3RoZXI= 1283 -IGhlYXJ0 1284 -ZWN0ZWQ= 1285 -ZWVk 1286 -IG90aGVycw== 1287 -c29s 1288 -dGVk 1289 -IGV5ZXM= 1290 -IHRyb3VibGU= 1291 -IHRlYWNo 1292 -IGJvYXQ= 1293 -IGZvdXI= 1294 -IGFscmVhZHk= 1295 -cm9t 1296 -Z2hlZA== 1297 -IHNxdQ== 1298 -IHBvbA== 1299 -Y2Vz 1300 -IEhvdHQ= 1301 -IGxlYXZl 1302 -IGRpc3RyaWJ1dA== 1303 -YXN0ZXI= 1304 -Q0g= 1305 -dWM= 1306 -IGlt 1307 -IGhvd2V2ZXI= 1308 -dGhlcmU= 1309 -YXBhbmVzZQ== 1310 -IGxhc3Q= 1311 -IGNy 1312 -aWxpdHk= 1313 -IHNpbXBsZQ== 1314 -IGxpZmU= 1315 -LWM= 1316 -IHJlZ2FyZA== 1317 -IGZpbg== 1318 -dWFs 1319 -IG1lYW5z 1320 -IHN0YW5k 1321 -YXRjaA== 1322 -IHNob3J0 1323 -bmVk 1324 -IHNlZW4= 1325 -IGhhcHA= 1326 -LWs= 1327 -IGFnYWluc3Q= 1328 -aGlt 1329 -YW1lZA== 1330 -IHN0b29k 1331 -IGdyYQ== 1332 -IG1vdGhlcg== 1333 -IGZpc2g= 1334 -IHdhdGVy 1335 -YWls 1336 -Y2Vp 1337 -IHJhdGhlcg== 1338 -IGlucw== 1339 -IGZlZWw= 1340 -IGFsc28= 1341 -IG9yZA== 1342 -IGNvbWluZw== 1343 -aWNz 1344 -IGVpdGhlcg== 1345 -bmNl 1346 -ICc= 1347 -IGtpZA== 1348 -IGxhdWdoZWQ= 1349 -bGlrZQ== 1350 -IEFy 1351 -Z3I= 1352 -IEhvdHRh 1353 -IHRhbGs= 1354 -Z2V0aGVy 1355 -IFNpcg== 1356 -IHB1bg== 1357 -UHJv 1358 -YXRz 1359 -bW9zdA== 1360 -IHJlcA== 1361 -IGdp 1362 -aXNm 1363 -YmFibHk= 1364 -YWtlcw== 1365 -IE5vdA== 1366 -bnk= 1367 -IGFwcGVhcg== 1368 -bXA= 1369 -Y2hh 1370 -IGFjdA== 1371 -YmVk 1372 -aWVm 1373 -dWZm 1374 -IGFwbw== 1375 -IG1ldA== 1376 -IHJldHVybmVk 1377 -IHNvdW5k 1378 -dXNpbmVzcw== 1379 -IGxhdWdo 1380 -IGNsZWFy 1381 -IG5lZWQ= 1382 -ZmVzcw== 1383 -ZXN0ZWQ= 1384 -IGludg== 1385 -IGFjY2VwdA== 1386 -dW5kZXI= 1387 -Owo= 1388 -IHN1cnBy 1389 -ZGU= 1390 -IHRyYWlu 1391 -IGhvdGVs 1392 -IHNsZWVw 1393 -IGRy 
1394 -IGhvbGQ= 1395 -bG9jaw== 1396 -cHVyYQ== 1397 -IHNwcmluZ3M= 1398 -IC4uLi4uLg== 1399 -IGFncmVlbWVudA== 1400 -IERhcg== 1401 -IHJlc3Q= 1402 -Y2x1ZA== 1403 -YXRvcg== 1404 -YXY= 1405 -IG9yaWc= 1406 -IG9yaWdpbg== 1407 -IGVs 1408 -IG5vcg== 1409 -IHByZXM= 1410 -IHVuZGVyc3RhbmQ= 1411 -IHRha2Vu 1412 -IGxpZ2h0 1413 -ZW5lcg== 1414 -c29tZQ== 1415 -IGJyb3VnaHQ= 1416 -cmFwaA== 1417 -IG1vc3Q= 1418 -b2tl 1419 -LXc= 1420 -IHVudA== 1421 -IGZhdGhlcg== 1422 -IHVzZWQ= 1423 -IGVhdA== 1424 -IHllYXJz 1425 -IFdoaWxl 1426 -IGNoYW4= 1427 -IHN1ZGQ= 1428 -IHN1ZGRlbg== 1429 -IGFwb2xvZw== 1430 -IHNldHQ= 1431 -IHRoaW4= 1432 -IE15 1433 -IHRlbg== 1434 -aW1lcw== 1435 -Zm9y 1436 -b3Vk 1437 -V2hlbg== 1438 -IGRldA== 1439 -IGxpdmU= 1440 -IG9j 1441 -IGZpdmU= 1442 -IGNvbnQ= 1443 -IGhlbHA= 1444 -IHdh 1445 -IHBhc3NlZA== 1446 -IHJ1bg== 1447 -IG1ha2luZw== 1448 -IHN0cmFuZ2U= 1449 -IHRha2luZw== 1450 -IGVhY2g= 1451 -IllvdQ== 1452 -IGFub3RoZXI= 1453 -IlNheQ== 1454 -IlRoZQ== 1455 -YXRlcw== 1456 -IHBsZWFz 1457 -YXNzaG9wcGVycw== 1458 -IG1vbQ== 1459 -IG1vbWVudA== 1460 -ZW50bGU= 1461 -bmdsaXNo 1462 -Q0hB 1463 -IG9yaWdpbmFs 1464 -aW9ucw== 1465 -dXJpbmc= 1466 -IHB1YmxpYw== 1467 -dWN0 1468 -dWNr 1469 -IHF1ZXN0aW9u 1470 -YWk= 1471 -Y3k= 1472 -ZWs= 1473 -IGZsb29y 1474 -IGNhcg== 1475 -b3VzZQ== 1476 -IHNpZGU= 1477 -LXlh 1478 -IGNlcnRhaW4= 1479 -aHlz 1480 -LWQ= 1481 -aWdo 1482 -YWdpbg== 1483 -d2VldA== 1484 -IHBvb3I= 1485 -IGRlY2lk 1486 -dWFsbHk= 1487 -IGJ1c2luZXNz 1488 -cHJv 1489 -cGxhaW4= 1490 -IHN0b3A= 1491 -IQo= 1492 -IEhvdw== 1493 -IldoYXQ= 1494 -Y2Fu 1495 -IFVu 1496 -cHM= 1497 -dW5k 1498 -LW5pZ2h0 1499 -IG1lZXRpbmc= 1500 -ZWRv 1501 -IHJhaXNl 1502 -R3V0ZW5iZXJn 1503 -IERhcmxpbmc= 1504 -dW1l 1505 -IEVuZ2xpc2g= 1506 -VEVS 1507 -YWRpbmc= 1508 -IHRyYW5zbA== 1509 -IGFibGU= 1510 -c3NpYmxl 1511 -IHNhdGlzZg== 1512 -IHdhbnRlZA== 1513 -IHN1Yg== 1514 -IGNhc2U= 1515 -aWZpYw== 1516 -aXRlcmFyeQ== 1517 -IG1haWQ= 1518 -IGluYw== 1519 -IHBvcw== 1520 -IHBvc2l0aW9u 1521 -IHBhdA== 1522 -dXJlZA== 1523 -b3JyeQ== 1524 -IGFjY291bnQ= 1525 -IGJvdGg= 1526 -IGZyaWU= 1527 -IGZyaWVuZA== 1528 -dGhpcw== 1529 -IGFsd2F5cw== 1530 -IHBhcnRpY3Vs 1531 -V2hhdA== 1532 -IHNtYWxs 1533 -ZW50eQ== 1534 -dXNoZWQ= 1535 -IG1pcw== 1536 -dWxseQ== 1537 -IHJlY2Vp 1538 -WW91 1539 -IHlldA== 1540 -IGdhdmU= 1541 -QnV0 1542 -aGFk 1543 -IGFuc3dlcg== 1544 -IGFicw== 1545 -aWxl 1546 -Y2tldA== 1547 -IG5vb2Q= 1548 -IGNvdXJzZQ== 1549 -IGZvcm0= 1550 -IGV2ZXJ5dGhpbmc= 1551 -ZWN0aW9u 1552 -SWY= 1553 -cGFydA== 1554 -IHNpbmc= 1555 -IHNpdA== 1556 -IHB1cg== 1557 -aXA= 1558 -IGZpc2hpbmc= 1559 -IGVo 1560 -IHBhcg== 1561 -IHRvZ2V0aGVy 1562 -SGU= 1563 -IHdoZQ== 1564 -IHdoZXRoZXI= 1565 -IGJyYQ== 1566 -Illlcw== 1567 -IHB1bmlzaA== 1568 -U2hpcnQ= 1569 -IFllZG8= 1570 -IGZhcmV3 1571 -IGZhcmV3ZWxs 1572 -IGRhbmNl 1573 -IGxlc3M= 1574 -dXJhbA== 1575 -IGRlZg== 1576 -IGF0dGVtcHQ= 1577 -d2Vlbg== 1578 -IHNpZ24= 1579 -IHN5 1580 -ZmVyZW50 1581 -IGxlYXN0 1582 -c2Vy 1583 -b2I= 1584 -bmRpbmc= 1585 -IHNvcnJ5 1586 -IGp1bXBlZA== 1587 -IGphbg== 1588 -IGphbml0b3I= 1589 -aXplZA== 1590 -IHRvd2FyZA== 1591 -IG1vcg== 1592 -YXZpbmc= 1593 -IGJpdA== 1594 -IlRoaXM= 1595 -IHJlbWFyaw== 1596 -IGZ1dA== 1597 -IHdvbmRlcg== 1598 -IGZ1bg== 1599 -VGhlbg== 1600 -IGRlYw== 1601 -IHdob20= 1602 -IGRpZG4= 1603 -IHJlYw== 1604 -YmVj 1605 -Iklm 1606 -IGtuZXc= 1607 -YWZ0ZXI= 1608 -IHRodXM= 1609 -IGlzbg== 1610 -IHNpZ2h0 1611 -bWVk 1612 -W0Y= 1613 -dXNz 1614 -Y2lkZW50 1615 -dGhlbQ== 1616 -IGZpZg== 1617 -IGRyYXc= 1618 -IGhlYXI= 1619 -IHdyaXRpbmc= 1620 -IGdldHRpbmc= 1621 -c2g= 1622 -ZmVyZW5jZQ== 1623 -IHJhaXNlZA== 1624 -dGhleQ== 1625 -YXg= 1626 -IGZpbmU= 1627 -c2Vs 1628 
-IE5vYmU= 1629 -IE5vYmVvaw== 1630 -IE5vYmVva2E= 1631 -b3JtYWw= 1632 -IGVC 1633 -aWNlbnNl 1634 -MDA= 1635 -IGJlc3Q= 1636 -d29y 1637 -Zmlj 1638 -dGVyZXN0 1639 -IHJlbWFy 1640 -Ymw= 1641 -YXJ0ZWQ= 1642 -IGRhcms= 1643 -IHlvdW5n 1644 -dXNo 1645 -IGJldA== 1646 -b3V0aA== 1647 -aG91c2U= 1648 -YXVnaHQ= 1649 -IHBoeXM= 1650 -IHN0cm9uZw== 1651 -IGZ1cg== 1652 -IHJvbGw= 1653 -Y292ZQ== 1654 -Y2hpZWY= 1655 -YXdh 1656 -IGZvbGxvd2Vk 1657 -IGZvbmQ= 1658 -IGZ1dHVyZQ== 1659 -aXJk 1660 -ZnVsbHk= 1661 -IGVmZm9ydA== 1662 -QWZ0ZXI= 1663 -b3dhcmQ= 1664 -IHJlYWxseQ== 1665 -IGFtb25n 1666 -IGFyb3VuZA== 1667 -IGNvbXBs 1668 -IGdheg== 1669 -IGJvdw== 1670 -YXRlcg== 1671 -IGluc2lzdA== 1672 -IHR1cm5lZA== 1673 -aGVs 1674 -cmVt 1675 -IGhvdXJz 1676 -IGRlY2lkZWQ= 1677 -eXM= 1678 -IG1vbnRo 1679 -LWE= 1680 -IGFkdg== 1681 -IGJlbGlldmU= 1682 -IHRlYWNoaW5n 1683 -IGVhc3k= 1684 -IGRpcmVjdGlvbg== 1685 -b29rZWQ= 1686 -IHdhcg== 1687 -IHVubGVzcw== 1688 -aGF2ZQ== 1689 -IHNxdWFyZQ== 1690 -dmls 1691 -IHF1aWV0 1692 -IGh1bmc= 1693 -IGdvZXM= 1694 -IHBhaWQ= 1695 -IHNoYWxs 1696 -Ik5v 1697 -IHB1bmlzaG1lbnQ= 1698 -cG9zZQ== 1699 -IHN3ZWV0 1700 -J3Zl 1701 -IldlbGw= 1702 -IGdlbnRsZQ== 1703 -IG5vcm1hbA== 1704 -YWdyYXBo 1705 -Y2hpdmU= 1706 -Y2hhbg== 1707 -IGluY2x1ZA== 1708 -d3c= 1709 -b3Jn 1710 -dGVt 1711 -QVI= 1712 -IFRI 1713 -IGVxdQ== 1714 -IHRvbmU= 1715 -IHBvc3NpYmxl 1716 -IGJlY29t 1717 -IEphcGFuZXNl 1718 -dmVycw== 1719 -IGZvbGxvd2luZw== 1720 -IHBhaW4= 1721 -IHdob2xl 1722 -d3I= 1723 -IHNlcmlvdXM= 1724 -IG5hcg== 1725 -IHRpcmVk 1726 -SW4= 1727 -IHBsYXk= 1728 -IHByb20= 1729 -IGdhbWU= 1730 -IFNvbWU= 1731 -IGhhcHBlbmVk 1732 -IGN1dA== 1733 -IHR3ZW50eQ== 1734 -IGRvb3I= 1735 -IG1vcm5pbmc= 1736 -aGluZA== 1737 -IGJyZQ== 1738 -IGluc2lkZQ== 1739 -b3Zl 1740 -YWx0aA== 1741 -dWs= 1742 -YXJnZQ== 1743 -YW1i 1744 -IGRhbQ== 1745 -IHdvcnJ5 1746 -YXRpdmU= 1747 -IGV4cGVjdGVk 1748 -IGZhbQ== 1749 -IHByYQ== 1750 -IHBvY2tldA== 1751 -b29rcw== 1752 -Y2hlZA== 1753 -IHNpbA== 1754 -b2w= 1755 -IGZhdg== 1756 -IGVsc2U= 1757 -IGhpZ2g= 1758 -IHJlYWw= 1759 -IGFsb25n 1760 -IG1lZA== 1761 -aGlr 1762 -aGVtYXQ= 1763 -aGVtYXRpY3M= 1764 -IGxpc3Q= 1765 -IHNpY2s= 1766 -b2ludA== 1767 -W0Zvb3Q= 1768 -W0Zvb3Rub3Q= 1769 -W0Zvb3Rub3Rl 1770 -Ll0K 1771 -bmlnaHQ= 1772 -c2Vz 1773 -aW9y 1774 -IHNheXM= 1775 -IG1vdXRo 1776 -aG93 1777 -bWluZw== 1778 -IGNsbw== 1779 -IGN1cg== 1780 -Z2luZw== 1781 -IHN1ZGRlbmx5 1782 -LWFo 1783 -YW1w 1784 -IGJsYWNr 1785 -cm9zcw== 1786 -IGZhYw== 1787 -c2VsdmVz 1788 -aWV3 1789 -aXNzaW9u 1790 -IGNvcHlyaWdodA== 1791 -IHBhcmFncmFwaA== 1792 -IEFyY2hpdmU= 1793 -IGRvbmF0aW9ucw== 1794 -UHJvamVjdA== 1795 -IGNvc3Q= 1796 -Lm9yZw== 1797 -TEk= 1798 -dWNlZA== 1799 -IHN1Yw== 1800 -eWxl 1801 -IGZvcmNl 1802 -am95 1803 -b3VjaA== 1804 -dHI= 1805 -SXQ= 1806 -IHRyYWQ= 1807 -IHByZXNlbnQ= 1808 -IGV4dA== 1809 -YXNlZA== 1810 -cmVkaXQ= 1811 -IGZhdWx0 1812 -aWI= 1813 -LW0= 1814 -dXJk 1815 -IHRyaWVk 1816 -dGltZQ== 1817 -IHByZXQ= 1818 -IHNwZWU= 1819 -b3dlcg== 1820 -IHdvcmRz 1821 -Q0hBUA== 1822 -Q0hBUFRFUg== 1823 -c2Nob29s 1824 -IGFzaw== 1825 -IGRvaW5n 1826 -YXRlbHk= 1827 -IHVudGls 1828 -Ym91dA== 1829 -IHRyZWU= 1830 -Y2FsbA== 1831 -YW1hc2g= 1832 -YW1hc2hpcg== 1833 -YW1hc2hpcm8= 1834 -c3Rl 1835 -IGJlaGluZA== 1836 -b2xk 1837 -IHdhbGw= 1838 -aXRvcnk= 1839 -IHJvbGxlZA== 1840 -IG1vdmU= 1841 -IGFwb2xvZ2l6ZQ== 1842 -IGxhcmdl 1843 -YW1ib28= 1844 -c3U= 1845 -IHNldHRsZWQ= 1846 -Ikhl 1847 -d28= 1848 -IHRoaW5raW5n 1849 -dXNlZA== 1850 -aWZpZWQ= 1851 -IGFsbW9zdA== 1852 -IHRyZQ== 1853 -IHRyZWF0 1854 -IG5vb2RsZQ== 1855 -IG5vdGU= 1856 -IEFsbA== 1857 -IGJlYXQ= 1858 -IG9iamVjdA== 1859 -IHNlZW1z 1860 -IGlkZQ== 1861 -WWVz 1862 
-b3dz 1863 -IHJlbWFpbg== 1864 -IGJlZ2lu 1865 -dWdodA== 1866 -bWVudHM= 1867 -IGFsb25l 1868 -c3BlY3Q= 1869 -IG1hdGhlbWF0aWNz 1870 -IHJvdWdo 1871 -IG91dHNpZGU= 1872 -IGNvbWVz 1873 -YmFjaw== 1874 -IHdpbmQ= 1875 -c2Vk 1876 -IHdvdWxkbg== 1877 -ZWVy 1878 -aW51dA== 1879 -ZnJvbQ== 1880 -IHJlcGw= 1881 -IG5hcnJvdw== 1882 -IGluY2lkZW50 1883 -IGFpcg== 1884 -IHNlYQ== 1885 -dHM= 1886 -IHN1cnByaXNlZA== 1887 -IHRlYQ== 1888 -UmVk 1889 -IHRhbGtpbmc= 1890 -IGJvc3M= 1891 -cXVl 1892 -IHBpY3Q= 1893 -aXJ0eQ== 1894 -IGNl 1895 -IGxpbQ== 1896 -IFdoeQ== 1897 -IHBvaW50 1898 -IGxhdw== 1899 -Y2lhdGVk 1900 -IG1vb24= 1901 -aXJjdQ== 1902 -Z290 1903 -IElz 1904 -IGhhbmRz 1905 -IGhvbm9y 1906 -YXV0 1907 -cmdl 1908 -IHN0YXRl 1909 -IExpdGVyYXJ5 1910 -LkY= 1911 -VGhpcw== 1912 -bGluZQ== 1913 -Lmc= 1914 -Lmd1dGVuYmVyZw== 1915 -IE9G 1916 -RU4= 1917 -cmFjdGVy 1918 -IGJlbmU= 1919 -IEV2ZW4= 1920 -b3Vi 1921 -IG1ha2Vz 1922 -IGludGVyZXN0 1923 -b3Bl 1924 -bXM= 1925 -IHJlc3BvbnM= 1926 -IGZvcmU= 1927 -IHNvbWV3aGF0 1928 -IGhvbmVzdA== 1929 -b2Nr 1930 -aXJpdA== 1931 -IGhlbGQ= 1932 -IGFkZGVk 1933 -ZnU= 1934 -YWRlZA== 1935 -YWxz 1936 -YXR0 1937 -dGVybg== 1938 -IHBlcnNvbmFs 1939 -IGFzcw== 1940 -IFdpdGg= 1941 -dGlj 1942 -VG9reW8= 1943 -IHNob3V0 1944 -IHByZXR0eQ== 1945 -dW1i 1946 -IGVhcmx5 1947 -b3BwZWQ= 1948 -IGZ1cnRoZXI= 1949 -IGZyZQ== 1950 -ZXNpZGVz 1951 -IGJhbWJvbw== 1952 -IGly 1953 -bW9yZQ== 1954 -IGxpdmluZw== 1955 -IHJlY2VpdmVk 1956 -IGxpdmVk 1957 -IG1lYW50 1958 -IGNvd2FyZA== 1959 -cG9zaXRpb24= 1960 -IGxvYw== 1961 -aWxlZA== 1962 -IHRlbmRlcg== 1963 -IGNo 1964 -IEFmdGVy 1965 -Y2Vy 1966 -IGZhdm9y 1967 -d2hv 1968 -IGxpa2Vk 1969 -cmFuY2U= 1970 -IHByaQ== 1971 -a2lzaGE= 1972 -IHN0dWR5 1973 -IG9yZGVy 1974 -IGFmdGVyd2FyZA== 1975 -IGdyZWF0bHk= 1976 -IHVuYWJsZQ== 1977 -Z28= 1978 -IHdhaXQ= 1979 -ZXBpbmc= 1980 -aWRpbmc= 1981 -IGZvcnR5 1982 -IHNreQ== 1983 -IG9mZmljZQ== 1984 -d2lsbA== 1985 -IkQ= 1986 -d2Vs 1987 -IHN0YXRpb24= 1988 -Ym8= 1989 -aG90 1990 -c3VjaA== 1991 -IGxvdWQ= 1992 -IGF3 1993 -bGFuZA== 1994 -Pwo= 1995 -IHJlc3BlY3Q= 1996 -YW5jZXM= 1997 -aWVudA== 1998 -IG91Z2h0 1999 diff --git a/tests/assets/tokenizer/tokenizer.json b/tests/assets/tokenizer/tokenizer.json new file mode 100644 index 000000000..a39c930b4 --- /dev/null +++ b/tests/assets/tokenizer/tokenizer.json @@ -0,0 +1,2037 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "\u0000": 0, + "\u0001": 1, + "\u0002": 2, + "\u0003": 3, + "\u0004": 4, + "\u0005": 5, + "\u0006": 6, + "\u0007": 7, + "\b": 8, + "\t": 9, + "\n": 10, + "\u000b": 11, + "\f": 12, + "\r": 13, + "\u000e": 14, + "\u000f": 15, + "\u0010": 16, + "\u0011": 17, + "\u0012": 18, + "\u0013": 19, + "\u0014": 20, + "\u0015": 21, + "\u0016": 22, + "\u0017": 23, + "\u0018": 24, + "\u0019": 25, + "\u001a": 26, + "\u001b": 27, + "\u001c": 28, + "\u001d": 29, + "\u001e": 30, + "\u001f": 31, + " ": 32, + "!": 33, + "\"": 34, + "#": 35, + "$": 36, + "%": 37, + "&": 38, + "'": 39, + "(": 40, + ")": 41, + "*": 
42, + "+": 43, + ",": 44, + "-": 45, + ".": 46, + "/": 47, + "0": 48, + "1": 49, + "2": 50, + "3": 51, + "4": 52, + "5": 53, + "6": 54, + "7": 55, + "8": 56, + "9": 57, + ":": 58, + ";": 59, + "<": 60, + "=": 61, + ">": 62, + "?": 63, + "@": 64, + "A": 65, + "B": 66, + "C": 67, + "D": 68, + "E": 69, + "F": 70, + "G": 71, + "H": 72, + "I": 73, + "J": 74, + "K": 75, + "L": 76, + "M": 77, + "N": 78, + "O": 79, + "P": 80, + "Q": 81, + "R": 82, + "S": 83, + "T": 84, + "U": 85, + "V": 86, + "W": 87, + "X": 88, + "Y": 89, + "Z": 90, + "[": 91, + "\\": 92, + "]": 93, + "^": 94, + "_": 95, + "`": 96, + "a": 97, + "b": 98, + "c": 99, + "d": 100, + "e": 101, + "f": 102, + "g": 103, + "h": 104, + "i": 105, + "j": 106, + "k": 107, + "l": 108, + "m": 109, + "n": 110, + "o": 111, + "p": 112, + "q": 113, + "r": 114, + "s": 115, + "t": 116, + "u": 117, + "v": 118, + "w": 119, + "x": 120, + "y": 121, + "z": 122, + "{": 123, + "|": 124, + "}": 125, + "~": 126, + "": 127, + "\\x80": 128, + "\\x81": 129, + "\\x82": 130, + "\\x83": 131, + "\\x84": 132, + "\\x85": 133, + "\\x86": 134, + "\\x87": 135, + "\\x88": 136, + "\\x89": 137, + "\\x8a": 138, + "\\x8b": 139, + "\\x8c": 140, + "\\x8d": 141, + "\\x8e": 142, + "\\x8f": 143, + "\\x90": 144, + "\\x91": 145, + "\\x92": 146, + "\\x93": 147, + "\\x94": 148, + "\\x95": 149, + "\\x96": 150, + "\\x97": 151, + "\\x98": 152, + "\\x99": 153, + "\\x9a": 154, + "\\x9b": 155, + "\\x9c": 156, + "\\x9d": 157, + "\\x9e": 158, + "\\x9f": 159, + "\\xa0": 160, + "\\xa1": 161, + "\\xa2": 162, + "\\xa3": 163, + "\\xa4": 164, + "\\xa5": 165, + "\\xa6": 166, + "\\xa7": 167, + "\\xa8": 168, + "\\xa9": 169, + "\\xaa": 170, + "\\xab": 171, + "\\xac": 172, + "\\xad": 173, + "\\xae": 174, + "\\xaf": 175, + "\\xb0": 176, + "\\xb1": 177, + "\\xb2": 178, + "\\xb3": 179, + "\\xb4": 180, + "\\xb5": 181, + "\\xb6": 182, + "\\xb7": 183, + "\\xb8": 184, + "\\xb9": 185, + "\\xba": 186, + "\\xbb": 187, + "\\xbc": 188, + "\\xbd": 189, + "\\xbe": 190, + "\\xbf": 191, + "\\xc0": 192, + "\\xc1": 193, + "\\xc2": 194, + "\\xc3": 195, + "\\xc4": 196, + "\\xc5": 197, + "\\xc6": 198, + "\\xc7": 199, + "\\xc8": 200, + "\\xc9": 201, + "\\xca": 202, + "\\xcb": 203, + "\\xcc": 204, + "\\xcd": 205, + "\\xce": 206, + "\\xcf": 207, + "\\xd0": 208, + "\\xd1": 209, + "\\xd2": 210, + "\\xd3": 211, + "\\xd4": 212, + "\\xd5": 213, + "\\xd6": 214, + "\\xd7": 215, + "\\xd8": 216, + "\\xd9": 217, + "\\xda": 218, + "\\xdb": 219, + "\\xdc": 220, + "\\xdd": 221, + "\\xde": 222, + "\\xdf": 223, + "\\xe0": 224, + "\\xe1": 225, + "\\xe2": 226, + "\\xe3": 227, + "\\xe4": 228, + "\\xe5": 229, + "\\xe6": 230, + "\\xe7": 231, + "\\xe8": 232, + "\\xe9": 233, + "\\xea": 234, + "\\xeb": 235, + "\\xec": 236, + "\\xed": 237, + "\\xee": 238, + "\\xef": 239, + "\\xf0": 240, + "\\xf1": 241, + "\\xf2": 242, + "\\xf3": 243, + "\\xf4": 244, + "\\xf5": 245, + "\\xf6": 246, + "\\xf7": 247, + "\\xf8": 248, + "\\xf9": 249, + "\\xfa": 250, + "\\xfb": 251, + "\\xfc": 252, + "\\xfd": 253, + "\\xfe": 254, + "\\xff": 255, + " t": 256, + "he": 257, + " a": 258, + "in": 259, + " s": 260, + " w": 261, + " the": 262, + " o": 263, + "re": 264, + " b": 265, + "ou": 266, + "ed": 267, + " m": 268, + "nd": 269, + " I": 270, + "ha": 271, + "it": 272, + "er": 273, + "ing": 274, + " f": 275, + "is": 276, + " to": 277, + "en": 278, + "on": 279, + "or": 280, + "as": 281, + " c": 282, + " of": 283, + " and": 284, + " d": 285, + "ll": 286, + "at": 287, + "an": 288, + "ar": 289, + " p": 290, + " n": 291, + " in": 292, + "le": 293, + "om": 294, + "ot": 295, + " be": 
296, + " h": 297, + "ut": 298, + "ow": 299, + "es": 300, + "hat": 301, + " g": 302, + " he": 303, + " ha": 304, + " l": 305, + " was": 306, + "ld": 307, + "gh": 308, + "id": 309, + "ch": 310, + " th": 311, + " it": 312, + "ay": 313, + " on": 314, + "ce": 315, + "se": 316, + "ent": 317, + " st": 318, + "ly": 319, + "ve": 320, + "et": 321, + "st": 322, + " T": 323, + " e": 324, + " y": 325, + "ght": 326, + "ir": 327, + " me": 328, + "oo": 329, + "al": 330, + "ith": 331, + " re": 332, + "im": 333, + " that": 334, + " as": 335, + "ould": 336, + "ro": 337, + "ad": 338, + "ion": 339, + ".\n": 340, + "her": 341, + " my": 342, + "ct": 343, + " not": 344, + " with": 345, + " for": 346, + " u": 347, + "ke": 348, + " you": 349, + " S": 350, + " is": 351, + "ight": 352, + "\"\n": 353, + "am": 354, + "ic": 355, + "ur": 356, + " at": 357, + "..": 358, + "ac": 359, + "ter": 360, + " wh": 361, + " an": 362, + " we": 363, + " The": 364, + "if": 365, + " or": 366, + " but": 367, + "ver": 368, + " \"": 369, + " r": 370, + "out": 371, + "ome": 372, + " had": 373, + "pp": 374, + "qu": 375, + " su": 376, + " this": 377, + "red": 378, + "ard": 379, + " so": 380, + "ell": 381, + " would": 382, + " his": 383, + " sh": 384, + "ine": 385, + "ra": 386, + " se": 387, + " by": 388, + ".\"\n": 389, + " P": 390, + "hen": 391, + " A": 392, + " have": 393, + " fr": 394, + " sa": 395, + " H": 396, + " one": 397, + "em": 398, + "ked": 399, + "irt": 400, + "ect": 401, + " him": 402, + " li": 403, + " ab": 404, + "ation": 405, + "hing": 406, + "the": 407, + " R": 408, + " le": 409, + "ss": 410, + " W": 411, + "cu": 412, + "ill": 413, + "'t": 414, + "art": 415, + "all": 416, + ",\n": 417, + "own": 418, + "ore": 419, + " all": 420, + " k": 421, + " go": 422, + "hirt": 423, + "and": 424, + " out": 425, + "ame": 426, + "ain": 427, + " if": 428, + " no": 429, + " do": 430, + " they": 431, + "ool": 432, + "un": 433, + "to": 434, + " up": 435, + " Red": 436, + " ne": 437, + " K": 438, + " from": 439, + " Shirt": 440, + " wor": 441, + "ong": 442, + " there": 443, + " said": 444, + "ri": 445, + "ant": 446, + " B": 447, + " any": 448, + "ud": 449, + "ind": 450, + " whi": 451, + "ab": 452, + "ound": 453, + " about": 454, + " them": 455, + "cup": 456, + "ak": 457, + " de": 458, + " te": 459, + " M": 460, + "ake": 461, + "cupine": 462, + "ig": 463, + " were": 464, + "orcupine": 465, + "il": 466, + "chool": 467, + " ro": 468, + "ood": 469, + " are": 470, + "ive": 471, + " like": 472, + "yo": 473, + " hou": 474, + "'s": 475, + "one": 476, + "us": 477, + "el": 478, + "ul": 479, + "ack": 480, + "op": 481, + ",\"": 482, + "th": 483, + "acher": 484, + "um": 485, + "ang": 486, + " fa": 487, + "ag": 488, + " school": 489, + " j": 490, + "te": 491, + "ok": 492, + "ess": 493, + "ust": 494, + "ers": 495, + "....": 496, + " C": 497, + "ther": 498, + "han": 499, + " when": 500, + " sp": 501, + " man": 502, + " can": 503, + "ough": 504, + " who": 505, + " get": 506, + " did": 507, + " po": 508, + "ci": 509, + " al": 510, + "ist": 511, + " com": 512, + "lf": 513, + "au": 514, + " Porcupine": 515, + " which": 516, + "ven": 517, + " af": 518, + "wn": 519, + "ass": 520, + "ber": 521, + " ex": 522, + "ous": 523, + "est": 524, + "lo": 525, + " tr": 526, + "ellow": 527, + " say": 528, + "ought": 529, + " room": 530, + " some": 531, + "--": 532, + " O": 533, + "ate": 534, + " v": 535, + "hed": 536, + "ap": 537, + " tw": 538, + " bec": 539, + "ree": 540, + "ject": 541, + "ks": 542, + " con": 543, + " been": 544, + "ents": 545, + "ide": 546, + " could": 547, + " 
G": 548, + "ep": 549, + " pro": 550, + "nt": 551, + " house": 552, + " ag": 553, + " If": 554, + " kn": 555, + " fellow": 556, + " what": 557, + "way": 558, + "ish": 559, + " am": 560, + "ite": 561, + "nder": 562, + "ime": 563, + " pr": 564, + " teacher": 565, + "are": 566, + " bo": 567, + " she": 568, + " N": 569, + "ice": 570, + "ast": 571, + "ure": 572, + "ie": 573, + " such": 574, + "uten": 575, + "utenber": 576, + "utenberg": 577, + " qu": 578, + "lown": 579, + " wr": 580, + "pt": 581, + " He": 582, + " stud": 583, + "here": 584, + " more": 585, + "ry": 586, + "tter": 587, + " Y": 588, + " may": 589, + "ity": 590, + " loo": 591, + " other": 592, + "his": 593, + " Pro": 594, + " will": 595, + " It": 596, + "ort": 597, + " should": 598, + "very": 599, + "we": 600, + " pl": 601, + "ash": 602, + ".\"": 603, + " app": 604, + " day": 605, + "urn": 606, + "po": 607, + " her": 608, + " ": 609, + "not": 610, + "ck": 611, + " un": 612, + "hi": 613, + "ving": 614, + " old": 615, + " time": 616, + "\"T": 617, + " way": 618, + "able": 619, + "?\"\n": 620, + " Clown": 621, + " only": 622, + "ub": 623, + "ach": 624, + " off": 625, + " than": 626, + "ally": 627, + " their": 628, + "be": 629, + "king": 630, + "other": 631, + "ary": 632, + "ans": 633, + "ated": 634, + "self": 635, + " going": 636, + "uch": 637, + "oll": 638, + " back": 639, + "iyo": 640, + "-t": 641, + "ance": 642, + "ade": 643, + " Project": 644, + "sp": 645, + " two": 646, + " thought": 647, + "so": 648, + " right": 649, + " head": 650, + "ved": 651, + " D": 652, + " pre": 653, + " see": 654, + " us": 655, + " students": 656, + "cip": 657, + " don": 658, + " night": 659, + "incip": 660, + " Kiyo": 661, + "pl": 662, + "ared": 663, + " Gutenberg": 664, + " co": 665, + " how": 666, + "omet": 667, + "ff": 668, + "\"I": 669, + ",--": 670, + " asked": 671, + "incipal": 672, + "ever": 673, + " ac": 674, + " F": 675, + " make": 676, + "itt": 677, + " might": 678, + "ge": 679, + "led": 680, + " after": 681, + "ign": 682, + " gr": 683, + " made": 684, + "dd": 685, + " know": 686, + " come": 687, + " br": 688, + "thing": 689, + " But": 690, + " mat": 691, + " On": 692, + "ory": 693, + "cl": 694, + " E": 695, + "ble": 696, + "og": 697, + " your": 698, + "ull": 699, + " work": 700, + "ear": 701, + " three": 702, + "ied": 703, + "but": 704, + "The": 705, + "pe": 706, + "ace": 707, + " start": 708, + "ick": 709, + " over": 710, + "our": 711, + " much": 712, + " want": 713, + "imp": 714, + " part": 715, + "ho": 716, + "ink": 717, + "ence": 718, + " down": 719, + " even": 720, + " principal": 721, + "ling": 722, + "ount": 723, + "ause": 724, + " cl": 725, + " bl": 726, + "-tm": 727, + "omething": 728, + " into": 729, + "orm": 730, + "okyo": 731, + " dis": 732, + " fe": 733, + " face": 734, + "......": 735, + "ress": 736, + "ment": 737, + "ire": 738, + " ar": 739, + "ty": 740, + " mo": 741, + "reat": 742, + " fir": 743, + "per": 744, + " our": 745, + "co": 746, + " then": 747, + " told": 748, + "ings": 749, + " take": 750, + " beg": 751, + "ner": 752, + "ition": 753, + "ose": 754, + " own": 755, + " again": 756, + " seem": 757, + "ise": 758, + " wat": 759, + "\"W": 760, + " far": 761, + "aking": 762, + "fore": 763, + "ady": 764, + "-s": 765, + "less": 766, + " ret": 767, + " sha": 768, + " came": 769, + "ger": 770, + " good": 771, + "ather": 772, + "ark": 773, + "row": 774, + " ke": 775, + "'m": 776, + " has": 777, + "ath": 778, + "pped": 779, + " went": 780, + " tell": 781, + "quash": 782, + " en": 783, + " first": 784, + " hot": 785, + "iz": 786, + 
" away": 787, + " something": 788, + " rem": 789, + " town": 790, + " sm": 791, + " This": 792, + " better": 793, + " Then": 794, + "was": 795, + "of": 796, + "bard": 797, + " L": 798, + "li": 799, + "fe": 800, + " Tokyo": 801, + " long": 802, + "ily": 803, + " sure": 804, + " looked": 805, + "ubbard": 806, + "ction": 807, + "ord": 808, + " many": 809, + "ious": 810, + " too": 811, + " here": 812, + "os": 813, + " under": 814, + "ase": 815, + "ng": 816, + "ped": 817, + "od": 818, + "me": 819, + " just": 820, + " now": 821, + "ince": 822, + " heard": 823, + " kind": 824, + " They": 825, + " before": 826, + "hy": 827, + " In": 828, + " ent": 829, + " board": 830, + "!\"": 831, + "ward": 832, + " being": 833, + " well": 834, + "erm": 835, + "ried": 836, + " wrong": 837, + "aid": 838, + "xt": 839, + " return": 840, + "ited": 841, + " yen": 842, + " matter": 843, + " call": 844, + " tal": 845, + " You": 846, + "ced": 847, + "ised": 848, + " cha": 849, + "ons": 850, + " same": 851, + " once": 852, + "day": 853, + "ft": 854, + " sw": 855, + " because": 856, + " think": 857, + " where": 858, + " No": 859, + " Hubbard": 860, + " Squash": 861, + " cop": 862, + "with": 863, + "ered": 864, + "ollow": 865, + " place": 866, + "idd": 867, + "cess": 868, + " show": 869, + "isha": 870, + " ra": 871, + " letter": 872, + "ne": 873, + "ves": 874, + "ating": 875, + "rang": 876, + " aff": 877, + " hand": 878, + " sc": 879, + " pers": 880, + "int": 881, + "pr": 882, + "side": 883, + "fter": 884, + " saying": 885, + " lau": 886, + "that": 887, + " without": 888, + "ron": 889, + "air": 890, + "lect": 891, + " What": 892, + "elt": 893, + " while": 894, + "oga": 895, + "aper": 896, + " pe": 897, + "oy": 898, + " sat": 899, + "ies": 900, + " add": 901, + " days": 902, + " spe": 903, + " ho": 904, + " ans": 905, + " har": 906, + " When": 907, + " anything": 908, + "pen": 909, + "]\n": 910, + "tain": 911, + " must": 912, + " new": 913, + "lic": 914, + " vo": 915, + "hile": 916, + "get": 917, + " As": 918, + " very": 919, + "'re": 920, + " every": 921, + "ave": 922, + "?\"": 923, + "adger": 924, + " Koga": 925, + " Mr": 926, + "rough": 927, + "ult": 928, + " follow": 929, + "ting": 930, + "ife": 931, + "iddle": 932, + "ful": 933, + "ank": 934, + " So": 935, + " seemed": 936, + " And": 937, + "ix": 938, + " set": 939, + " care": 940, + " res": 941, + " never": 942, + " found": 943, + " lo": 944, + "cid": 945, + "ined": 946, + " class": 947, + " myself": 948, + "aw": 949, + " wom": 950, + "ations": 951, + " left": 952, + " We": 953, + " teachers": 954, + "\"Y": 955, + "na": 956, + "ont": 957, + " des": 958, + " those": 959, + "ired": 960, + " sen": 961, + "ying": 962, + " these": 963, + "az": 964, + " There": 965, + "cept": 966, + " dang": 967, + " U": 968, + "\"H": 969, + "bod": 970, + "body": 971, + " having": 972, + "alary": 973, + " watch": 974, + " give": 975, + "age": 976, + " its": 977, + " appe": 978, + "ue": 979, + " count": 980, + " hard": 981, + " bel": 982, + "ott": 983, + " dist": 984, + "\"S": 985, + " Mad": 986, + "-n": 987, + "ribut": 988, + "ged": 989, + " att": 990, + "fere": 991, + "ither": 992, + " upon": 993, + " tem": 994, + " person": 995, + "ning": 996, + " che": 997, + "arly": 998, + "oney": 999, + " soon": 1000, + "ement": 1001, + " (": 1002, + " trans": 1003, + " exp": 1004, + " ser": 1005, + " reg": 1006, + "ason": 1007, + " saw": 1008, + " next": 1009, + "oot": 1010, + " half": 1011, + " took": 1012, + " bad": 1013, + " hour": 1014, + " salary": 1015, + " began": 1016, + "right": 1017, + 
"onna": 1018, + "-san": 1019, + " works": 1020, + " J": 1021, + "form": 1022, + "ical": 1023, + " tra": 1024, + "man": 1025, + " nothing": 1026, + " still": 1027, + "ears": 1028, + " supp": 1029, + " turn": 1030, + " felt": 1031, + " woman": 1032, + " started": 1033, + "ouble": 1034, + "ura": 1035, + "ishing": 1036, + ":\n": 1037, + "lectron": 1038, + "lectronic": 1039, + "ook": 1040, + " copy": 1041, + " full": 1042, + "cond": 1043, + "mat": 1044, + " middle": 1045, + " look": 1046, + " comm": 1047, + "wered": 1048, + " became": 1049, + " fellows": 1050, + "would": 1051, + " got": 1052, + " gl": 1053, + " gu": 1054, + " keep": 1055, + " ge": 1056, + " Madonna": 1057, + "iter": 1058, + "ished": 1059, + " underst": 1060, + " stra": 1061, + "sid": 1062, + " country": 1063, + "ople": 1064, + " prov": 1065, + " put": 1066, + "no": 1067, + "'ll": 1068, + " sle": 1069, + "range": 1070, + " She": 1071, + "pos": 1072, + " mind": 1073, + " pass": 1074, + " through": 1075, + " quite": 1076, + " ind": 1077, + " boarding": 1078, + "teacher": 1079, + "ple": 1080, + "Porcupine": 1081, + " ple": 1082, + " geisha": 1083, + " ": 1084, + "ost": 1085, + "ense": 1086, + "No": 1087, + "ible": 1088, + " read": 1089, + " red": 1090, + "ention": 1091, + "ened": 1092, + "!\"\n": 1093, + " ref": 1094, + " ad": 1095, + " fl": 1096, + " stay": 1097, + "up": 1098, + " round": 1099, + " cle": 1100, + " open": 1101, + " ob": 1102, + "tend": 1103, + " find": 1104, + " per": 1105, + " called": 1106, + " sur": 1107, + "rew": 1108, + " paper": 1109, + " Badger": 1110, + " meet": 1111, + "iss": 1112, + "\"That": 1113, + "erms": 1114, + "TE": 1115, + "itten": 1116, + "ably": 1117, + "ness": 1118, + " cannot": 1119, + " simp": 1120, + "con": 1121, + " reason": 1122, + "you": 1123, + " home": 1124, + "by": 1125, + " fight": 1126, + "ittle": 1127, + " things": 1128, + " eas": 1129, + " imp": 1130, + "ressed": 1131, + " mean": 1132, + " appeared": 1133, + " nat": 1134, + " hel": 1135, + "ret": 1136, + "aken": 1137, + " straight": 1138, + " affair": 1139, + "iting": 1140, + " ed": 1141, + " since": 1142, + "log": 1143, + " pay": 1144, + " front": 1145, + "my": 1146, + " voice": 1147, + "ready": 1148, + " fool": 1149, + "oundation": 1150, + " electronic": 1151, + " terms": 1152, + " mar": 1153, + "apan": 1154, + "any": 1155, + " resp": 1156, + " end": 1157, + "app": 1158, + "what": 1159, + "str": 1160, + "rap": 1161, + "ial": 1162, + "icul": 1163, + " acc": 1164, + "oth": 1165, + " second": 1166, + " flo": 1167, + " six": 1168, + " feet": 1169, + "br": 1170, + "iet": 1171, + " little": 1172, + "les": 1173, + " money": 1174, + " decl": 1175, + " ey": 1176, + " comp": 1177, + "aring": 1178, + " agre": 1179, + "where": 1180, + " St": 1181, + " stre": 1182, + "ex": 1183, + "ract": 1184, + " int": 1185, + " dire": 1186, + " become": 1187, + " hon": 1188, + " consid": 1189, + "ertain": 1190, + "now": 1191, + " sl": 1192, + "itor": 1193, + "gg": 1194, + " jum": 1195, + " bu": 1196, + " thing": 1197, + " answered": 1198, + "oes": 1199, + "ya": 1200, + " That": 1201, + "ize": 1202, + "ond": 1203, + "act": 1204, + " eff": 1205, + " bang": 1206, + "about": 1207, + " bed": 1208, + "orrow": 1209, + "ung": 1210, + " To": 1211, + " kept": 1212, + " wal": 1213, + " bath": 1214, + " dra": 1215, + "\"A": 1216, + "rings": 1217, + "hopp": 1218, + " resign": 1219, + " din": 1220, + " lady": 1221, + ".E": 1222, + " use": 1223, + "lish": 1224, + "ors": 1225, + " written": 1226, + "ene": 1227, + "iv": 1228, + " dif": 1229, + " ste": 1230, + " story": 
1231, + "com": 1232, + "res": 1233, + "ently": 1234, + " fact": 1235, + "hes": 1236, + "ways": 1237, + " why": 1238, + " though": 1239, + " str": 1240, + "onder": 1241, + "head": 1242, + " cour": 1243, + " mon": 1244, + " sk": 1245, + " belie": 1246, + " let": 1247, + "fer": 1248, + " requ": 1249, + " line": 1250, + "room": 1251, + "-day": 1252, + " done": 1253, + " does": 1254, + " One": 1255, + " dango": 1256, + "asshopp": 1257, + " consider": 1258, + " dinner": 1259, + " Foundation": 1260, + "**": 1261, + "empt": 1262, + "ese": 1263, + " word": 1264, + "rest": 1265, + " enough": 1266, + " great": 1267, + " name": 1268, + " pub": 1269, + " manner": 1270, + "wer": 1271, + "ict": 1272, + "iness": 1273, + " himself": 1274, + " people": 1275, + "ew": 1276, + " cor": 1277, + "estion": 1278, + " big": 1279, + "ee": 1280, + " ri": 1281, + "ides": 1282, + " brother": 1283, + " heart": 1284, + "ected": 1285, + "eed": 1286, + " others": 1287, + "sol": 1288, + "ted": 1289, + " eyes": 1290, + " trouble": 1291, + " teach": 1292, + " boat": 1293, + " four": 1294, + " already": 1295, + "rom": 1296, + "ghed": 1297, + " squ": 1298, + " pol": 1299, + "ces": 1300, + " Hott": 1301, + " leave": 1302, + " distribut": 1303, + "aster": 1304, + "CH": 1305, + "uc": 1306, + " im": 1307, + " however": 1308, + "there": 1309, + "apanese": 1310, + " last": 1311, + " cr": 1312, + "ility": 1313, + " simple": 1314, + " life": 1315, + "-c": 1316, + " regard": 1317, + " fin": 1318, + "ual": 1319, + " means": 1320, + " stand": 1321, + "atch": 1322, + " short": 1323, + "ned": 1324, + " seen": 1325, + " happ": 1326, + "-k": 1327, + " against": 1328, + "him": 1329, + "amed": 1330, + " stood": 1331, + " gra": 1332, + " mother": 1333, + " fish": 1334, + " water": 1335, + "ail": 1336, + "cei": 1337, + " rather": 1338, + " ins": 1339, + " feel": 1340, + " also": 1341, + " ord": 1342, + " coming": 1343, + "ics": 1344, + " either": 1345, + "nce": 1346, + " '": 1347, + " kid": 1348, + " laughed": 1349, + "like": 1350, + " Ar": 1351, + "gr": 1352, + " Hotta": 1353, + " talk": 1354, + "gether": 1355, + " Sir": 1356, + " pun": 1357, + "Pro": 1358, + "ats": 1359, + "most": 1360, + " rep": 1361, + " gi": 1362, + "isf": 1363, + "bably": 1364, + "akes": 1365, + " Not": 1366, + "ny": 1367, + " appear": 1368, + "mp": 1369, + "cha": 1370, + " act": 1371, + "bed": 1372, + "ief": 1373, + "uff": 1374, + " apo": 1375, + " met": 1376, + " returned": 1377, + " sound": 1378, + "usiness": 1379, + " laugh": 1380, + " clear": 1381, + " need": 1382, + "fess": 1383, + "ested": 1384, + " inv": 1385, + " accept": 1386, + "under": 1387, + ";\n": 1388, + " surpr": 1389, + "de": 1390, + " train": 1391, + " hotel": 1392, + " sleep": 1393, + " dr": 1394, + " hold": 1395, + "lock": 1396, + "pura": 1397, + " springs": 1398, + " ......": 1399, + " agreement": 1400, + " Dar": 1401, + " rest": 1402, + "clud": 1403, + "ator": 1404, + "av": 1405, + " orig": 1406, + " origin": 1407, + " el": 1408, + " nor": 1409, + " pres": 1410, + " understand": 1411, + " taken": 1412, + " light": 1413, + "ener": 1414, + "some": 1415, + " brought": 1416, + "raph": 1417, + " most": 1418, + "oke": 1419, + "-w": 1420, + " unt": 1421, + " father": 1422, + " used": 1423, + " eat": 1424, + " years": 1425, + " While": 1426, + " chan": 1427, + " sudd": 1428, + " sudden": 1429, + " apolog": 1430, + " sett": 1431, + " thin": 1432, + " My": 1433, + " ten": 1434, + "imes": 1435, + "for": 1436, + "oud": 1437, + "When": 1438, + " det": 1439, + " live": 1440, + " oc": 1441, + " five": 1442, + " 
cont": 1443, + " help": 1444, + " wa": 1445, + " passed": 1446, + " run": 1447, + " making": 1448, + " strange": 1449, + " taking": 1450, + " each": 1451, + "\"You": 1452, + " another": 1453, + "\"Say": 1454, + "\"The": 1455, + "ates": 1456, + " pleas": 1457, + "asshoppers": 1458, + " mom": 1459, + " moment": 1460, + "entle": 1461, + "nglish": 1462, + "CHA": 1463, + " original": 1464, + "ions": 1465, + "uring": 1466, + " public": 1467, + "uct": 1468, + "uck": 1469, + " question": 1470, + "ai": 1471, + "cy": 1472, + "ek": 1473, + " floor": 1474, + " car": 1475, + "ouse": 1476, + " side": 1477, + "-ya": 1478, + " certain": 1479, + "hys": 1480, + "-d": 1481, + "igh": 1482, + "agin": 1483, + "weet": 1484, + " poor": 1485, + " decid": 1486, + "ually": 1487, + " business": 1488, + "pro": 1489, + "plain": 1490, + " stop": 1491, + "!\n": 1492, + " How": 1493, + "\"What": 1494, + "can": 1495, + " Un": 1496, + "ps": 1497, + "und": 1498, + "-night": 1499, + " meeting": 1500, + "edo": 1501, + " raise": 1502, + "Gutenberg": 1503, + " Darling": 1504, + "ume": 1505, + " English": 1506, + "TER": 1507, + "ading": 1508, + " transl": 1509, + " able": 1510, + "ssible": 1511, + " satisf": 1512, + " wanted": 1513, + " sub": 1514, + " case": 1515, + "ific": 1516, + "iterary": 1517, + " maid": 1518, + " inc": 1519, + " pos": 1520, + " position": 1521, + " pat": 1522, + "ured": 1523, + "orry": 1524, + " account": 1525, + " both": 1526, + " frie": 1527, + " friend": 1528, + "this": 1529, + " always": 1530, + " particul": 1531, + "What": 1532, + " small": 1533, + "enty": 1534, + "ushed": 1535, + " mis": 1536, + "ully": 1537, + " recei": 1538, + "You": 1539, + " yet": 1540, + " gave": 1541, + "But": 1542, + "had": 1543, + " answer": 1544, + " abs": 1545, + "ile": 1546, + "cket": 1547, + " nood": 1548, + " course": 1549, + " form": 1550, + " everything": 1551, + "ection": 1552, + "If": 1553, + "part": 1554, + " sing": 1555, + " sit": 1556, + " pur": 1557, + "ip": 1558, + " fishing": 1559, + " eh": 1560, + " par": 1561, + " together": 1562, + "He": 1563, + " whe": 1564, + " whether": 1565, + " bra": 1566, + "\"Yes": 1567, + " punish": 1568, + "Shirt": 1569, + " Yedo": 1570, + " farew": 1571, + " farewell": 1572, + " dance": 1573, + " less": 1574, + "ural": 1575, + " def": 1576, + " attempt": 1577, + "ween": 1578, + " sign": 1579, + " sy": 1580, + "ferent": 1581, + " least": 1582, + "ser": 1583, + "ob": 1584, + "nding": 1585, + " sorry": 1586, + " jumped": 1587, + " jan": 1588, + " janitor": 1589, + "ized": 1590, + " toward": 1591, + " mor": 1592, + "aving": 1593, + " bit": 1594, + "\"This": 1595, + " remark": 1596, + " fut": 1597, + " wonder": 1598, + " fun": 1599, + "Then": 1600, + " dec": 1601, + " whom": 1602, + " didn": 1603, + " rec": 1604, + "bec": 1605, + "\"If": 1606, + " knew": 1607, + "after": 1608, + " thus": 1609, + " isn": 1610, + " sight": 1611, + "med": 1612, + "[F": 1613, + "uss": 1614, + "cident": 1615, + "them": 1616, + " fif": 1617, + " draw": 1618, + " hear": 1619, + " writing": 1620, + " getting": 1621, + "sh": 1622, + "ference": 1623, + " raised": 1624, + "they": 1625, + "ax": 1626, + " fine": 1627, + "sel": 1628, + " Nobe": 1629, + " Nobeok": 1630, + " Nobeoka": 1631, + "ormal": 1632, + " eB": 1633, + "icense": 1634, + "00": 1635, + " best": 1636, + "wor": 1637, + "fic": 1638, + "terest": 1639, + " remar": 1640, + "bl": 1641, + "arted": 1642, + " dark": 1643, + " young": 1644, + "ush": 1645, + " bet": 1646, + "outh": 1647, + "house": 1648, + "aught": 1649, + " phys": 1650, + " strong": 1651, + " 
fur": 1652, + " roll": 1653, + "cove": 1654, + "chief": 1655, + "awa": 1656, + " followed": 1657, + " fond": 1658, + " future": 1659, + "ird": 1660, + "fully": 1661, + " effort": 1662, + "After": 1663, + "oward": 1664, + " really": 1665, + " among": 1666, + " around": 1667, + " compl": 1668, + " gaz": 1669, + " bow": 1670, + "ater": 1671, + " insist": 1672, + " turned": 1673, + "hel": 1674, + "rem": 1675, + " hours": 1676, + " decided": 1677, + "ys": 1678, + " month": 1679, + "-a": 1680, + " adv": 1681, + " believe": 1682, + " teaching": 1683, + " easy": 1684, + " direction": 1685, + "ooked": 1686, + " war": 1687, + " unless": 1688, + "have": 1689, + " square": 1690, + "vil": 1691, + " quiet": 1692, + " hung": 1693, + " goes": 1694, + " paid": 1695, + " shall": 1696, + "\"No": 1697, + " punishment": 1698, + "pose": 1699, + " sweet": 1700, + "'ve": 1701, + "\"Well": 1702, + " gentle": 1703, + " normal": 1704, + "agraph": 1705, + "chive": 1706, + "chan": 1707, + " includ": 1708, + "ww": 1709, + "org": 1710, + "tem": 1711, + "AR": 1712, + " TH": 1713, + " equ": 1714, + " tone": 1715, + " possible": 1716, + " becom": 1717, + " Japanese": 1718, + "vers": 1719, + " following": 1720, + " pain": 1721, + " whole": 1722, + "wr": 1723, + " serious": 1724, + " nar": 1725, + " tired": 1726, + "In": 1727, + " play": 1728, + " prom": 1729, + " game": 1730, + " Some": 1731, + " happened": 1732, + " cut": 1733, + " twenty": 1734, + " door": 1735, + " morning": 1736, + "hind": 1737, + " bre": 1738, + " inside": 1739, + "ove": 1740, + "alth": 1741, + "uk": 1742, + "arge": 1743, + "amb": 1744, + " dam": 1745, + " worry": 1746, + "ative": 1747, + " expected": 1748, + " fam": 1749, + " pra": 1750, + " pocket": 1751, + "ooks": 1752, + "ched": 1753, + " sil": 1754, + "ol": 1755, + " fav": 1756, + " else": 1757, + " high": 1758, + " real": 1759, + " along": 1760, + " med": 1761, + "hik": 1762, + "hemat": 1763, + "hematics": 1764, + " list": 1765, + " sick": 1766, + "oint": 1767, + "[Foot": 1768, + "[Footnot": 1769, + "[Footnote": 1770, + ".]\n": 1771, + "night": 1772, + "ses": 1773, + "ior": 1774, + " says": 1775, + " mouth": 1776, + "how": 1777, + "ming": 1778, + " clo": 1779, + " cur": 1780, + "ging": 1781, + " suddenly": 1782, + "-ah": 1783, + "amp": 1784, + " black": 1785, + "ross": 1786, + " fac": 1787, + "selves": 1788, + "iew": 1789, + "ission": 1790, + " copyright": 1791, + " paragraph": 1792, + " Archive": 1793, + " donations": 1794, + "Project": 1795, + " cost": 1796, + ".org": 1797, + "LI": 1798, + "uced": 1799, + " suc": 1800, + "yle": 1801, + " force": 1802, + "joy": 1803, + "ouch": 1804, + "tr": 1805, + "It": 1806, + " trad": 1807, + " present": 1808, + " ext": 1809, + "ased": 1810, + "redit": 1811, + " fault": 1812, + "ib": 1813, + "-m": 1814, + "urd": 1815, + " tried": 1816, + "time": 1817, + " pret": 1818, + " spee": 1819, + "ower": 1820, + " words": 1821, + "CHAP": 1822, + "CHAPTER": 1823, + "school": 1824, + " ask": 1825, + " doing": 1826, + "ately": 1827, + " until": 1828, + "bout": 1829, + " tree": 1830, + "call": 1831, + "amash": 1832, + "amashir": 1833, + "amashiro": 1834, + "ste": 1835, + " behind": 1836, + "old": 1837, + " wall": 1838, + "itory": 1839, + " rolled": 1840, + " move": 1841, + " apologize": 1842, + " large": 1843, + "amboo": 1844, + "su": 1845, + " settled": 1846, + "\"He": 1847, + "wo": 1848, + " thinking": 1849, + "used": 1850, + "ified": 1851, + " almost": 1852, + " tre": 1853, + " treat": 1854, + " noodle": 1855, + " note": 1856, + " All": 1857, + " beat": 1858, + " 
object": 1859, + " seems": 1860, + " ide": 1861, + "Yes": 1862, + "ows": 1863, + " remain": 1864, + " begin": 1865, + "ught": 1866, + "ments": 1867, + " alone": 1868, + "spect": 1869, + " mathematics": 1870, + " rough": 1871, + " outside": 1872, + " comes": 1873, + "back": 1874, + " wind": 1875, + "sed": 1876, + " wouldn": 1877, + "eer": 1878, + "inut": 1879, + "from": 1880, + " repl": 1881, + " narrow": 1882, + " incident": 1883, + " air": 1884, + " sea": 1885, + "ts": 1886, + " surprised": 1887, + " tea": 1888, + "Red": 1889, + " talking": 1890, + " boss": 1891, + "que": 1892, + " pict": 1893, + "irty": 1894, + " ce": 1895, + " lim": 1896, + " Why": 1897, + " point": 1898, + " law": 1899, + "ciated": 1900, + " moon": 1901, + "ircu": 1902, + "got": 1903, + " Is": 1904, + " hands": 1905, + " honor": 1906, + "aut": 1907, + "rge": 1908, + " state": 1909, + " Literary": 1910, + ".F": 1911, + "This": 1912, + "line": 1913, + ".g": 1914, + ".gutenberg": 1915, + " OF": 1916, + "EN": 1917, + "racter": 1918, + " bene": 1919, + " Even": 1920, + "oub": 1921, + " makes": 1922, + " interest": 1923, + "ope": 1924, + "ms": 1925, + " respons": 1926, + " fore": 1927, + " somewhat": 1928, + " honest": 1929, + "ock": 1930, + "irit": 1931, + " held": 1932, + " added": 1933, + "fu": 1934, + "aded": 1935, + "als": 1936, + "att": 1937, + "tern": 1938, + " personal": 1939, + " ass": 1940, + " With": 1941, + "tic": 1942, + "Tokyo": 1943, + " shout": 1944, + " pretty": 1945, + "umb": 1946, + " early": 1947, + "opped": 1948, + " further": 1949, + " fre": 1950, + "esides": 1951, + " bamboo": 1952, + " ir": 1953, + "more": 1954, + " living": 1955, + " received": 1956, + " lived": 1957, + " meant": 1958, + " coward": 1959, + "position": 1960, + " loc": 1961, + "iled": 1962, + " tender": 1963, + " ch": 1964, + " After": 1965, + "cer": 1966, + " favor": 1967, + "who": 1968, + " liked": 1969, + "rance": 1970, + " pri": 1971, + "kisha": 1972, + " study": 1973, + " order": 1974, + " afterward": 1975, + " greatly": 1976, + " unable": 1977, + "go": 1978, + " wait": 1979, + "eping": 1980, + "iding": 1981, + " forty": 1982, + " sky": 1983, + " office": 1984, + "will": 1985, + "\"D": 1986, + "wel": 1987, + " station": 1988, + "bo": 1989, + "hot": 1990, + "such": 1991, + " loud": 1992, + " aw": 1993, + "land": 1994, + "?\n": 1995, + " respect": 1996, + "ances": 1997 + }, + "merges": [ + ] + } +} diff --git a/tests/assets/tokenizer/tokenizer_config.json b/tests/assets/tokenizer/tokenizer_config.json new file mode 100644 index 000000000..da6379b3f --- /dev/null +++ b/tests/assets/tokenizer/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/tests/unit_tests/test_dataset_checkpointing.py b/tests/unit_tests/test_dataset_checkpointing.py index 00998fc49..36dcd8f86 100644 --- a/tests/unit_tests/test_dataset_checkpointing.py +++ b/tests/unit_tests/test_dataset_checkpointing.py @@ -8,9 +8,9 @@ import torch from datasets import load_dataset +from 
torchtitan.components.tokenizer import HuggingFaceTokenizer from torchtitan.config_manager import ConfigManager from torchtitan.datasets.hf_datasets import build_hf_dataloader, DatasetConfig, DATASETS -from torchtitan.datasets.tokenizer.tiktoken import TikTokenizer class TestDatasetCheckpointing(unittest.TestCase): @@ -58,7 +58,7 @@ def test_c4_resumption(self): assert torch.equal(labels, expected_labels) def _build_dataloader(self, dataset_name, batch_size, seq_len, world_size, rank): - tokenizer = TikTokenizer("./tests/assets/test_tiktoken.model") + tokenizer = HuggingFaceTokenizer("./tests/assets/tokenizer") config_manager = ConfigManager() config = config_manager.parse_args( [ diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py index 8efd48167..72fa28a46 100644 --- a/tests/unit_tests/test_tokenizer.py +++ b/tests/unit_tests/test_tokenizer.py @@ -19,7 +19,7 @@ parametrize, ) -from torchtitan.components.tokenizer import build_hf_tokenizer +from torchtitan.components.tokenizer import HuggingFaceTokenizer class TestTokenizerIntegration(unittest.TestCase): @@ -278,7 +278,7 @@ def test_download_and_build_tokenizer(self, test_repo_id): model_name = test_repo_id.split("/")[-1] tokenizer_dir = "tokenizer" if model_name == "FLUX.1-dev" else "." tokenizer_path = os.path.join(self.temp_dir, model_name, tokenizer_dir) - our_tokenizer = build_hf_tokenizer(tokenizer_path) + our_tokenizer = HuggingFaceTokenizer(tokenizer_path) # Step 3: Load tokenizer using official Tokenizer library (if available) official_tokenizer = None @@ -308,101 +308,6 @@ def test_download_and_build_tokenizer(self, test_repo_id): our_tokenizer, transformers_tokenizer, test_repo_id ) - def test_backward_comptability(self): - from torchtitan.datasets.tokenizer.tiktoken import TikTokenizer - - # The existing tokenizer lives under assets/original/tokenizer.model - # This test ensures that the new tokenizer can load the old tokenizer - # and produce the same results - - # Get the base project directory (two levels up from test file) - base_project_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - old_tokenizer_path = os.path.join( - base_project_dir, "assets", "tokenizer", "original", "tokenizer.model" - ) - - # Skip test if the old tokenizer path cannot be found - if not os.path.exists(old_tokenizer_path): - self.skipTest(f"Old tokenizer file not found at {old_tokenizer_path}") - - print(old_tokenizer_path) - old_tokenizer = TikTokenizer(old_tokenizer_path) - - # Download and load a new tokenizer for comparison (using Meta-Llama-3.1-8B) - test_repo_id = "meta-llama/Meta-Llama-3.1-8B" - try: - download_hf_tokenizer_files( - repo_id=test_repo_id, - local_dir=self.temp_dir, - ) - - # Load the new tokenizer - model_name = test_repo_id.split("/")[-1] - new_tokenizer_path = os.path.join(self.temp_dir, model_name) - new_tokenizer = build_hf_tokenizer(new_tokenizer_path) - - # Compare encoding and decoding functionality only (TikTokenizer doesn't support vocab operations) - test_texts = [ - "Hello world!", - "This is a test.", - "The quick brown fox jumps over the lazy dog.", - "Special characters: @#$%^&*()", - "Numbers: 123456789", - "Mixed: Hello123 World!@#", - "", # Empty string - " ", # Single space - " ", # Multiple spaces - ] - - for text in test_texts: - # Encode with both tokenizers - # TikTokenizer requires bos and eos parameters - old_tokens = old_tokenizer.encode(text, bos=True, eos=False) - # HuggingFaceTokenizer has optional add_bos and add_eos parameters - new_tokens = 
new_tokenizer.encode(text) - - self.assertEqual( - old_tokens, - new_tokens, - f"Encoded tokens should match for text '{text}' in backward compatibility test", - ) - - # Test decoding - old_decoded = old_tokenizer.decode(old_tokens) - new_decoded = new_tokenizer.decode( - new_tokens, skip_special_tokens=False - ) - - self.assertEqual( - old_decoded, - new_decoded, - f"Decoded text should match for '{text}' in backward compatibility test", - ) - - # Test edge cases - edge_cases = [ - "🚀🌟✨", # Emojis - "café naïve résumé", # Accented characters - "こんにちは世界", # Non-Latin scripts (Japanese) - "Здравствуй мир", # Cyrillic - "\n\t\r", # Whitespace characters - "a" - * 100, # Long repeated character (reduced from 1000 to avoid tiktoken limits) - ] - - for text in edge_cases: - old_tokens = old_tokenizer.encode(text, bos=True, eos=False) - new_tokens = new_tokenizer.encode(text) - - self.assertEqual( - old_tokens, - new_tokens, - f"Edge case tokens should match for text '{text[:50]}...' in backward compatibility test", - ) - - except HTTPError as e: - self.skipTest(f"Could not download new tokenizer for comparison: {e}") - instantiate_parametrized_tests(TestTokenizerIntegration) diff --git a/tests/unit_tests/test_train_spec.py b/tests/unit_tests/test_train_spec.py index 15780d10a..c364af385 100644 --- a/tests/unit_tests/test_train_spec.py +++ b/tests/unit_tests/test_train_spec.py @@ -12,9 +12,9 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers, OptimizersContainer +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.config_manager import JobConfig from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.models.llama3 import parallelize_llama, pipeline_llama from torchtitan.protocols.train_spec import ( apply_to_train_specs, @@ -67,7 +67,7 @@ def test_register_train_spec(self): build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) register_train_spec(spec) @@ -88,7 +88,7 @@ def test_optim_hook(self): build_optimizers_fn=fake_build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) register_train_spec(spec) diff --git a/torchtitan/components/tokenizer.py b/torchtitan/components/tokenizer.py index def7594ae..45ecf34f9 100644 --- a/torchtitan/components/tokenizer.py +++ b/torchtitan/components/tokenizer.py @@ -6,18 +6,22 @@ import json + +import logging import os from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, Union -from tokenizers import AddedToken, Tokenizer as HfTokenizer +from tokenizers import AddedToken, Tokenizer +from torchtitan.config_manager import JobConfig from typing_extensions import override +logger = logging.getLogger(__name__) + -class Tokenizer(ABC): - # basic tokenizer interface, for typing purpose mainly +class BaseTokenizer(ABC): + # base tokenizer interface, for typing purpose mainly def __init__(self): - self._n_words = 8 self.eos_id = 0 @abstractmethod @@ -28,12 +32,12 @@ def 
encode(self, *args, **kwargs) -> list[int]: def decode(self, *args, **kwargs) -> str: ... - @property - def n_words(self) -> int: - return self._n_words + @abstractmethod + def get_vocab_size(self) -> int: + ... -class HuggingFaceTokenizer(Tokenizer): +class HuggingFaceTokenizer(BaseTokenizer): """ A tokenizer wrapper that handles BOS/EOS token inference and encoding. @@ -49,6 +53,7 @@ def __init__( self, tokenizer_path: str, ): + super().__init__() self.tokenizer_path = tokenizer_path # Initialize BOS/EOS token attributes (frequently used) @@ -76,7 +81,7 @@ def _load_config(self, config_path: str) -> Optional[dict]: return json.load(f) return None - def _load_tokenizer_from_path(self, tokenizer_path: str) -> HfTokenizer: + def _load_tokenizer_from_path(self, tokenizer_path: str) -> Tokenizer: """Load tokenizer from various file formats.""" if not os.path.exists(tokenizer_path): raise FileNotFoundError(f"Tokenizer path '{tokenizer_path}' does not exist") @@ -87,87 +92,79 @@ def _load_tokenizer_from_path(self, tokenizer_path: str) -> HfTokenizer: vocab_json_path = os.path.join(tokenizer_path, "vocab.json") merges_txt_path = os.path.join(tokenizer_path, "merges.txt") - try: - # Strategy 1: Load from tokenizer.json (preferred for modern tokenizers) - if os.path.exists(tokenizer_json_path): - print("Loading tokenizer from tokenizer.json") - return HfTokenizer.from_file(tokenizer_json_path) - # Strategy 2: Load from vocab files (with or without merges.txt) - elif os.path.exists(vocab_json_path) or os.path.exists(vocab_txt_path): - # Load vocabulary - if os.path.exists(vocab_json_path): - print("Loading vocabulary from vocab.json") - with open(vocab_json_path, "r") as f: - vocab = json.load(f) - vocab_source = "vocab.json" - else: - print("Loading vocabulary from vocab.txt") - vocab = {} - with open(vocab_txt_path, "r") as f: - for i, line in enumerate(f): - token = line.strip() - if token: - vocab[token] = i - vocab_source = "vocab.txt" - - # Strategy 2a: Use BPE if merges.txt exists - if os.path.exists(merges_txt_path): - print(f"Loading BPE tokenizer from {vocab_source} + merges.txt") - from tokenizers import decoders, pre_tokenizers, processors - from tokenizers.models import BPE - - # Load merges from file and convert to tuples - merges = [] - with open(merges_txt_path, "r") as f: - for line in f: - line = line.strip() - if line and not line.startswith( - "#" - ): # Skip comments and empty lines - parts = line.split() - if len(parts) >= 2: - merges.append((parts[0], parts[1])) - - # Create BPE model - bpe_model = BPE(vocab=vocab, merges=merges) - tokenizer = HfTokenizer(bpe_model) - - # Configure GPT-2 style components for proper space handling - tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( - add_prefix_space=False - ) - tokenizer.decoder = decoders.ByteLevel() - tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) - - return tokenizer - - # Strategy 2b: Use WordLevel if no merges.txt - else: - print(f"Loading WordLevel tokenizer from {vocab_source}") - from tokenizers.models import WordLevel - - word_level_model = WordLevel(vocab=vocab, unk_token="[UNK]") - return HfTokenizer(word_level_model) - + # Strategy 1: Load from tokenizer.json (preferred for modern tokenizers) + if os.path.exists(tokenizer_json_path): + logger.info("Loading tokenizer from tokenizer.json") + return Tokenizer.from_file(tokenizer_json_path) + # Strategy 2: Load from vocab files (with or without merges.txt) + elif os.path.exists(vocab_json_path) or os.path.exists(vocab_txt_path): + # Load 
vocabulary + if os.path.exists(vocab_json_path): + logger.info("Loading vocabulary from vocab.json") + with open(vocab_json_path, "r") as f: + vocab = json.load(f) + vocab_source = "vocab.json" else: - # List available files for debugging - available_files = [ - f - for f in os.listdir(tokenizer_path) - if os.path.isfile(os.path.join(tokenizer_path, f)) - ] - raise FileNotFoundError( - f"No supported tokenizer files found in '{tokenizer_path}'. " - f"Available files: {available_files}. " - "Looking for: tokenizer.json, tokenizer.model, vocab.txt+merges.txt, or vocab.json+merges.txt" + logger.info("Loading vocabulary from vocab.txt") + vocab = {} + with open(vocab_txt_path, "r") as f: + for i, line in enumerate(f): + token = line.strip() + if token: + vocab[token] = i + vocab_source = "vocab.txt" + + # Strategy 2a: Use BPE if merges.txt exists + if os.path.exists(merges_txt_path): + logger.info(f"Loading BPE tokenizer from {vocab_source} + merges.txt") + from tokenizers import decoders, pre_tokenizers, processors + from tokenizers.models import BPE + + # Load merges from file and convert to tuples + merges = [] + with open(merges_txt_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith( + "#" + ): # Skip comments and empty lines + parts = line.split() + if len(parts) >= 2: + merges.append((parts[0], parts[1])) + + # Create BPE model + bpe_model = BPE(vocab=vocab, merges=merges) + tokenizer = Tokenizer(bpe_model) + + # Configure GPT-2 style components for proper space handling + tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( + add_prefix_space=False ) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) - except Exception as e: - if isinstance(e, FileNotFoundError): - raise e - raise Exception( - f"Failed to load tokenizer from '{tokenizer_path}': {e}" - ) from e + return tokenizer + + # Strategy 2b: Use WordLevel if no merges.txt + else: + logger.info(f"Loading WordLevel tokenizer from {vocab_source}") + from tokenizers.models import WordLevel + + word_level_model = WordLevel(vocab=vocab, unk_token="[UNK]") + return Tokenizer(word_level_model) + + else: + # List available files for debugging + available_files = [ + f + for f in os.listdir(tokenizer_path) + if os.path.isfile(os.path.join(tokenizer_path, f)) + ] + raise FileNotFoundError( + f"No supported tokenizer files found in '{tokenizer_path}'. " + f"Available files: {available_files}. " + "Looking for: tokenizer.json, tokenizer.model, vocab.txt+merges.txt, or vocab.json+merges.txt" + ) def _get_token_from_config(self, config: dict[str, Any], key: str) -> Optional[str]: """ @@ -387,11 +384,11 @@ def decode(self, *args, **kwargs) -> str: @property def vocab_size(self) -> int: """Get the vocabulary size.""" - return len(self.tokenizer.get_vocab()) + return self.tokenizer.get_vocab_size() def get_vocab_size(self) -> int: """Get the vocabulary size.""" - return len(self.tokenizer.get_vocab()) + return self.tokenizer.get_vocab_size() def get_vocab(self) -> dict[str, int]: """Get the vocabulary as a dictionary.""" @@ -406,7 +403,9 @@ def id_to_token(self, token_id: int) -> Optional[str]: return self.tokenizer.id_to_token(token_id) -def build_hf_tokenizer(tokenizer_path: str) -> HuggingFaceTokenizer: +def build_hf_tokenizer( + job_config: JobConfig, +) -> Union[HuggingFaceTokenizer, BaseTokenizer]: """ Builds a HuggingFaceTokenizer from the specified path. 
@@ -415,11 +414,10 @@ def build_hf_tokenizer(tokenizer_path: str) -> HuggingFaceTokenizer: from various file formats and infers special token behavior. Args: - tokenizer_path (str): Path to the directory containing tokenizer files. - Should contain one or more of the supported file types. + job_config (JobConfig): A JobConfig object containing the path to the tokenizer directory. Returns: tokenizer (HuggingFaceTokenizer): Loaded tokenizer instance with intelligent BOS/EOS handling """ - tokenizer = HuggingFaceTokenizer(tokenizer_path) + tokenizer = HuggingFaceTokenizer(job_config.model.tokenizer_path) return tokenizer diff --git a/torchtitan/components/validate.py b/torchtitan/components/validate.py index 77d89c454..904c65ca5 100644 --- a/torchtitan/components/validate.py +++ b/torchtitan/components/validate.py @@ -11,7 +11,7 @@ from torch.distributed.fsdp import FSDPModule from torchtitan.components.dataloader import BaseDataLoader from torchtitan.components.loss import LossFunction -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.datasets.hf_datasets import build_hf_validation_dataloader from torchtitan.distributed import ParallelDims, utils as dist_utils @@ -48,7 +48,7 @@ def __init__( self, job_config: JobConfig, dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, parallel_dims: ParallelDims, world_mesh: torch.distributed.DeviceMesh, loss_fn: LossFunction, @@ -142,7 +142,7 @@ def build_validator( job_config: JobConfig, dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, parallel_dims: ParallelDims, world_mesh: torch.distributed.DeviceMesh, loss_fn: LossFunction, diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py index d5e60bf23..8a2ebe434 100644 --- a/torchtitan/config_manager.py +++ b/torchtitan/config_manager.py @@ -884,7 +884,15 @@ def _validate_config(self) -> None: self.config.model.tokenizer_path = old_tokenizer_path logger.warning( f"Temporarily switching to previous default tokenizer path {old_tokenizer_path}. " - "Please update your config." + "Please download the new tokenizer model (python scripts/download_tokenizer.py) and update your config."
+ ) + else: + # If the config still points to the old tokenizer.model, alert users to redownload the tokenizer + if self.config.model.tokenizer_path.endswith("tokenizer.model"): + raise Exception( + "You are using the old tokenizer.model. Please redownload the tokenizer " + "(python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B) " + "and update your config to the directory of the downloaded tokenizer." ) @staticmethod diff --git a/torchtitan/datasets/hf_datasets.py b/torchtitan/datasets/hf_datasets.py index 9f692d81d..dbef80a6e 100644 --- a/torchtitan/datasets/hf_datasets.py +++ b/torchtitan/datasets/hf_datasets.py @@ -17,7 +17,7 @@ from torch.utils.data import IterableDataset from torchtitan.components.dataloader import ParallelAwareDataloader -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.tools.logging import logger @@ -80,7 +80,7 @@ def __init__( self, dataset_name: str, dataset_path: str | None, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, seq_len: int = 2048, dp_rank: int = 0, dp_world_size: int = 1, @@ -123,7 +123,9 @@ def __iter__(self): for sample in self._get_data_iter(): # Use the dataset-specific text processor sample_text = self._text_processor(sample) - sample_tokens = self._tokenizer.encode(sample_text, bos=True, eos=True) + sample_tokens = self._tokenizer.encode( + sample_text, add_bos=True, add_eos=True + ) self._token_buffer.extend(sample_tokens) self._sample_idx += 1 @@ -174,7 +176,7 @@ def state_dict(self): def build_hf_dataloader( dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, job_config: JobConfig, infinite: bool = True, ) -> ParallelAwareDataloader: @@ -205,7 +207,7 @@ def build_hf_validation_dataloader( dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, job_config: JobConfig, ) -> ParallelAwareDataloader: """Build a validation data loader for HuggingFace datasets.""" diff --git a/torchtitan/datasets/tokenizer/tiktoken.py b/torchtitan/datasets/tokenizer/tiktoken.py deleted file mode 100644 index 401757a93..000000000 --- a/torchtitan/datasets/tokenizer/tiktoken.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. - -import os -from collections.abc import Collection, Iterator, Sequence, Set as AbstractSet -from pathlib import Path -from typing import cast, Literal - -import tiktoken -from tiktoken.load import load_tiktoken_bpe - -from torchtitan.components.tokenizer import Tokenizer -from torchtitan.config_manager import JobConfig -from torchtitan.tools.logging import logger - - -class TikTokenizer(Tokenizer): - """ - Tokenizing and encoding/decoding text using the Tiktoken tokenizer. - - Args: - model_path (str): The path to the Tiktoken model file.
- """ - - special_tokens: dict[str, int] - - num_reserved_special_tokens = 256 - - pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501, B950 - - def __init__(self, model_path: str): - super().__init__() - assert os.path.exists( - model_path - ), f"The tokenizer path does not exist: {model_path}" - assert os.path.isfile(model_path), model_path - - mergeable_ranks = load_tiktoken_bpe(model_path) - num_base_tokens = len(mergeable_ranks) - special_tokens = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] + [ - f"<|reserved_special_token_{i}|>" - for i in range(5, self.num_reserved_special_tokens - 5) - ] - self.special_tokens = { - token: num_base_tokens + i for i, token in enumerate(special_tokens) - } - self.model = tiktoken.Encoding( - name=Path(model_path).name, - pat_str=self.pat_str, - mergeable_ranks=mergeable_ranks, - special_tokens=self.special_tokens, - ) - - self._n_words: int = self.model.n_vocab - # BOS / EOS token IDs - self.bos_id: int = self.special_tokens["<|begin_of_text|>"] - self.eos_id: int = self.special_tokens["<|end_of_text|>"] - self.pad_id: int = -1 - self.stop_tokens = { - self.special_tokens["<|end_of_text|>"], - self.special_tokens["<|eot_id|>"], - } - logger.info( - f"TikTokenizer built: #words {self.n_words}, BOS ID {self.bos_id}, EOS ID {self.eos_id}" - ) - - def encode( - self, - s: str, - *, - bos: bool, - eos: bool, - allowed_special: Literal["all"] | AbstractSet[str] | None = None, - disallowed_special: Literal["all"] | Collection[str] | None = None, - ) -> list[int]: - """ - Encodes a string into a list of token IDs. - - Args: - s (str): The input string to be encoded. - bos (bool): Whether to prepend the beginning-of-sequence token. - eos (bool): Whether to append the end-of-sequence token. - allowed_tokens ("all"|set[str]): allowed special tokens in string - disallowed_tokens ("all"|set[str]): special tokens that raise an error when in string - - Returns: - list[int]: A list of token IDs. - - By default, setting disallowed_special=() encodes a string by ignoring - special tokens. Specifically: - - Setting `disallowed_special` to () will cause all text corresponding - to special tokens to be encoded as natural text (insteading of raising - an error). - - Setting `allowed_special` to "all" will treat all text corresponding - to special tokens to be encoded as special tokens. - """ - assert type(s) is str - allowed_special = allowed_special or set() - disallowed_special = disallowed_special or () - - # The tiktoken tokenizer can handle <=400k chars without - # pyo3_runtime.PanicException. - TIKTOKEN_MAX_ENCODE_CHARS = 400_000 - - # https://github.com/openai/tiktoken/issues/195 - # Here we iterate over subsequences and split if we exceed the limit - # of max consecutive non-whitespace or whitespace characters. 
- MAX_NO_WHITESPACES_CHARS = 25_000 - - substrs = ( - substr - for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) - for substr in self._split_whitespaces_or_nonwhitespaces( - s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS - ) - ) - t: list[int] = [] - for substr in substrs: - t.extend( - self.model.encode( - substr, - allowed_special=allowed_special, - disallowed_special=disallowed_special, - ) - ) - if bos: - t.insert(0, self.bos_id) - if eos: - t.append(self.eos_id) - return t - - def decode(self, t: Sequence[int]) -> str: - """ - Decodes a list of token IDs into a string. - - Args: - t (List[int]): The list of token IDs to be decoded. - - Returns: - str: The decoded string. - """ - # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence. - return self.model.decode(cast(list[int], t)) - - @staticmethod - def _split_whitespaces_or_nonwhitespaces( - s: str, max_consecutive_slice_len: int - ) -> Iterator[str]: - """ - Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len` - consecutive whitespaces or consecutive non-whitespaces. - """ - current_slice_len = 0 - current_slice_is_space = s[0].isspace() if len(s) > 0 else False - slice_start = 0 - - for i in range(len(s)): - is_now_space = s[i].isspace() - - if current_slice_is_space ^ is_now_space: - current_slice_len = 1 - current_slice_is_space = is_now_space - else: - current_slice_len += 1 - if current_slice_len > max_consecutive_slice_len: - yield s[slice_start:i] - slice_start = i - current_slice_len = 1 - yield s[slice_start:] - - -def build_tiktoken_tokenizer(job_config: JobConfig) -> TikTokenizer: - return TikTokenizer(job_config.model.tokenizer_path) diff --git a/torchtitan/experiments/deepseek_v3/model_args.py b/torchtitan/experiments/deepseek_v3/model_args.py index 21e2dbd95..b7fd7f1a7 100644 --- a/torchtitan/experiments/deepseek_v3/model_args.py +++ b/torchtitan/experiments/deepseek_v3/model_args.py @@ -9,7 +9,7 @@ from torch import nn -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs @@ -59,8 +59,10 @@ class TransformerModelArgs(BaseModelArgs): use_grouped_mm: bool = True # grouped mm or for-loop for the experts computation load_balance_coeff: float | None = 1e-3 - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: - self.vocab_size = tokenizer.n_words + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: + self.vocab_size = tokenizer.get_vocab_size() self.max_seq_len = job_config.training.seq_len self.eos_id = tokenizer.eos_id diff --git a/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml b/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml index 1402e57e1..6b8390178 100644 --- a/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml +++ b/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml @@ -22,7 +22,7 @@ enable_wandb = false name = "deepseek_v2" flavor = "deepseek-ai/DeepSeek-V2-Lite" # test tokenizer.model, for debug purpose only -tokenizer_path = "./tests/assets/test_tiktoken.model" +tokenizer_path = "./tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/flux/dataset/flux_dataset.py b/torchtitan/experiments/flux/dataset/flux_dataset.py index fcdac0b9d..83aa7ae06 100644 --- 
a/torchtitan/experiments/flux/dataset/flux_dataset.py +++ b/torchtitan/experiments/flux/dataset/flux_dataset.py @@ -20,7 +20,7 @@ from torch.utils.data import IterableDataset from torchtitan.components.dataloader import ParallelAwareDataloader -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.experiments.flux.dataset.tokenizer import ( build_flux_tokenizer, @@ -161,8 +161,8 @@ def __init__( self, dataset_name: str, dataset_path: Optional[str], - t5_tokenizer: Tokenizer, - clip_tokenizer: Tokenizer, + t5_tokenizer: BaseTokenizer, + clip_tokenizer: BaseTokenizer, job_config: Optional[JobConfig] = None, dp_rank: int = 0, dp_world_size: int = 1, diff --git a/torchtitan/experiments/flux/dataset/tokenizer.py b/torchtitan/experiments/flux/dataset/tokenizer.py index 3903c8a17..3d69b0ac5 100644 --- a/torchtitan/experiments/flux/dataset/tokenizer.py +++ b/torchtitan/experiments/flux/dataset/tokenizer.py @@ -11,20 +11,19 @@ from typing import List import torch -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer, HuggingFaceTokenizer from torchtitan.config_manager import JobConfig -from torchtitan.datasets.tokenizer.tiktoken import TikTokenizer from transformers import CLIPTokenizer, T5Tokenizer -class FluxTestTokenizer(Tokenizer): +class FluxTestTokenizer(BaseTokenizer): """ Flux Tokenizer for test purpose. This is a simple wrapper around the TikTokenizer, to make it has same interface as the T5 and CLIP tokenizer used for Flux. """ def __init__(self, model_path: str = "t5-small", max_length: int = 77, **hf_kwargs): - self.tiktokenizer = TikTokenizer(model_path, **hf_kwargs) + self.tiktokenizer = HuggingFaceTokenizer(model_path, **hf_kwargs) self._max_length = max_length self.pad_id = 0 @@ -43,11 +42,14 @@ def _pad_and_chunk_tokens( return tokens + def get_vocab_size(self) -> int: + return self.tiktokenizer.vocab_size + def encode(self, text: str) -> torch.Tensor: """ Use TikTokenizer to encode the text into tokens, and then pad and chunk the tokens to max_length. """ - tokens = self.tiktokenizer.encode(text, bos=True, eos=True) + tokens = self.tiktokenizer.encode(text, add_bos=True, add_eos=True) tokens = self._pad_and_chunk_tokens(tokens, self._max_length, self.pad_id) return torch.tensor(tokens) @@ -58,7 +60,7 @@ def decode(self, t: List[int]) -> str: return self.tiktokenizer.decode(t) -class FluxTokenizer(Tokenizer): +class FluxTokenizer(BaseTokenizer): """ Tokenizing and encoding/decoding text using the T5 or Clip tokenizer. @@ -83,6 +85,9 @@ def __init__(self, model_path: str = "t5-small", max_length: int = 77, **hf_kwar model_path, max_length=max_length, **hf_kwargs ) + def get_vocab_size(self) -> int: + return self._tokenizer.vocab_size + def encode( self, s: str, @@ -108,7 +113,7 @@ def decode(self, t: List[int]) -> str: return self._tokenizer.decode(t) -def build_flux_tokenizer(job_config: JobConfig) -> tuple[Tokenizer, Tokenizer]: +def build_flux_tokenizer(job_config: JobConfig) -> tuple[BaseTokenizer, BaseTokenizer]: """ Build the tokenizer for Flux. 
""" diff --git a/torchtitan/experiments/flux/sampling.py b/torchtitan/experiments/flux/sampling.py index 382832a0c..f9f1b9086 100644 --- a/torchtitan/experiments/flux/sampling.py +++ b/torchtitan/experiments/flux/sampling.py @@ -14,7 +14,7 @@ from torch import Tensor -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.tools.logging import logger @@ -78,8 +78,8 @@ def generate_image( model: FluxModel, prompt: str, autoencoder: AutoEncoder, - t5_tokenizer: Tokenizer, - clip_tokenizer: Tokenizer, + t5_tokenizer: BaseTokenizer, + clip_tokenizer: BaseTokenizer, t5_encoder: FluxEmbedder, clip_encoder: FluxEmbedder, ) -> torch.Tensor: diff --git a/torchtitan/experiments/flux/tests/integration_tests.py b/torchtitan/experiments/flux/tests/integration_tests.py index 4bb588a0a..9ba7ee378 100755 --- a/torchtitan/experiments/flux/tests/integration_tests.py +++ b/torchtitan/experiments/flux/tests/integration_tests.py @@ -106,7 +106,7 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str): t5_encoder_version_arg = ( "--encoder.t5_encoder torchtitan/experiments/flux/tests/assets/t5-v1_1-xxl/" ) - tokenzier_path_arg = "--model.tokenizer_path tests/assets/test_tiktoken.model" + tokenzier_path_arg = "--model.tokenizer_path tests/assets/tokenizer" all_ranks = ",".join(map(str, range(test_flavor.ngpu))) diff --git a/torchtitan/experiments/llama4/README.md b/torchtitan/experiments/llama4/README.md index 4b42f7c3f..23b75b859 100644 --- a/torchtitan/experiments/llama4/README.md +++ b/torchtitan/experiments/llama4/README.md @@ -12,7 +12,7 @@ https://github.com/pytorch/torchtitan/issues/1118 #### Download Llama 4 tokenizer ```bash # Llama 4 tokenizer.model -python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E --tokenizer_path "" --hf_token=... +python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E --hf_token=... 
``` #### To be added diff --git a/torchtitan/experiments/llama4/__init__.py b/torchtitan/experiments/llama4/__init__.py index 329c4e9d7..9f7affc09 100644 --- a/torchtitan/experiments/llama4/__init__.py +++ b/torchtitan/experiments/llama4/__init__.py @@ -6,8 +6,8 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.models.llama3 import pipeline_llama from torchtitan.protocols.train_spec import register_train_spec, TrainSpec @@ -101,7 +101,7 @@ build_optimizers_fn=build_llama4_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) ) diff --git a/torchtitan/experiments/llama4/model/args.py b/torchtitan/experiments/llama4/model/args.py index 96168fcaf..a7f99e732 100644 --- a/torchtitan/experiments/llama4/model/args.py +++ b/torchtitan/experiments/llama4/model/args.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from torch import nn -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs @@ -58,8 +58,10 @@ class TransformerModelArgs(BaseModelArgs): use_grouped_mm: bool = True # grouped mm or for-loop for the experts computation load_balance_coeff: float | None = 1e-3 - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: - self.vocab_size = tokenizer.n_words + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: + self.vocab_size = tokenizer.get_vocab_size() self.max_seq_len = job_config.training.seq_len self.eos_id = tokenizer.eos_id diff --git a/torchtitan/experiments/llama4/train_configs/debug_model.toml b/torchtitan/experiments/llama4/train_configs/debug_model.toml index 7fbe95e19..d72406d8c 100644 --- a/torchtitan/experiments/llama4/train_configs/debug_model.toml +++ b/torchtitan/experiments/llama4/train_configs/debug_model.toml @@ -22,7 +22,7 @@ enable_wandb = false name = "llama4" flavor = "debugmodel" # test tokenizer.model, for debug purpose only -tokenizer_path = "./tests/assets/test_tiktoken.model" +tokenizer_path = "./tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml b/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml index 4b9fc4d4f..707fea92e 100644 --- a/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml +++ b/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml @@ -17,7 +17,7 @@ save_tb_folder = "tb" [model] name = "llama4" flavor = "17bx128e" -tokenizer_path = "./assets/tokenizer/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-4-Scout-17B-16E" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml b/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml index 0f9402456..b4b14358c 100644 --- a/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml +++ b/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml @@ -17,7 +17,7 @@ save_tb_folder = "tb" 
[model] name = "llama4" flavor = "17bx16e" -tokenizer_path = "./assets/tokenizer/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-4-Scout-17B-16E" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/multimodal/__init__.py b/torchtitan/experiments/multimodal/__init__.py index fe08681bb..f3ba2a2d4 100644 --- a/torchtitan/experiments/multimodal/__init__.py +++ b/torchtitan/experiments/multimodal/__init__.py @@ -9,7 +9,7 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.models.llama3 import parallelize_llama, pipeline_llama from torchtitan.protocols.train_spec import register_train_spec, TrainSpec @@ -31,7 +31,7 @@ build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_mm_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) ) diff --git a/torchtitan/experiments/multimodal/mm_dataset.py b/torchtitan/experiments/multimodal/mm_dataset.py index 519272c74..5daf1d0ea 100644 --- a/torchtitan/experiments/multimodal/mm_dataset.py +++ b/torchtitan/experiments/multimodal/mm_dataset.py @@ -13,7 +13,7 @@ from datasets.distributed import split_dataset_by_node from mm_collator import MultiModalCollator -from tokenizer.tiktoken import IGNORE_INDEX, Tokenizer +from tokenizer.tiktoken import BaseTokenizer, IGNORE_INDEX from torch.distributed.checkpoint.stateful import Stateful from torch.utils.data import IterableDataset from transform import CLIPTransform @@ -110,7 +110,7 @@ def __init__( self, dataset_name: str, dataset_path: Optional[str], - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, image_token: str = "<|image|>", tile_size: int = 448, max_num_tiles: int = 4, @@ -178,8 +178,8 @@ def __iter__(self): # Tokenize tokens = self._tokenizer.encode( sample["text"], - bos=True, - eos=True, + add_bos=True, + add_eos=True, allowed_special=set(["<|image|>"]), ) sample["input_ids"] = torch.LongTensor(tokens[:-1]) @@ -233,7 +233,7 @@ def state_dict(self): def build_mm_dataloader( dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, job_config: JobConfig, infinite: bool = True, ) -> ParallelAwareDataloader: diff --git a/torchtitan/experiments/multimodal/tokenizer/tiktoken.py b/torchtitan/experiments/multimodal/tokenizer/tiktoken.py index 9d494a06f..b6de11e52 100644 --- a/torchtitan/experiments/multimodal/tokenizer/tiktoken.py +++ b/torchtitan/experiments/multimodal/tokenizer/tiktoken.py @@ -7,6 +7,9 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. 
+# TODO: Refactor this file since we have updated the tokenizer to +# depend on Hugging Face Tokenizer (https://github.com/pytorch/torchtitan/pull/1333) + import os from pathlib import Path from typing import ( @@ -28,7 +31,7 @@ import torch from tiktoken.load import load_tiktoken_bpe -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.tools.logging import logger @@ -36,7 +39,7 @@ IGNORE_INDEX = -100 -class TikTokenizer(Tokenizer): +class TikTokenizer(BaseTokenizer): """ Tokenizing and encoding/decoding text using the Tiktoken tokenizer. diff --git a/torchtitan/experiments/simple_fsdp/__init__.py b/torchtitan/experiments/simple_fsdp/__init__.py index 9ed592326..80a2b3c3a 100644 --- a/torchtitan/experiments/simple_fsdp/__init__.py +++ b/torchtitan/experiments/simple_fsdp/__init__.py @@ -9,8 +9,8 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.models.llama3 import llama3_configs, pipeline_llama from torchtitan.protocols.train_spec import register_train_spec, TrainSpec @@ -27,7 +27,7 @@ build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) ) diff --git a/torchtitan/models/llama3/__init__.py b/torchtitan/models/llama3/__init__.py index eec35cbf1..2e9a11d47 100644 --- a/torchtitan/models/llama3/__init__.py +++ b/torchtitan/models/llama3/__init__.py @@ -9,9 +9,9 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.components.validate import build_validator from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize import parallelize_llama @@ -80,7 +80,7 @@ build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, build_validator_fn=build_validator, ) diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index 20e3bcbcd..38f7e3321 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -11,7 +11,7 @@ from torch import nn -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs @@ -37,8 +37,10 @@ class TransformerModelArgs(BaseModelArgs): attn_mask_type: str = "causal" eos_id: int = 0 - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: - self.vocab_size = 
tokenizer.n_words + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: + self.vocab_size = tokenizer.get_vocab_size() self.max_seq_len = job_config.training.seq_len self.eos_id = tokenizer.eos_id diff --git a/torchtitan/models/llama3/train_configs/debug_model.toml b/torchtitan/models/llama3/train_configs/debug_model.toml index 3710c689f..b9d26c7d9 100644 --- a/torchtitan/models/llama3/train_configs/debug_model.toml +++ b/torchtitan/models/llama3/train_configs/debug_model.toml @@ -23,8 +23,8 @@ enable_wandb = false [model] name = "llama3" flavor = "debugmodel" -# test tokenizer.model, for debug purpose only -tokenizer_path = "./tests/assets/test_tiktoken.model" +# test folder with tokenizer.json, for debug purpose only +tokenizer_path = "./tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/models/llama3/train_configs/llama3_405b.toml b/torchtitan/models/llama3/train_configs/llama3_405b.toml index 61203e985..8b12113c5 100644 --- a/torchtitan/models/llama3/train_configs/llama3_405b.toml +++ b/torchtitan/models/llama3/train_configs/llama3_405b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama3" flavor = "405B" -tokenizer_path = "./assets/tokenizer/original/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-3.1-8B" converters = ["float8"] [optimizer] diff --git a/torchtitan/models/llama3/train_configs/llama3_70b.toml b/torchtitan/models/llama3/train_configs/llama3_70b.toml index 55386f929..e65d7a1ad 100644 --- a/torchtitan/models/llama3/train_configs/llama3_70b.toml +++ b/torchtitan/models/llama3/train_configs/llama3_70b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama3" flavor = "70B" -tokenizer_path = "./assets/tokenizer/original/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-3.1-8B" # converters = ["float8"] [optimizer] diff --git a/torchtitan/models/llama3/train_configs/llama3_8b.toml b/torchtitan/models/llama3/train_configs/llama3_8b.toml index 63b4ce6da..553017779 100644 --- a/torchtitan/models/llama3/train_configs/llama3_8b.toml +++ b/torchtitan/models/llama3/train_configs/llama3_8b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama3" flavor = "8B" -tokenizer_path = "./assets/tokenizer/original/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-3.1-8B" # converters = ["float8"] [optimizer] diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 2cabd698a..e7caa89f0 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -22,7 +22,7 @@ from torchtitan.components.lr_scheduler import LRSchedulersContainer from torchtitan.components.metrics import MetricsProcessor from torchtitan.components.optimizer import OptimizersContainer -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.components.validate import BaseValidator from torchtitan.config_manager import JobConfig from torchtitan.distributed import ParallelDims @@ -41,7 +41,9 @@ class BaseModelArgs: _enforced: str = "This field is used to enforce all fields have defaults." 
@abstractmethod - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: pass @abstractmethod @@ -71,7 +73,7 @@ def init_weights(self, buffer_device: torch.device | None = None) -> None: ..., tuple[_PipelineSchedule, list[nn.Module], bool, bool] ] DataLoaderBuilder: TypeAlias = Callable[..., BaseDataLoader] -TokenizerBuilder: TypeAlias = Callable[..., Tokenizer] +TokenizerBuilder: TypeAlias = Callable[..., BaseTokenizer] MetricsProcessorBuilder: TypeAlias = Callable[..., MetricsProcessor] OptimizersBuilder: TypeAlias = Callable[ [list[nn.Module], JobConfig, ParallelDims, DeviceMesh, FTManager],