diff --git a/.ci/docker/requirements-dev.txt b/.ci/docker/requirements-dev.txt
index bd6112228..6d53b2f81 100644
--- a/.ci/docker/requirements-dev.txt
+++ b/.ci/docker/requirements-dev.txt
@@ -3,3 +3,4 @@ pytest==7.3.2
 pytest-cov
 pre-commit
 tomli-w >= 1.1.0
+transformers
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index 11eae863f..c33bfe4d8 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -2,8 +2,6 @@ torchdata >= 0.8.0
 datasets >= 3.6.0
 tomli >= 1.1.0 ; python_version < "3.11"
 tensorboard
-tiktoken
-blobfile
 tabulate
 wandb
 fsspec
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d317f0bfe..d92f532bb 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -14,7 +14,7 @@ We actively welcome your pull requests.
 2. If you've added code that should be tested, add tests.
 3. If you've changed APIs, update the documentation.
 4. Ensure the test suite passes.
-5. Make sure your code lints (`pre-commit run --all-files`).
+5. Make sure your code lints (`pre-commit run --files $(git diff --name-only HEAD~1)`).
 6. If you haven't already, complete the Contributor License Agreement ("CLA").

 ### Contributor License Agreement ("CLA")
diff --git a/pyproject.toml b/pyproject.toml
index fc810b981..f153465d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,8 +17,7 @@ dependencies = [
     "datasets>=2.21.0",

     # Tokenization
-    "blobfile",
-    "tiktoken",
+    "tokenizers",

     # Miscellaneous
     "tomli>=1.1.0",
diff --git a/scripts/download_tokenizer.py b/scripts/download_tokenizer.py
index a28dc4992..664bd369b 100644
--- a/scripts/download_tokenizer.py
+++ b/scripts/download_tokenizer.py
@@ -108,7 +108,7 @@ def is_tokenizer_file(filename: str) -> bool:
                 print(f"Successfully downloaded {filename} to {file_path}")
                 downloaded_files.append(filename)
             except HTTPError as e:
-                if e.response.status_code == 404:
+                if e.response and e.response.status_code == 404:
                     print(f"File {filename} not found, skipping...")
                     continue
                 else:
@@ -122,7 +122,7 @@ def is_tokenizer_file(filename: str) -> bool:
             print(f"Warning: No tokenizer files could be downloaded from {repo_id}")

     except HTTPError as e:
-        if e.response.status_code == 401:
+        if e.response and e.response.status_code == 401:
             print(
                 "You need to pass a valid `--hf_token=...` to download private checkpoints."
) diff --git a/scripts/generate/test_generate.py b/scripts/generate/test_generate.py index 157d000be..ef31c1850 100644 --- a/scripts/generate/test_generate.py +++ b/scripts/generate/test_generate.py @@ -165,7 +165,7 @@ def test_generate( input_ids = ( ( torch.tensor( - tokenizer.encode(prompt, bos=True, eos=False), dtype=torch.long + tokenizer.encode(prompt, add_bos=True, add_eos=False), dtype=torch.long ) .view(1, -1) .repeat(batch_size, 1) diff --git a/tests/assets/test_tiktoken.model b/tests/assets/test_tiktoken.model deleted file mode 100644 index 4bfad6254..000000000 --- a/tests/assets/test_tiktoken.model +++ /dev/null @@ -1,2000 +0,0 @@ -AA== 0 -AQ== 1 -Ag== 2 -Aw== 3 -BA== 4 -BQ== 5 -Bg== 6 -Bw== 7 -CA== 8 -CQ== 9 -Cg== 10 -Cw== 11 -DA== 12 -DQ== 13 -Dg== 14 -Dw== 15 -EA== 16 -EQ== 17 -Eg== 18 -Ew== 19 -FA== 20 -FQ== 21 -Fg== 22 -Fw== 23 -GA== 24 -GQ== 25 -Gg== 26 -Gw== 27 -HA== 28 -HQ== 29 -Hg== 30 -Hw== 31 -IA== 32 -IQ== 33 -Ig== 34 -Iw== 35 -JA== 36 -JQ== 37 -Jg== 38 -Jw== 39 -KA== 40 -KQ== 41 -Kg== 42 -Kw== 43 -LA== 44 -LQ== 45 -Lg== 46 -Lw== 47 -MA== 48 -MQ== 49 -Mg== 50 -Mw== 51 -NA== 52 -NQ== 53 -Ng== 54 -Nw== 55 -OA== 56 -OQ== 57 -Og== 58 -Ow== 59 -PA== 60 -PQ== 61 -Pg== 62 -Pw== 63 -QA== 64 -QQ== 65 -Qg== 66 -Qw== 67 -RA== 68 -RQ== 69 -Rg== 70 -Rw== 71 -SA== 72 -SQ== 73 -Sg== 74 -Sw== 75 -TA== 76 -TQ== 77 -Tg== 78 -Tw== 79 -UA== 80 -UQ== 81 -Ug== 82 -Uw== 83 -VA== 84 -VQ== 85 -Vg== 86 -Vw== 87 -WA== 88 -WQ== 89 -Wg== 90 -Ww== 91 -XA== 92 -XQ== 93 -Xg== 94 -Xw== 95 -YA== 96 -YQ== 97 -Yg== 98 -Yw== 99 -ZA== 100 -ZQ== 101 -Zg== 102 -Zw== 103 -aA== 104 -aQ== 105 -ag== 106 -aw== 107 -bA== 108 -bQ== 109 -bg== 110 -bw== 111 -cA== 112 -cQ== 113 -cg== 114 -cw== 115 -dA== 116 -dQ== 117 -dg== 118 -dw== 119 -eA== 120 -eQ== 121 -eg== 122 -ew== 123 -fA== 124 -fQ== 125 -fg== 126 -fw== 127 -gA== 128 -gQ== 129 -gg== 130 -gw== 131 -hA== 132 -hQ== 133 -hg== 134 -hw== 135 -iA== 136 -iQ== 137 -ig== 138 -iw== 139 -jA== 140 -jQ== 141 -jg== 142 -jw== 143 -kA== 144 -kQ== 145 -kg== 146 -kw== 147 -lA== 148 -lQ== 149 -lg== 150 -lw== 151 -mA== 152 -mQ== 153 -mg== 154 -mw== 155 -nA== 156 -nQ== 157 -ng== 158 -nw== 159 -oA== 160 -oQ== 161 -og== 162 -ow== 163 -pA== 164 -pQ== 165 -pg== 166 -pw== 167 -qA== 168 -qQ== 169 -qg== 170 -qw== 171 -rA== 172 -rQ== 173 -rg== 174 -rw== 175 -sA== 176 -sQ== 177 -sg== 178 -sw== 179 -tA== 180 -tQ== 181 -tg== 182 -tw== 183 -uA== 184 -uQ== 185 -ug== 186 -uw== 187 -vA== 188 -vQ== 189 -vg== 190 -vw== 191 -wA== 192 -wQ== 193 -wg== 194 -ww== 195 -xA== 196 -xQ== 197 -xg== 198 -xw== 199 -yA== 200 -yQ== 201 -yg== 202 -yw== 203 -zA== 204 -zQ== 205 -zg== 206 -zw== 207 -0A== 208 -0Q== 209 -0g== 210 -0w== 211 -1A== 212 -1Q== 213 -1g== 214 -1w== 215 -2A== 216 -2Q== 217 -2g== 218 -2w== 219 -3A== 220 -3Q== 221 -3g== 222 -3w== 223 -4A== 224 -4Q== 225 -4g== 226 -4w== 227 -5A== 228 -5Q== 229 -5g== 230 -5w== 231 -6A== 232 -6Q== 233 -6g== 234 -6w== 235 -7A== 236 -7Q== 237 -7g== 238 -7w== 239 -8A== 240 -8Q== 241 -8g== 242 -8w== 243 -9A== 244 -9Q== 245 -9g== 246 -9w== 247 -+A== 248 -+Q== 249 -+g== 250 -+w== 251 -/A== 252 -/Q== 253 -/g== 254 -/w== 255 -IHQ= 256 -aGU= 257 -IGE= 258 -aW4= 259 -IHM= 260 -IHc= 261 -IHRoZQ== 262 -IG8= 263 -cmU= 264 -IGI= 265 -b3U= 266 -ZWQ= 267 -IG0= 268 -bmQ= 269 -IEk= 270 -aGE= 271 -aXQ= 272 -ZXI= 273 -aW5n 274 -IGY= 275 -aXM= 276 -IHRv 277 -ZW4= 278 -b24= 279 -b3I= 280 -YXM= 281 -IGM= 282 -IG9m 283 -IGFuZA== 284 -IGQ= 285 -bGw= 286 -YXQ= 287 -YW4= 288 -YXI= 289 -IHA= 290 -IG4= 291 -IGlu 292 -bGU= 293 -b20= 294 -b3Q= 295 -IGJl 296 -IGg= 297 -dXQ= 298 -b3c= 299 -ZXM= 
300 -aGF0 301 -IGc= 302 -IGhl 303 -IGhh 304 -IGw= 305 -IHdhcw== 306 -bGQ= 307 -Z2g= 308 -aWQ= 309 -Y2g= 310 -IHRo 311 -IGl0 312 -YXk= 313 -IG9u 314 -Y2U= 315 -c2U= 316 -ZW50 317 -IHN0 318 -bHk= 319 -dmU= 320 -ZXQ= 321 -c3Q= 322 -IFQ= 323 -IGU= 324 -IHk= 325 -Z2h0 326 -aXI= 327 -IG1l 328 -b28= 329 -YWw= 330 -aXRo 331 -IHJl 332 -aW0= 333 -IHRoYXQ= 334 -IGFz 335 -b3VsZA== 336 -cm8= 337 -YWQ= 338 -aW9u 339 -Lgo= 340 -aGVy 341 -IG15 342 -Y3Q= 343 -IG5vdA== 344 -IHdpdGg= 345 -IGZvcg== 346 -IHU= 347 -a2U= 348 -IHlvdQ== 349 -IFM= 350 -IGlz 351 -aWdodA== 352 -Igo= 353 -YW0= 354 -aWM= 355 -dXI= 356 -IGF0 357 -Li4= 358 -YWM= 359 -dGVy 360 -IHdo 361 -IGFu 362 -IHdl 363 -IFRoZQ== 364 -aWY= 365 -IG9y 366 -IGJ1dA== 367 -dmVy 368 -ICI= 369 -IHI= 370 -b3V0 371 -b21l 372 -IGhhZA== 373 -cHA= 374 -cXU= 375 -IHN1 376 -IHRoaXM= 377 -cmVk 378 -YXJk 379 -IHNv 380 -ZWxs 381 -IHdvdWxk 382 -IGhpcw== 383 -IHNo 384 -aW5l 385 -cmE= 386 -IHNl 387 -IGJ5 388 -LiIK 389 -IFA= 390 -aGVu 391 -IEE= 392 -IGhhdmU= 393 -IGZy 394 -IHNh 395 -IEg= 396 -IG9uZQ== 397 -ZW0= 398 -a2Vk 399 -aXJ0 400 -ZWN0 401 -IGhpbQ== 402 -IGxp 403 -IGFi 404 -YXRpb24= 405 -aGluZw== 406 -dGhl 407 -IFI= 408 -IGxl 409 -c3M= 410 -IFc= 411 -Y3U= 412 -aWxs 413 -J3Q= 414 -YXJ0 415 -YWxs 416 -LAo= 417 -b3du 418 -b3Jl 419 -IGFsbA== 420 -IGs= 421 -IGdv 422 -aGlydA== 423 -YW5k 424 -IG91dA== 425 -YW1l 426 -YWlu 427 -IGlm 428 -IG5v 429 -IGRv 430 -IHRoZXk= 431 -b29s 432 -dW4= 433 -dG8= 434 -IHVw 435 -IFJlZA== 436 -IG5l 437 -IEs= 438 -IGZyb20= 439 -IFNoaXJ0 440 -IHdvcg== 441 -b25n 442 -IHRoZXJl 443 -IHNhaWQ= 444 -cmk= 445 -YW50 446 -IEI= 447 -IGFueQ== 448 -dWQ= 449 -aW5k 450 -IHdoaQ== 451 -YWI= 452 -b3VuZA== 453 -IGFib3V0 454 -IHRoZW0= 455 -Y3Vw 456 -YWs= 457 -IGRl 458 -IHRl 459 -IE0= 460 -YWtl 461 -Y3VwaW5l 462 -aWc= 463 -IHdlcmU= 464 -b3JjdXBpbmU= 465 -aWw= 466 -Y2hvb2w= 467 -IHJv 468 -b29k 469 -IGFyZQ== 470 -aXZl 471 -IGxpa2U= 472 -eW8= 473 -IGhvdQ== 474 -J3M= 475 -b25l 476 -dXM= 477 -ZWw= 478 -dWw= 479 -YWNr 480 -b3A= 481 -LCI= 482 -dGg= 483 -YWNoZXI= 484 -dW0= 485 -YW5n 486 -IGZh 487 -YWc= 488 -IHNjaG9vbA== 489 -IGo= 490 -dGU= 491 -b2s= 492 -ZXNz 493 -dXN0 494 -ZXJz 495 -Li4uLg== 496 -IEM= 497 -dGhlcg== 498 -aGFu 499 -IHdoZW4= 500 -IHNw 501 -IG1hbg== 502 -IGNhbg== 503 -b3VnaA== 504 -IHdobw== 505 -IGdldA== 506 -IGRpZA== 507 -IHBv 508 -Y2k= 509 -IGFs 510 -aXN0 511 -IGNvbQ== 512 -bGY= 513 -YXU= 514 -IFBvcmN1cGluZQ== 515 -IHdoaWNo 516 -dmVu 517 -IGFm 518 -d24= 519 -YXNz 520 -YmVy 521 -IGV4 522 -b3Vz 523 -ZXN0 524 -bG8= 525 -IHRy 526 -ZWxsb3c= 527 -IHNheQ== 528 -b3VnaHQ= 529 -IHJvb20= 530 -IHNvbWU= 531 -LS0= 532 -IE8= 533 -YXRl 534 -IHY= 535 -aGVk 536 -YXA= 537 -IHR3 538 -IGJlYw== 539 -cmVl 540 -amVjdA== 541 -a3M= 542 -IGNvbg== 543 -IGJlZW4= 544 -ZW50cw== 545 -aWRl 546 -IGNvdWxk 547 -IEc= 548 -ZXA= 549 -IHBybw== 550 -bnQ= 551 -IGhvdXNl 552 -IGFn 553 -IElm 554 -IGtu 555 -IGZlbGxvdw== 556 -IHdoYXQ= 557 -d2F5 558 -aXNo 559 -IGFt 560 -aXRl 561 -bmRlcg== 562 -aW1l 563 -IHBy 564 -IHRlYWNoZXI= 565 -YXJl 566 -IGJv 567 -IHNoZQ== 568 -IE4= 569 -aWNl 570 -YXN0 571 -dXJl 572 -aWU= 573 -IHN1Y2g= 574 -dXRlbg== 575 -dXRlbmJlcg== 576 -dXRlbmJlcmc= 577 -IHF1 578 -bG93bg== 579 -IHdy 580 -cHQ= 581 -IEhl 582 -IHN0dWQ= 583 -aGVyZQ== 584 -IG1vcmU= 585 -cnk= 586 -dHRlcg== 587 -IFk= 588 -IG1heQ== 589 -aXR5 590 -IGxvbw== 591 -IG90aGVy 592 -aGlz 593 -IFBybw== 594 -IHdpbGw= 595 -IEl0 596 -b3J0 597 -IHNob3VsZA== 598 -dmVyeQ== 599 -d2U= 600 -IHBs 601 -YXNo 602 -LiI= 603 -IGFwcA== 604 -IGRheQ== 605 -dXJu 606 -cG8= 607 -IGhlcg== 608 -ICA= 609 -bm90 610 -Y2s= 611 -IHVu 612 -aGk= 613 -dmluZw== 614 
-IG9sZA== 615 -IHRpbWU= 616 -IlQ= 617 -IHdheQ== 618 -YWJsZQ== 619 -PyIK 620 -IENsb3du 621 -IG9ubHk= 622 -dWI= 623 -YWNo 624 -IG9mZg== 625 -IHRoYW4= 626 -YWxseQ== 627 -IHRoZWly 628 -YmU= 629 -a2luZw== 630 -b3RoZXI= 631 -YXJ5 632 -YW5z 633 -YXRlZA== 634 -c2VsZg== 635 -IGdvaW5n 636 -dWNo 637 -b2xs 638 -IGJhY2s= 639 -aXlv 640 -LXQ= 641 -YW5jZQ== 642 -YWRl 643 -IFByb2plY3Q= 644 -c3A= 645 -IHR3bw== 646 -IHRob3VnaHQ= 647 -c28= 648 -IHJpZ2h0 649 -IGhlYWQ= 650 -dmVk 651 -IEQ= 652 -IHByZQ== 653 -IHNlZQ== 654 -IHVz 655 -IHN0dWRlbnRz 656 -Y2lw 657 -IGRvbg== 658 -IG5pZ2h0 659 -aW5jaXA= 660 -IEtpeW8= 661 -cGw= 662 -YXJlZA== 663 -IEd1dGVuYmVyZw== 664 -IGNv 665 -IGhvdw== 666 -b21ldA== 667 -ZmY= 668 -Ikk= 669 -LC0t 670 -IGFza2Vk 671 -aW5jaXBhbA== 672 -ZXZlcg== 673 -IGFj 674 -IEY= 675 -IG1ha2U= 676 -aXR0 677 -IG1pZ2h0 678 -Z2U= 679 -bGVk 680 -IGFmdGVy 681 -aWdu 682 -IGdy 683 -IG1hZGU= 684 -ZGQ= 685 -IGtub3c= 686 -IGNvbWU= 687 -IGJy 688 -dGhpbmc= 689 -IEJ1dA== 690 -IG1hdA== 691 -IE9u 692 -b3J5 693 -Y2w= 694 -IEU= 695 -Ymxl 696 -b2c= 697 -IHlvdXI= 698 -dWxs 699 -IHdvcms= 700 -ZWFy 701 -IHRocmVl 702 -aWVk 703 -YnV0 704 -VGhl 705 -cGU= 706 -YWNl 707 -IHN0YXJ0 708 -aWNr 709 -IG92ZXI= 710 -b3Vy 711 -IG11Y2g= 712 -IHdhbnQ= 713 -aW1w 714 -IHBhcnQ= 715 -aG8= 716 -aW5r 717 -ZW5jZQ== 718 -IGRvd24= 719 -IGV2ZW4= 720 -IHByaW5jaXBhbA== 721 -bGluZw== 722 -b3VudA== 723 -YXVzZQ== 724 -IGNs 725 -IGJs 726 -LXRt 727 -b21ldGhpbmc= 728 -IGludG8= 729 -b3Jt 730 -b2t5bw== 731 -IGRpcw== 732 -IGZl 733 -IGZhY2U= 734 -Li4uLi4u 735 -cmVzcw== 736 -bWVudA== 737 -aXJl 738 -IGFy 739 -dHk= 740 -IG1v 741 -cmVhdA== 742 -IGZpcg== 743 -cGVy 744 -IG91cg== 745 -Y28= 746 -IHRoZW4= 747 -IHRvbGQ= 748 -aW5ncw== 749 -IHRha2U= 750 -IGJlZw== 751 -bmVy 752 -aXRpb24= 753 -b3Nl 754 -IG93bg== 755 -IGFnYWlu 756 -IHNlZW0= 757 -aXNl 758 -IHdhdA== 759 -Ilc= 760 -IGZhcg== 761 -YWtpbmc= 762 -Zm9yZQ== 763 -YWR5 764 -LXM= 765 -bGVzcw== 766 -IHJldA== 767 -IHNoYQ== 768 -IGNhbWU= 769 -Z2Vy 770 -IGdvb2Q= 771 -YXRoZXI= 772 -YXJr 773 -cm93 774 -IGtl 775 -J20= 776 -IGhhcw== 777 -YXRo 778 -cHBlZA== 779 -IHdlbnQ= 780 -IHRlbGw= 781 -cXVhc2g= 782 -IGVu 783 -IGZpcnN0 784 -IGhvdA== 785 -aXo= 786 -IGF3YXk= 787 -IHNvbWV0aGluZw== 788 -IHJlbQ== 789 -IHRvd24= 790 -IHNt 791 -IFRoaXM= 792 -IGJldHRlcg== 793 -IFRoZW4= 794 -d2Fz 795 -b2Y= 796 -YmFyZA== 797 -IEw= 798 -bGk= 799 -ZmU= 800 -IFRva3lv 801 -IGxvbmc= 802 -aWx5 803 -IHN1cmU= 804 -IGxvb2tlZA== 805 -dWJiYXJk 806 -Y3Rpb24= 807 -b3Jk 808 -IG1hbnk= 809 -aW91cw== 810 -IHRvbw== 811 -IGhlcmU= 812 -b3M= 813 -IHVuZGVy 814 -YXNl 815 -bmc= 816 -cGVk 817 -b2Q= 818 -bWU= 819 -IGp1c3Q= 820 -IG5vdw== 821 -aW5jZQ== 822 -IGhlYXJk 823 -IGtpbmQ= 824 -IFRoZXk= 825 -IGJlZm9yZQ== 826 -aHk= 827 -IElu 828 -IGVudA== 829 -IGJvYXJk 830 -ISI= 831 -d2FyZA== 832 -IGJlaW5n 833 -IHdlbGw= 834 -ZXJt 835 -cmllZA== 836 -IHdyb25n 837 -YWlk 838 -eHQ= 839 -IHJldHVybg== 840 -aXRlZA== 841 -IHllbg== 842 -IG1hdHRlcg== 843 -IGNhbGw= 844 -IHRhbA== 845 -IFlvdQ== 846 -Y2Vk 847 -aXNlZA== 848 -IGNoYQ== 849 -b25z 850 -IHNhbWU= 851 -IG9uY2U= 852 -ZGF5 853 -ZnQ= 854 -IHN3 855 -IGJlY2F1c2U= 856 -IHRoaW5r 857 -IHdoZXJl 858 -IE5v 859 -IEh1YmJhcmQ= 860 -IFNxdWFzaA== 861 -IGNvcA== 862 -d2l0aA== 863 -ZXJlZA== 864 -b2xsb3c= 865 -IHBsYWNl 866 -aWRk 867 -Y2Vzcw== 868 -IHNob3c= 869 -aXNoYQ== 870 -IHJh 871 -IGxldHRlcg== 872 -bmU= 873 -dmVz 874 -YXRpbmc= 875 -cmFuZw== 876 -IGFmZg== 877 -IGhhbmQ= 878 -IHNj 879 -IHBlcnM= 880 -aW50 881 -cHI= 882 -c2lkZQ== 883 -ZnRlcg== 884 -IHNheWluZw== 885 -IGxhdQ== 886 -dGhhdA== 887 -IHdpdGhvdXQ= 888 -cm9u 889 -YWly 890 -bGVjdA== 891 -IFdoYXQ= 892 -ZWx0 893 
-IHdoaWxl 894 -b2dh 895 -YXBlcg== 896 -IHBl 897 -b3k= 898 -IHNhdA== 899 -aWVz 900 -IGFkZA== 901 -IGRheXM= 902 -IHNwZQ== 903 -IGhv 904 -IGFucw== 905 -IGhhcg== 906 -IFdoZW4= 907 -IGFueXRoaW5n 908 -cGVu 909 -XQo= 910 -dGFpbg== 911 -IG11c3Q= 912 -IG5ldw== 913 -bGlj 914 -IHZv 915 -aGlsZQ== 916 -Z2V0 917 -IEFz 918 -IHZlcnk= 919 -J3Jl 920 -IGV2ZXJ5 921 -YXZl 922 -PyI= 923 -YWRnZXI= 924 -IEtvZ2E= 925 -IE1y 926 -cm91Z2g= 927 -dWx0 928 -IGZvbGxvdw== 929 -dGluZw== 930 -aWZl 931 -aWRkbGU= 932 -ZnVs 933 -YW5r 934 -IFNv 935 -IHNlZW1lZA== 936 -IEFuZA== 937 -aXg= 938 -IHNldA== 939 -IGNhcmU= 940 -IHJlcw== 941 -IG5ldmVy 942 -IGZvdW5k 943 -IGxv 944 -Y2lk 945 -aW5lZA== 946 -IGNsYXNz 947 -IG15c2VsZg== 948 -YXc= 949 -IHdvbQ== 950 -YXRpb25z 951 -IGxlZnQ= 952 -IFdl 953 -IHRlYWNoZXJz 954 -Ilk= 955 -bmE= 956 -b250 957 -IGRlcw== 958 -IHRob3Nl 959 -aXJlZA== 960 -IHNlbg== 961 -eWluZw== 962 -IHRoZXNl 963 -YXo= 964 -IFRoZXJl 965 -Y2VwdA== 966 -IGRhbmc= 967 -IFU= 968 -Ikg= 969 -Ym9k 970 -Ym9keQ== 971 -IGhhdmluZw== 972 -YWxhcnk= 973 -IHdhdGNo 974 -IGdpdmU= 975 -YWdl 976 -IGl0cw== 977 -IGFwcGU= 978 -dWU= 979 -IGNvdW50 980 -IGhhcmQ= 981 -IGJlbA== 982 -b3R0 983 -IGRpc3Q= 984 -IlM= 985 -IE1hZA== 986 -LW4= 987 -cmlidXQ= 988 -Z2Vk 989 -IGF0dA== 990 -ZmVyZQ== 991 -aXRoZXI= 992 -IHVwb24= 993 -IHRlbQ== 994 -IHBlcnNvbg== 995 -bmluZw== 996 -IGNoZQ== 997 -YXJseQ== 998 -b25leQ== 999 -IHNvb24= 1000 -ZW1lbnQ= 1001 -ICg= 1002 -IHRyYW5z 1003 -IGV4cA== 1004 -IHNlcg== 1005 -IHJlZw== 1006 -YXNvbg== 1007 -IHNhdw== 1008 -IG5leHQ= 1009 -b290 1010 -IGhhbGY= 1011 -IHRvb2s= 1012 -IGJhZA== 1013 -IGhvdXI= 1014 -IHNhbGFyeQ== 1015 -IGJlZ2Fu 1016 -cmlnaHQ= 1017 -b25uYQ== 1018 -LXNhbg== 1019 -IHdvcmtz 1020 -IEo= 1021 -Zm9ybQ== 1022 -aWNhbA== 1023 -IHRyYQ== 1024 -bWFu 1025 -IG5vdGhpbmc= 1026 -IHN0aWxs 1027 -ZWFycw== 1028 -IHN1cHA= 1029 -IHR1cm4= 1030 -IGZlbHQ= 1031 -IHdvbWFu 1032 -IHN0YXJ0ZWQ= 1033 -b3VibGU= 1034 -dXJh 1035 -aXNoaW5n 1036 -Ogo= 1037 -bGVjdHJvbg== 1038 -bGVjdHJvbmlj 1039 -b29r 1040 -IGNvcHk= 1041 -IGZ1bGw= 1042 -Y29uZA== 1043 -bWF0 1044 -IG1pZGRsZQ== 1045 -IGxvb2s= 1046 -IGNvbW0= 1047 -d2VyZWQ= 1048 -IGJlY2FtZQ== 1049 -IGZlbGxvd3M= 1050 -d291bGQ= 1051 -IGdvdA== 1052 -IGds 1053 -IGd1 1054 -IGtlZXA= 1055 -IGdl 1056 -IE1hZG9ubmE= 1057 -aXRlcg== 1058 -aXNoZWQ= 1059 -IHVuZGVyc3Q= 1060 -IHN0cmE= 1061 -c2lk 1062 -IGNvdW50cnk= 1063 -b3BsZQ== 1064 -IHByb3Y= 1065 -IHB1dA== 1066 -bm8= 1067 -J2xs 1068 -IHNsZQ== 1069 -cmFuZ2U= 1070 -IFNoZQ== 1071 -cG9z 1072 -IG1pbmQ= 1073 -IHBhc3M= 1074 -IHRocm91Z2g= 1075 -IHF1aXRl 1076 -IGluZA== 1077 -IGJvYXJkaW5n 1078 -dGVhY2hlcg== 1079 -cGxl 1080 -UG9yY3VwaW5l 1081 -IHBsZQ== 1082 -IGdlaXNoYQ== 1083 -ICAgIA== 1084 -b3N0 1085 -ZW5zZQ== 1086 -Tm8= 1087 -aWJsZQ== 1088 -IHJlYWQ= 1089 -IHJlZA== 1090 -ZW50aW9u 1091 -ZW5lZA== 1092 -ISIK 1093 -IHJlZg== 1094 -IGFk 1095 -IGZs 1096 -IHN0YXk= 1097 -dXA= 1098 -IHJvdW5k 1099 -IGNsZQ== 1100 -IG9wZW4= 1101 -IG9i 1102 -dGVuZA== 1103 -IGZpbmQ= 1104 -IHBlcg== 1105 -IGNhbGxlZA== 1106 -IHN1cg== 1107 -cmV3 1108 -IHBhcGVy 1109 -IEJhZGdlcg== 1110 -IG1lZXQ= 1111 -aXNz 1112 -IlRoYXQ= 1113 -ZXJtcw== 1114 -VEU= 1115 -aXR0ZW4= 1116 -YWJseQ== 1117 -bmVzcw== 1118 -IGNhbm5vdA== 1119 -IHNpbXA= 1120 -Y29u 1121 -IHJlYXNvbg== 1122 -eW91 1123 -IGhvbWU= 1124 -Ynk= 1125 -IGZpZ2h0 1126 -aXR0bGU= 1127 -IHRoaW5ncw== 1128 -IGVhcw== 1129 -IGltcA== 1130 -cmVzc2Vk 1131 -IG1lYW4= 1132 -IGFwcGVhcmVk 1133 -IG5hdA== 1134 -IGhlbA== 1135 -cmV0 1136 -YWtlbg== 1137 -IHN0cmFpZ2h0 1138 -IGFmZmFpcg== 1139 -aXRpbmc= 1140 -IGVk 1141 -IHNpbmNl 1142 -bG9n 1143 -IHBheQ== 1144 -IGZyb250 1145 -bXk= 1146 -IHZvaWNl 1147 -cmVhZHk= 
1148 -IGZvb2w= 1149 -b3VuZGF0aW9u 1150 -IGVsZWN0cm9uaWM= 1151 -IHRlcm1z 1152 -IG1hcg== 1153 -YXBhbg== 1154 -YW55 1155 -IHJlc3A= 1156 -IGVuZA== 1157 -YXBw 1158 -d2hhdA== 1159 -c3Ry 1160 -cmFw 1161 -aWFs 1162 -aWN1bA== 1163 -IGFjYw== 1164 -b3Ro 1165 -IHNlY29uZA== 1166 -IGZsbw== 1167 -IHNpeA== 1168 -IGZlZXQ= 1169 -YnI= 1170 -aWV0 1171 -IGxpdHRsZQ== 1172 -bGVz 1173 -IG1vbmV5 1174 -IGRlY2w= 1175 -IGV5 1176 -IGNvbXA= 1177 -YXJpbmc= 1178 -IGFncmU= 1179 -d2hlcmU= 1180 -IFN0 1181 -IHN0cmU= 1182 -ZXg= 1183 -cmFjdA== 1184 -IGludA== 1185 -IGRpcmU= 1186 -IGJlY29tZQ== 1187 -IGhvbg== 1188 -IGNvbnNpZA== 1189 -ZXJ0YWlu 1190 -bm93 1191 -IHNs 1192 -aXRvcg== 1193 -Z2c= 1194 -IGp1bQ== 1195 -IGJ1 1196 -IHRoaW5n 1197 -IGFuc3dlcmVk 1198 -b2Vz 1199 -eWE= 1200 -IFRoYXQ= 1201 -aXpl 1202 -b25k 1203 -YWN0 1204 -IGVmZg== 1205 -IGJhbmc= 1206 -YWJvdXQ= 1207 -IGJlZA== 1208 -b3Jyb3c= 1209 -dW5n 1210 -IFRv 1211 -IGtlcHQ= 1212 -IHdhbA== 1213 -IGJhdGg= 1214 -IGRyYQ== 1215 -IkE= 1216 -cmluZ3M= 1217 -aG9wcA== 1218 -IHJlc2lnbg== 1219 -IGRpbg== 1220 -IGxhZHk= 1221 -LkU= 1222 -IHVzZQ== 1223 -bGlzaA== 1224 -b3Jz 1225 -IHdyaXR0ZW4= 1226 -ZW5l 1227 -aXY= 1228 -IGRpZg== 1229 -IHN0ZQ== 1230 -IHN0b3J5 1231 -Y29t 1232 -cmVz 1233 -ZW50bHk= 1234 -IGZhY3Q= 1235 -aGVz 1236 -d2F5cw== 1237 -IHdoeQ== 1238 -IHRob3VnaA== 1239 -IHN0cg== 1240 -b25kZXI= 1241 -aGVhZA== 1242 -IGNvdXI= 1243 -IG1vbg== 1244 -IHNr 1245 -IGJlbGll 1246 -IGxldA== 1247 -ZmVy 1248 -IHJlcXU= 1249 -IGxpbmU= 1250 -cm9vbQ== 1251 -LWRheQ== 1252 -IGRvbmU= 1253 -IGRvZXM= 1254 -IE9uZQ== 1255 -IGRhbmdv 1256 -YXNzaG9wcA== 1257 -IGNvbnNpZGVy 1258 -IGRpbm5lcg== 1259 -IEZvdW5kYXRpb24= 1260 -Kio= 1261 -ZW1wdA== 1262 -ZXNl 1263 -IHdvcmQ= 1264 -cmVzdA== 1265 -IGVub3VnaA== 1266 -IGdyZWF0 1267 -IG5hbWU= 1268 -IHB1Yg== 1269 -IG1hbm5lcg== 1270 -d2Vy 1271 -aWN0 1272 -aW5lc3M= 1273 -IGhpbXNlbGY= 1274 -IHBlb3BsZQ== 1275 -ZXc= 1276 -IGNvcg== 1277 -ZXN0aW9u 1278 -IGJpZw== 1279 -ZWU= 1280 -IHJp 1281 -aWRlcw== 1282 -IGJyb3RoZXI= 1283 -IGhlYXJ0 1284 -ZWN0ZWQ= 1285 -ZWVk 1286 -IG90aGVycw== 1287 -c29s 1288 -dGVk 1289 -IGV5ZXM= 1290 -IHRyb3VibGU= 1291 -IHRlYWNo 1292 -IGJvYXQ= 1293 -IGZvdXI= 1294 -IGFscmVhZHk= 1295 -cm9t 1296 -Z2hlZA== 1297 -IHNxdQ== 1298 -IHBvbA== 1299 -Y2Vz 1300 -IEhvdHQ= 1301 -IGxlYXZl 1302 -IGRpc3RyaWJ1dA== 1303 -YXN0ZXI= 1304 -Q0g= 1305 -dWM= 1306 -IGlt 1307 -IGhvd2V2ZXI= 1308 -dGhlcmU= 1309 -YXBhbmVzZQ== 1310 -IGxhc3Q= 1311 -IGNy 1312 -aWxpdHk= 1313 -IHNpbXBsZQ== 1314 -IGxpZmU= 1315 -LWM= 1316 -IHJlZ2FyZA== 1317 -IGZpbg== 1318 -dWFs 1319 -IG1lYW5z 1320 -IHN0YW5k 1321 -YXRjaA== 1322 -IHNob3J0 1323 -bmVk 1324 -IHNlZW4= 1325 -IGhhcHA= 1326 -LWs= 1327 -IGFnYWluc3Q= 1328 -aGlt 1329 -YW1lZA== 1330 -IHN0b29k 1331 -IGdyYQ== 1332 -IG1vdGhlcg== 1333 -IGZpc2g= 1334 -IHdhdGVy 1335 -YWls 1336 -Y2Vp 1337 -IHJhdGhlcg== 1338 -IGlucw== 1339 -IGZlZWw= 1340 -IGFsc28= 1341 -IG9yZA== 1342 -IGNvbWluZw== 1343 -aWNz 1344 -IGVpdGhlcg== 1345 -bmNl 1346 -ICc= 1347 -IGtpZA== 1348 -IGxhdWdoZWQ= 1349 -bGlrZQ== 1350 -IEFy 1351 -Z3I= 1352 -IEhvdHRh 1353 -IHRhbGs= 1354 -Z2V0aGVy 1355 -IFNpcg== 1356 -IHB1bg== 1357 -UHJv 1358 -YXRz 1359 -bW9zdA== 1360 -IHJlcA== 1361 -IGdp 1362 -aXNm 1363 -YmFibHk= 1364 -YWtlcw== 1365 -IE5vdA== 1366 -bnk= 1367 -IGFwcGVhcg== 1368 -bXA= 1369 -Y2hh 1370 -IGFjdA== 1371 -YmVk 1372 -aWVm 1373 -dWZm 1374 -IGFwbw== 1375 -IG1ldA== 1376 -IHJldHVybmVk 1377 -IHNvdW5k 1378 -dXNpbmVzcw== 1379 -IGxhdWdo 1380 -IGNsZWFy 1381 -IG5lZWQ= 1382 -ZmVzcw== 1383 -ZXN0ZWQ= 1384 -IGludg== 1385 -IGFjY2VwdA== 1386 -dW5kZXI= 1387 -Owo= 1388 -IHN1cnBy 1389 -ZGU= 1390 -IHRyYWlu 1391 -IGhvdGVs 1392 -IHNsZWVw 1393 -IGRy 
1394 -IGhvbGQ= 1395 -bG9jaw== 1396 -cHVyYQ== 1397 -IHNwcmluZ3M= 1398 -IC4uLi4uLg== 1399 -IGFncmVlbWVudA== 1400 -IERhcg== 1401 -IHJlc3Q= 1402 -Y2x1ZA== 1403 -YXRvcg== 1404 -YXY= 1405 -IG9yaWc= 1406 -IG9yaWdpbg== 1407 -IGVs 1408 -IG5vcg== 1409 -IHByZXM= 1410 -IHVuZGVyc3RhbmQ= 1411 -IHRha2Vu 1412 -IGxpZ2h0 1413 -ZW5lcg== 1414 -c29tZQ== 1415 -IGJyb3VnaHQ= 1416 -cmFwaA== 1417 -IG1vc3Q= 1418 -b2tl 1419 -LXc= 1420 -IHVudA== 1421 -IGZhdGhlcg== 1422 -IHVzZWQ= 1423 -IGVhdA== 1424 -IHllYXJz 1425 -IFdoaWxl 1426 -IGNoYW4= 1427 -IHN1ZGQ= 1428 -IHN1ZGRlbg== 1429 -IGFwb2xvZw== 1430 -IHNldHQ= 1431 -IHRoaW4= 1432 -IE15 1433 -IHRlbg== 1434 -aW1lcw== 1435 -Zm9y 1436 -b3Vk 1437 -V2hlbg== 1438 -IGRldA== 1439 -IGxpdmU= 1440 -IG9j 1441 -IGZpdmU= 1442 -IGNvbnQ= 1443 -IGhlbHA= 1444 -IHdh 1445 -IHBhc3NlZA== 1446 -IHJ1bg== 1447 -IG1ha2luZw== 1448 -IHN0cmFuZ2U= 1449 -IHRha2luZw== 1450 -IGVhY2g= 1451 -IllvdQ== 1452 -IGFub3RoZXI= 1453 -IlNheQ== 1454 -IlRoZQ== 1455 -YXRlcw== 1456 -IHBsZWFz 1457 -YXNzaG9wcGVycw== 1458 -IG1vbQ== 1459 -IG1vbWVudA== 1460 -ZW50bGU= 1461 -bmdsaXNo 1462 -Q0hB 1463 -IG9yaWdpbmFs 1464 -aW9ucw== 1465 -dXJpbmc= 1466 -IHB1YmxpYw== 1467 -dWN0 1468 -dWNr 1469 -IHF1ZXN0aW9u 1470 -YWk= 1471 -Y3k= 1472 -ZWs= 1473 -IGZsb29y 1474 -IGNhcg== 1475 -b3VzZQ== 1476 -IHNpZGU= 1477 -LXlh 1478 -IGNlcnRhaW4= 1479 -aHlz 1480 -LWQ= 1481 -aWdo 1482 -YWdpbg== 1483 -d2VldA== 1484 -IHBvb3I= 1485 -IGRlY2lk 1486 -dWFsbHk= 1487 -IGJ1c2luZXNz 1488 -cHJv 1489 -cGxhaW4= 1490 -IHN0b3A= 1491 -IQo= 1492 -IEhvdw== 1493 -IldoYXQ= 1494 -Y2Fu 1495 -IFVu 1496 -cHM= 1497 -dW5k 1498 -LW5pZ2h0 1499 -IG1lZXRpbmc= 1500 -ZWRv 1501 -IHJhaXNl 1502 -R3V0ZW5iZXJn 1503 -IERhcmxpbmc= 1504 -dW1l 1505 -IEVuZ2xpc2g= 1506 -VEVS 1507 -YWRpbmc= 1508 -IHRyYW5zbA== 1509 -IGFibGU= 1510 -c3NpYmxl 1511 -IHNhdGlzZg== 1512 -IHdhbnRlZA== 1513 -IHN1Yg== 1514 -IGNhc2U= 1515 -aWZpYw== 1516 -aXRlcmFyeQ== 1517 -IG1haWQ= 1518 -IGluYw== 1519 -IHBvcw== 1520 -IHBvc2l0aW9u 1521 -IHBhdA== 1522 -dXJlZA== 1523 -b3JyeQ== 1524 -IGFjY291bnQ= 1525 -IGJvdGg= 1526 -IGZyaWU= 1527 -IGZyaWVuZA== 1528 -dGhpcw== 1529 -IGFsd2F5cw== 1530 -IHBhcnRpY3Vs 1531 -V2hhdA== 1532 -IHNtYWxs 1533 -ZW50eQ== 1534 -dXNoZWQ= 1535 -IG1pcw== 1536 -dWxseQ== 1537 -IHJlY2Vp 1538 -WW91 1539 -IHlldA== 1540 -IGdhdmU= 1541 -QnV0 1542 -aGFk 1543 -IGFuc3dlcg== 1544 -IGFicw== 1545 -aWxl 1546 -Y2tldA== 1547 -IG5vb2Q= 1548 -IGNvdXJzZQ== 1549 -IGZvcm0= 1550 -IGV2ZXJ5dGhpbmc= 1551 -ZWN0aW9u 1552 -SWY= 1553 -cGFydA== 1554 -IHNpbmc= 1555 -IHNpdA== 1556 -IHB1cg== 1557 -aXA= 1558 -IGZpc2hpbmc= 1559 -IGVo 1560 -IHBhcg== 1561 -IHRvZ2V0aGVy 1562 -SGU= 1563 -IHdoZQ== 1564 -IHdoZXRoZXI= 1565 -IGJyYQ== 1566 -Illlcw== 1567 -IHB1bmlzaA== 1568 -U2hpcnQ= 1569 -IFllZG8= 1570 -IGZhcmV3 1571 -IGZhcmV3ZWxs 1572 -IGRhbmNl 1573 -IGxlc3M= 1574 -dXJhbA== 1575 -IGRlZg== 1576 -IGF0dGVtcHQ= 1577 -d2Vlbg== 1578 -IHNpZ24= 1579 -IHN5 1580 -ZmVyZW50 1581 -IGxlYXN0 1582 -c2Vy 1583 -b2I= 1584 -bmRpbmc= 1585 -IHNvcnJ5 1586 -IGp1bXBlZA== 1587 -IGphbg== 1588 -IGphbml0b3I= 1589 -aXplZA== 1590 -IHRvd2FyZA== 1591 -IG1vcg== 1592 -YXZpbmc= 1593 -IGJpdA== 1594 -IlRoaXM= 1595 -IHJlbWFyaw== 1596 -IGZ1dA== 1597 -IHdvbmRlcg== 1598 -IGZ1bg== 1599 -VGhlbg== 1600 -IGRlYw== 1601 -IHdob20= 1602 -IGRpZG4= 1603 -IHJlYw== 1604 -YmVj 1605 -Iklm 1606 -IGtuZXc= 1607 -YWZ0ZXI= 1608 -IHRodXM= 1609 -IGlzbg== 1610 -IHNpZ2h0 1611 -bWVk 1612 -W0Y= 1613 -dXNz 1614 -Y2lkZW50 1615 -dGhlbQ== 1616 -IGZpZg== 1617 -IGRyYXc= 1618 -IGhlYXI= 1619 -IHdyaXRpbmc= 1620 -IGdldHRpbmc= 1621 -c2g= 1622 -ZmVyZW5jZQ== 1623 -IHJhaXNlZA== 1624 -dGhleQ== 1625 -YXg= 1626 -IGZpbmU= 1627 -c2Vs 1628 
-IE5vYmU= 1629 -IE5vYmVvaw== 1630 -IE5vYmVva2E= 1631 -b3JtYWw= 1632 -IGVC 1633 -aWNlbnNl 1634 -MDA= 1635 -IGJlc3Q= 1636 -d29y 1637 -Zmlj 1638 -dGVyZXN0 1639 -IHJlbWFy 1640 -Ymw= 1641 -YXJ0ZWQ= 1642 -IGRhcms= 1643 -IHlvdW5n 1644 -dXNo 1645 -IGJldA== 1646 -b3V0aA== 1647 -aG91c2U= 1648 -YXVnaHQ= 1649 -IHBoeXM= 1650 -IHN0cm9uZw== 1651 -IGZ1cg== 1652 -IHJvbGw= 1653 -Y292ZQ== 1654 -Y2hpZWY= 1655 -YXdh 1656 -IGZvbGxvd2Vk 1657 -IGZvbmQ= 1658 -IGZ1dHVyZQ== 1659 -aXJk 1660 -ZnVsbHk= 1661 -IGVmZm9ydA== 1662 -QWZ0ZXI= 1663 -b3dhcmQ= 1664 -IHJlYWxseQ== 1665 -IGFtb25n 1666 -IGFyb3VuZA== 1667 -IGNvbXBs 1668 -IGdheg== 1669 -IGJvdw== 1670 -YXRlcg== 1671 -IGluc2lzdA== 1672 -IHR1cm5lZA== 1673 -aGVs 1674 -cmVt 1675 -IGhvdXJz 1676 -IGRlY2lkZWQ= 1677 -eXM= 1678 -IG1vbnRo 1679 -LWE= 1680 -IGFkdg== 1681 -IGJlbGlldmU= 1682 -IHRlYWNoaW5n 1683 -IGVhc3k= 1684 -IGRpcmVjdGlvbg== 1685 -b29rZWQ= 1686 -IHdhcg== 1687 -IHVubGVzcw== 1688 -aGF2ZQ== 1689 -IHNxdWFyZQ== 1690 -dmls 1691 -IHF1aWV0 1692 -IGh1bmc= 1693 -IGdvZXM= 1694 -IHBhaWQ= 1695 -IHNoYWxs 1696 -Ik5v 1697 -IHB1bmlzaG1lbnQ= 1698 -cG9zZQ== 1699 -IHN3ZWV0 1700 -J3Zl 1701 -IldlbGw= 1702 -IGdlbnRsZQ== 1703 -IG5vcm1hbA== 1704 -YWdyYXBo 1705 -Y2hpdmU= 1706 -Y2hhbg== 1707 -IGluY2x1ZA== 1708 -d3c= 1709 -b3Jn 1710 -dGVt 1711 -QVI= 1712 -IFRI 1713 -IGVxdQ== 1714 -IHRvbmU= 1715 -IHBvc3NpYmxl 1716 -IGJlY29t 1717 -IEphcGFuZXNl 1718 -dmVycw== 1719 -IGZvbGxvd2luZw== 1720 -IHBhaW4= 1721 -IHdob2xl 1722 -d3I= 1723 -IHNlcmlvdXM= 1724 -IG5hcg== 1725 -IHRpcmVk 1726 -SW4= 1727 -IHBsYXk= 1728 -IHByb20= 1729 -IGdhbWU= 1730 -IFNvbWU= 1731 -IGhhcHBlbmVk 1732 -IGN1dA== 1733 -IHR3ZW50eQ== 1734 -IGRvb3I= 1735 -IG1vcm5pbmc= 1736 -aGluZA== 1737 -IGJyZQ== 1738 -IGluc2lkZQ== 1739 -b3Zl 1740 -YWx0aA== 1741 -dWs= 1742 -YXJnZQ== 1743 -YW1i 1744 -IGRhbQ== 1745 -IHdvcnJ5 1746 -YXRpdmU= 1747 -IGV4cGVjdGVk 1748 -IGZhbQ== 1749 -IHByYQ== 1750 -IHBvY2tldA== 1751 -b29rcw== 1752 -Y2hlZA== 1753 -IHNpbA== 1754 -b2w= 1755 -IGZhdg== 1756 -IGVsc2U= 1757 -IGhpZ2g= 1758 -IHJlYWw= 1759 -IGFsb25n 1760 -IG1lZA== 1761 -aGlr 1762 -aGVtYXQ= 1763 -aGVtYXRpY3M= 1764 -IGxpc3Q= 1765 -IHNpY2s= 1766 -b2ludA== 1767 -W0Zvb3Q= 1768 -W0Zvb3Rub3Q= 1769 -W0Zvb3Rub3Rl 1770 -Ll0K 1771 -bmlnaHQ= 1772 -c2Vz 1773 -aW9y 1774 -IHNheXM= 1775 -IG1vdXRo 1776 -aG93 1777 -bWluZw== 1778 -IGNsbw== 1779 -IGN1cg== 1780 -Z2luZw== 1781 -IHN1ZGRlbmx5 1782 -LWFo 1783 -YW1w 1784 -IGJsYWNr 1785 -cm9zcw== 1786 -IGZhYw== 1787 -c2VsdmVz 1788 -aWV3 1789 -aXNzaW9u 1790 -IGNvcHlyaWdodA== 1791 -IHBhcmFncmFwaA== 1792 -IEFyY2hpdmU= 1793 -IGRvbmF0aW9ucw== 1794 -UHJvamVjdA== 1795 -IGNvc3Q= 1796 -Lm9yZw== 1797 -TEk= 1798 -dWNlZA== 1799 -IHN1Yw== 1800 -eWxl 1801 -IGZvcmNl 1802 -am95 1803 -b3VjaA== 1804 -dHI= 1805 -SXQ= 1806 -IHRyYWQ= 1807 -IHByZXNlbnQ= 1808 -IGV4dA== 1809 -YXNlZA== 1810 -cmVkaXQ= 1811 -IGZhdWx0 1812 -aWI= 1813 -LW0= 1814 -dXJk 1815 -IHRyaWVk 1816 -dGltZQ== 1817 -IHByZXQ= 1818 -IHNwZWU= 1819 -b3dlcg== 1820 -IHdvcmRz 1821 -Q0hBUA== 1822 -Q0hBUFRFUg== 1823 -c2Nob29s 1824 -IGFzaw== 1825 -IGRvaW5n 1826 -YXRlbHk= 1827 -IHVudGls 1828 -Ym91dA== 1829 -IHRyZWU= 1830 -Y2FsbA== 1831 -YW1hc2g= 1832 -YW1hc2hpcg== 1833 -YW1hc2hpcm8= 1834 -c3Rl 1835 -IGJlaGluZA== 1836 -b2xk 1837 -IHdhbGw= 1838 -aXRvcnk= 1839 -IHJvbGxlZA== 1840 -IG1vdmU= 1841 -IGFwb2xvZ2l6ZQ== 1842 -IGxhcmdl 1843 -YW1ib28= 1844 -c3U= 1845 -IHNldHRsZWQ= 1846 -Ikhl 1847 -d28= 1848 -IHRoaW5raW5n 1849 -dXNlZA== 1850 -aWZpZWQ= 1851 -IGFsbW9zdA== 1852 -IHRyZQ== 1853 -IHRyZWF0 1854 -IG5vb2RsZQ== 1855 -IG5vdGU= 1856 -IEFsbA== 1857 -IGJlYXQ= 1858 -IG9iamVjdA== 1859 -IHNlZW1z 1860 -IGlkZQ== 1861 -WWVz 1862 
-b3dz 1863 -IHJlbWFpbg== 1864 -IGJlZ2lu 1865 -dWdodA== 1866 -bWVudHM= 1867 -IGFsb25l 1868 -c3BlY3Q= 1869 -IG1hdGhlbWF0aWNz 1870 -IHJvdWdo 1871 -IG91dHNpZGU= 1872 -IGNvbWVz 1873 -YmFjaw== 1874 -IHdpbmQ= 1875 -c2Vk 1876 -IHdvdWxkbg== 1877 -ZWVy 1878 -aW51dA== 1879 -ZnJvbQ== 1880 -IHJlcGw= 1881 -IG5hcnJvdw== 1882 -IGluY2lkZW50 1883 -IGFpcg== 1884 -IHNlYQ== 1885 -dHM= 1886 -IHN1cnByaXNlZA== 1887 -IHRlYQ== 1888 -UmVk 1889 -IHRhbGtpbmc= 1890 -IGJvc3M= 1891 -cXVl 1892 -IHBpY3Q= 1893 -aXJ0eQ== 1894 -IGNl 1895 -IGxpbQ== 1896 -IFdoeQ== 1897 -IHBvaW50 1898 -IGxhdw== 1899 -Y2lhdGVk 1900 -IG1vb24= 1901 -aXJjdQ== 1902 -Z290 1903 -IElz 1904 -IGhhbmRz 1905 -IGhvbm9y 1906 -YXV0 1907 -cmdl 1908 -IHN0YXRl 1909 -IExpdGVyYXJ5 1910 -LkY= 1911 -VGhpcw== 1912 -bGluZQ== 1913 -Lmc= 1914 -Lmd1dGVuYmVyZw== 1915 -IE9G 1916 -RU4= 1917 -cmFjdGVy 1918 -IGJlbmU= 1919 -IEV2ZW4= 1920 -b3Vi 1921 -IG1ha2Vz 1922 -IGludGVyZXN0 1923 -b3Bl 1924 -bXM= 1925 -IHJlc3BvbnM= 1926 -IGZvcmU= 1927 -IHNvbWV3aGF0 1928 -IGhvbmVzdA== 1929 -b2Nr 1930 -aXJpdA== 1931 -IGhlbGQ= 1932 -IGFkZGVk 1933 -ZnU= 1934 -YWRlZA== 1935 -YWxz 1936 -YXR0 1937 -dGVybg== 1938 -IHBlcnNvbmFs 1939 -IGFzcw== 1940 -IFdpdGg= 1941 -dGlj 1942 -VG9reW8= 1943 -IHNob3V0 1944 -IHByZXR0eQ== 1945 -dW1i 1946 -IGVhcmx5 1947 -b3BwZWQ= 1948 -IGZ1cnRoZXI= 1949 -IGZyZQ== 1950 -ZXNpZGVz 1951 -IGJhbWJvbw== 1952 -IGly 1953 -bW9yZQ== 1954 -IGxpdmluZw== 1955 -IHJlY2VpdmVk 1956 -IGxpdmVk 1957 -IG1lYW50 1958 -IGNvd2FyZA== 1959 -cG9zaXRpb24= 1960 -IGxvYw== 1961 -aWxlZA== 1962 -IHRlbmRlcg== 1963 -IGNo 1964 -IEFmdGVy 1965 -Y2Vy 1966 -IGZhdm9y 1967 -d2hv 1968 -IGxpa2Vk 1969 -cmFuY2U= 1970 -IHByaQ== 1971 -a2lzaGE= 1972 -IHN0dWR5 1973 -IG9yZGVy 1974 -IGFmdGVyd2FyZA== 1975 -IGdyZWF0bHk= 1976 -IHVuYWJsZQ== 1977 -Z28= 1978 -IHdhaXQ= 1979 -ZXBpbmc= 1980 -aWRpbmc= 1981 -IGZvcnR5 1982 -IHNreQ== 1983 -IG9mZmljZQ== 1984 -d2lsbA== 1985 -IkQ= 1986 -d2Vs 1987 -IHN0YXRpb24= 1988 -Ym8= 1989 -aG90 1990 -c3VjaA== 1991 -IGxvdWQ= 1992 -IGF3 1993 -bGFuZA== 1994 -Pwo= 1995 -IHJlc3BlY3Q= 1996 -YW5jZXM= 1997 -aWVudA== 1998 -IG91Z2h0 1999 diff --git a/tests/assets/tokenizer/tokenizer.json b/tests/assets/tokenizer/tokenizer.json new file mode 100644 index 000000000..a39c930b4 --- /dev/null +++ b/tests/assets/tokenizer/tokenizer.json @@ -0,0 +1,2037 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "\u0000": 0, + "\u0001": 1, + "\u0002": 2, + "\u0003": 3, + "\u0004": 4, + "\u0005": 5, + "\u0006": 6, + "\u0007": 7, + "\b": 8, + "\t": 9, + "\n": 10, + "\u000b": 11, + "\f": 12, + "\r": 13, + "\u000e": 14, + "\u000f": 15, + "\u0010": 16, + "\u0011": 17, + "\u0012": 18, + "\u0013": 19, + "\u0014": 20, + "\u0015": 21, + "\u0016": 22, + "\u0017": 23, + "\u0018": 24, + "\u0019": 25, + "\u001a": 26, + "\u001b": 27, + "\u001c": 28, + "\u001d": 29, + "\u001e": 30, + "\u001f": 31, + " ": 32, + "!": 33, + "\"": 34, + "#": 35, + "$": 36, + "%": 37, + "&": 38, + "'": 39, + "(": 40, + ")": 41, + "*": 
42, + "+": 43, + ",": 44, + "-": 45, + ".": 46, + "/": 47, + "0": 48, + "1": 49, + "2": 50, + "3": 51, + "4": 52, + "5": 53, + "6": 54, + "7": 55, + "8": 56, + "9": 57, + ":": 58, + ";": 59, + "<": 60, + "=": 61, + ">": 62, + "?": 63, + "@": 64, + "A": 65, + "B": 66, + "C": 67, + "D": 68, + "E": 69, + "F": 70, + "G": 71, + "H": 72, + "I": 73, + "J": 74, + "K": 75, + "L": 76, + "M": 77, + "N": 78, + "O": 79, + "P": 80, + "Q": 81, + "R": 82, + "S": 83, + "T": 84, + "U": 85, + "V": 86, + "W": 87, + "X": 88, + "Y": 89, + "Z": 90, + "[": 91, + "\\": 92, + "]": 93, + "^": 94, + "_": 95, + "`": 96, + "a": 97, + "b": 98, + "c": 99, + "d": 100, + "e": 101, + "f": 102, + "g": 103, + "h": 104, + "i": 105, + "j": 106, + "k": 107, + "l": 108, + "m": 109, + "n": 110, + "o": 111, + "p": 112, + "q": 113, + "r": 114, + "s": 115, + "t": 116, + "u": 117, + "v": 118, + "w": 119, + "x": 120, + "y": 121, + "z": 122, + "{": 123, + "|": 124, + "}": 125, + "~": 126, + "": 127, + "\\x80": 128, + "\\x81": 129, + "\\x82": 130, + "\\x83": 131, + "\\x84": 132, + "\\x85": 133, + "\\x86": 134, + "\\x87": 135, + "\\x88": 136, + "\\x89": 137, + "\\x8a": 138, + "\\x8b": 139, + "\\x8c": 140, + "\\x8d": 141, + "\\x8e": 142, + "\\x8f": 143, + "\\x90": 144, + "\\x91": 145, + "\\x92": 146, + "\\x93": 147, + "\\x94": 148, + "\\x95": 149, + "\\x96": 150, + "\\x97": 151, + "\\x98": 152, + "\\x99": 153, + "\\x9a": 154, + "\\x9b": 155, + "\\x9c": 156, + "\\x9d": 157, + "\\x9e": 158, + "\\x9f": 159, + "\\xa0": 160, + "\\xa1": 161, + "\\xa2": 162, + "\\xa3": 163, + "\\xa4": 164, + "\\xa5": 165, + "\\xa6": 166, + "\\xa7": 167, + "\\xa8": 168, + "\\xa9": 169, + "\\xaa": 170, + "\\xab": 171, + "\\xac": 172, + "\\xad": 173, + "\\xae": 174, + "\\xaf": 175, + "\\xb0": 176, + "\\xb1": 177, + "\\xb2": 178, + "\\xb3": 179, + "\\xb4": 180, + "\\xb5": 181, + "\\xb6": 182, + "\\xb7": 183, + "\\xb8": 184, + "\\xb9": 185, + "\\xba": 186, + "\\xbb": 187, + "\\xbc": 188, + "\\xbd": 189, + "\\xbe": 190, + "\\xbf": 191, + "\\xc0": 192, + "\\xc1": 193, + "\\xc2": 194, + "\\xc3": 195, + "\\xc4": 196, + "\\xc5": 197, + "\\xc6": 198, + "\\xc7": 199, + "\\xc8": 200, + "\\xc9": 201, + "\\xca": 202, + "\\xcb": 203, + "\\xcc": 204, + "\\xcd": 205, + "\\xce": 206, + "\\xcf": 207, + "\\xd0": 208, + "\\xd1": 209, + "\\xd2": 210, + "\\xd3": 211, + "\\xd4": 212, + "\\xd5": 213, + "\\xd6": 214, + "\\xd7": 215, + "\\xd8": 216, + "\\xd9": 217, + "\\xda": 218, + "\\xdb": 219, + "\\xdc": 220, + "\\xdd": 221, + "\\xde": 222, + "\\xdf": 223, + "\\xe0": 224, + "\\xe1": 225, + "\\xe2": 226, + "\\xe3": 227, + "\\xe4": 228, + "\\xe5": 229, + "\\xe6": 230, + "\\xe7": 231, + "\\xe8": 232, + "\\xe9": 233, + "\\xea": 234, + "\\xeb": 235, + "\\xec": 236, + "\\xed": 237, + "\\xee": 238, + "\\xef": 239, + "\\xf0": 240, + "\\xf1": 241, + "\\xf2": 242, + "\\xf3": 243, + "\\xf4": 244, + "\\xf5": 245, + "\\xf6": 246, + "\\xf7": 247, + "\\xf8": 248, + "\\xf9": 249, + "\\xfa": 250, + "\\xfb": 251, + "\\xfc": 252, + "\\xfd": 253, + "\\xfe": 254, + "\\xff": 255, + " t": 256, + "he": 257, + " a": 258, + "in": 259, + " s": 260, + " w": 261, + " the": 262, + " o": 263, + "re": 264, + " b": 265, + "ou": 266, + "ed": 267, + " m": 268, + "nd": 269, + " I": 270, + "ha": 271, + "it": 272, + "er": 273, + "ing": 274, + " f": 275, + "is": 276, + " to": 277, + "en": 278, + "on": 279, + "or": 280, + "as": 281, + " c": 282, + " of": 283, + " and": 284, + " d": 285, + "ll": 286, + "at": 287, + "an": 288, + "ar": 289, + " p": 290, + " n": 291, + " in": 292, + "le": 293, + "om": 294, + "ot": 295, + " be": 
296, + " h": 297, + "ut": 298, + "ow": 299, + "es": 300, + "hat": 301, + " g": 302, + " he": 303, + " ha": 304, + " l": 305, + " was": 306, + "ld": 307, + "gh": 308, + "id": 309, + "ch": 310, + " th": 311, + " it": 312, + "ay": 313, + " on": 314, + "ce": 315, + "se": 316, + "ent": 317, + " st": 318, + "ly": 319, + "ve": 320, + "et": 321, + "st": 322, + " T": 323, + " e": 324, + " y": 325, + "ght": 326, + "ir": 327, + " me": 328, + "oo": 329, + "al": 330, + "ith": 331, + " re": 332, + "im": 333, + " that": 334, + " as": 335, + "ould": 336, + "ro": 337, + "ad": 338, + "ion": 339, + ".\n": 340, + "her": 341, + " my": 342, + "ct": 343, + " not": 344, + " with": 345, + " for": 346, + " u": 347, + "ke": 348, + " you": 349, + " S": 350, + " is": 351, + "ight": 352, + "\"\n": 353, + "am": 354, + "ic": 355, + "ur": 356, + " at": 357, + "..": 358, + "ac": 359, + "ter": 360, + " wh": 361, + " an": 362, + " we": 363, + " The": 364, + "if": 365, + " or": 366, + " but": 367, + "ver": 368, + " \"": 369, + " r": 370, + "out": 371, + "ome": 372, + " had": 373, + "pp": 374, + "qu": 375, + " su": 376, + " this": 377, + "red": 378, + "ard": 379, + " so": 380, + "ell": 381, + " would": 382, + " his": 383, + " sh": 384, + "ine": 385, + "ra": 386, + " se": 387, + " by": 388, + ".\"\n": 389, + " P": 390, + "hen": 391, + " A": 392, + " have": 393, + " fr": 394, + " sa": 395, + " H": 396, + " one": 397, + "em": 398, + "ked": 399, + "irt": 400, + "ect": 401, + " him": 402, + " li": 403, + " ab": 404, + "ation": 405, + "hing": 406, + "the": 407, + " R": 408, + " le": 409, + "ss": 410, + " W": 411, + "cu": 412, + "ill": 413, + "'t": 414, + "art": 415, + "all": 416, + ",\n": 417, + "own": 418, + "ore": 419, + " all": 420, + " k": 421, + " go": 422, + "hirt": 423, + "and": 424, + " out": 425, + "ame": 426, + "ain": 427, + " if": 428, + " no": 429, + " do": 430, + " they": 431, + "ool": 432, + "un": 433, + "to": 434, + " up": 435, + " Red": 436, + " ne": 437, + " K": 438, + " from": 439, + " Shirt": 440, + " wor": 441, + "ong": 442, + " there": 443, + " said": 444, + "ri": 445, + "ant": 446, + " B": 447, + " any": 448, + "ud": 449, + "ind": 450, + " whi": 451, + "ab": 452, + "ound": 453, + " about": 454, + " them": 455, + "cup": 456, + "ak": 457, + " de": 458, + " te": 459, + " M": 460, + "ake": 461, + "cupine": 462, + "ig": 463, + " were": 464, + "orcupine": 465, + "il": 466, + "chool": 467, + " ro": 468, + "ood": 469, + " are": 470, + "ive": 471, + " like": 472, + "yo": 473, + " hou": 474, + "'s": 475, + "one": 476, + "us": 477, + "el": 478, + "ul": 479, + "ack": 480, + "op": 481, + ",\"": 482, + "th": 483, + "acher": 484, + "um": 485, + "ang": 486, + " fa": 487, + "ag": 488, + " school": 489, + " j": 490, + "te": 491, + "ok": 492, + "ess": 493, + "ust": 494, + "ers": 495, + "....": 496, + " C": 497, + "ther": 498, + "han": 499, + " when": 500, + " sp": 501, + " man": 502, + " can": 503, + "ough": 504, + " who": 505, + " get": 506, + " did": 507, + " po": 508, + "ci": 509, + " al": 510, + "ist": 511, + " com": 512, + "lf": 513, + "au": 514, + " Porcupine": 515, + " which": 516, + "ven": 517, + " af": 518, + "wn": 519, + "ass": 520, + "ber": 521, + " ex": 522, + "ous": 523, + "est": 524, + "lo": 525, + " tr": 526, + "ellow": 527, + " say": 528, + "ought": 529, + " room": 530, + " some": 531, + "--": 532, + " O": 533, + "ate": 534, + " v": 535, + "hed": 536, + "ap": 537, + " tw": 538, + " bec": 539, + "ree": 540, + "ject": 541, + "ks": 542, + " con": 543, + " been": 544, + "ents": 545, + "ide": 546, + " could": 547, + " 
G": 548, + "ep": 549, + " pro": 550, + "nt": 551, + " house": 552, + " ag": 553, + " If": 554, + " kn": 555, + " fellow": 556, + " what": 557, + "way": 558, + "ish": 559, + " am": 560, + "ite": 561, + "nder": 562, + "ime": 563, + " pr": 564, + " teacher": 565, + "are": 566, + " bo": 567, + " she": 568, + " N": 569, + "ice": 570, + "ast": 571, + "ure": 572, + "ie": 573, + " such": 574, + "uten": 575, + "utenber": 576, + "utenberg": 577, + " qu": 578, + "lown": 579, + " wr": 580, + "pt": 581, + " He": 582, + " stud": 583, + "here": 584, + " more": 585, + "ry": 586, + "tter": 587, + " Y": 588, + " may": 589, + "ity": 590, + " loo": 591, + " other": 592, + "his": 593, + " Pro": 594, + " will": 595, + " It": 596, + "ort": 597, + " should": 598, + "very": 599, + "we": 600, + " pl": 601, + "ash": 602, + ".\"": 603, + " app": 604, + " day": 605, + "urn": 606, + "po": 607, + " her": 608, + " ": 609, + "not": 610, + "ck": 611, + " un": 612, + "hi": 613, + "ving": 614, + " old": 615, + " time": 616, + "\"T": 617, + " way": 618, + "able": 619, + "?\"\n": 620, + " Clown": 621, + " only": 622, + "ub": 623, + "ach": 624, + " off": 625, + " than": 626, + "ally": 627, + " their": 628, + "be": 629, + "king": 630, + "other": 631, + "ary": 632, + "ans": 633, + "ated": 634, + "self": 635, + " going": 636, + "uch": 637, + "oll": 638, + " back": 639, + "iyo": 640, + "-t": 641, + "ance": 642, + "ade": 643, + " Project": 644, + "sp": 645, + " two": 646, + " thought": 647, + "so": 648, + " right": 649, + " head": 650, + "ved": 651, + " D": 652, + " pre": 653, + " see": 654, + " us": 655, + " students": 656, + "cip": 657, + " don": 658, + " night": 659, + "incip": 660, + " Kiyo": 661, + "pl": 662, + "ared": 663, + " Gutenberg": 664, + " co": 665, + " how": 666, + "omet": 667, + "ff": 668, + "\"I": 669, + ",--": 670, + " asked": 671, + "incipal": 672, + "ever": 673, + " ac": 674, + " F": 675, + " make": 676, + "itt": 677, + " might": 678, + "ge": 679, + "led": 680, + " after": 681, + "ign": 682, + " gr": 683, + " made": 684, + "dd": 685, + " know": 686, + " come": 687, + " br": 688, + "thing": 689, + " But": 690, + " mat": 691, + " On": 692, + "ory": 693, + "cl": 694, + " E": 695, + "ble": 696, + "og": 697, + " your": 698, + "ull": 699, + " work": 700, + "ear": 701, + " three": 702, + "ied": 703, + "but": 704, + "The": 705, + "pe": 706, + "ace": 707, + " start": 708, + "ick": 709, + " over": 710, + "our": 711, + " much": 712, + " want": 713, + "imp": 714, + " part": 715, + "ho": 716, + "ink": 717, + "ence": 718, + " down": 719, + " even": 720, + " principal": 721, + "ling": 722, + "ount": 723, + "ause": 724, + " cl": 725, + " bl": 726, + "-tm": 727, + "omething": 728, + " into": 729, + "orm": 730, + "okyo": 731, + " dis": 732, + " fe": 733, + " face": 734, + "......": 735, + "ress": 736, + "ment": 737, + "ire": 738, + " ar": 739, + "ty": 740, + " mo": 741, + "reat": 742, + " fir": 743, + "per": 744, + " our": 745, + "co": 746, + " then": 747, + " told": 748, + "ings": 749, + " take": 750, + " beg": 751, + "ner": 752, + "ition": 753, + "ose": 754, + " own": 755, + " again": 756, + " seem": 757, + "ise": 758, + " wat": 759, + "\"W": 760, + " far": 761, + "aking": 762, + "fore": 763, + "ady": 764, + "-s": 765, + "less": 766, + " ret": 767, + " sha": 768, + " came": 769, + "ger": 770, + " good": 771, + "ather": 772, + "ark": 773, + "row": 774, + " ke": 775, + "'m": 776, + " has": 777, + "ath": 778, + "pped": 779, + " went": 780, + " tell": 781, + "quash": 782, + " en": 783, + " first": 784, + " hot": 785, + "iz": 786, + 
" away": 787, + " something": 788, + " rem": 789, + " town": 790, + " sm": 791, + " This": 792, + " better": 793, + " Then": 794, + "was": 795, + "of": 796, + "bard": 797, + " L": 798, + "li": 799, + "fe": 800, + " Tokyo": 801, + " long": 802, + "ily": 803, + " sure": 804, + " looked": 805, + "ubbard": 806, + "ction": 807, + "ord": 808, + " many": 809, + "ious": 810, + " too": 811, + " here": 812, + "os": 813, + " under": 814, + "ase": 815, + "ng": 816, + "ped": 817, + "od": 818, + "me": 819, + " just": 820, + " now": 821, + "ince": 822, + " heard": 823, + " kind": 824, + " They": 825, + " before": 826, + "hy": 827, + " In": 828, + " ent": 829, + " board": 830, + "!\"": 831, + "ward": 832, + " being": 833, + " well": 834, + "erm": 835, + "ried": 836, + " wrong": 837, + "aid": 838, + "xt": 839, + " return": 840, + "ited": 841, + " yen": 842, + " matter": 843, + " call": 844, + " tal": 845, + " You": 846, + "ced": 847, + "ised": 848, + " cha": 849, + "ons": 850, + " same": 851, + " once": 852, + "day": 853, + "ft": 854, + " sw": 855, + " because": 856, + " think": 857, + " where": 858, + " No": 859, + " Hubbard": 860, + " Squash": 861, + " cop": 862, + "with": 863, + "ered": 864, + "ollow": 865, + " place": 866, + "idd": 867, + "cess": 868, + " show": 869, + "isha": 870, + " ra": 871, + " letter": 872, + "ne": 873, + "ves": 874, + "ating": 875, + "rang": 876, + " aff": 877, + " hand": 878, + " sc": 879, + " pers": 880, + "int": 881, + "pr": 882, + "side": 883, + "fter": 884, + " saying": 885, + " lau": 886, + "that": 887, + " without": 888, + "ron": 889, + "air": 890, + "lect": 891, + " What": 892, + "elt": 893, + " while": 894, + "oga": 895, + "aper": 896, + " pe": 897, + "oy": 898, + " sat": 899, + "ies": 900, + " add": 901, + " days": 902, + " spe": 903, + " ho": 904, + " ans": 905, + " har": 906, + " When": 907, + " anything": 908, + "pen": 909, + "]\n": 910, + "tain": 911, + " must": 912, + " new": 913, + "lic": 914, + " vo": 915, + "hile": 916, + "get": 917, + " As": 918, + " very": 919, + "'re": 920, + " every": 921, + "ave": 922, + "?\"": 923, + "adger": 924, + " Koga": 925, + " Mr": 926, + "rough": 927, + "ult": 928, + " follow": 929, + "ting": 930, + "ife": 931, + "iddle": 932, + "ful": 933, + "ank": 934, + " So": 935, + " seemed": 936, + " And": 937, + "ix": 938, + " set": 939, + " care": 940, + " res": 941, + " never": 942, + " found": 943, + " lo": 944, + "cid": 945, + "ined": 946, + " class": 947, + " myself": 948, + "aw": 949, + " wom": 950, + "ations": 951, + " left": 952, + " We": 953, + " teachers": 954, + "\"Y": 955, + "na": 956, + "ont": 957, + " des": 958, + " those": 959, + "ired": 960, + " sen": 961, + "ying": 962, + " these": 963, + "az": 964, + " There": 965, + "cept": 966, + " dang": 967, + " U": 968, + "\"H": 969, + "bod": 970, + "body": 971, + " having": 972, + "alary": 973, + " watch": 974, + " give": 975, + "age": 976, + " its": 977, + " appe": 978, + "ue": 979, + " count": 980, + " hard": 981, + " bel": 982, + "ott": 983, + " dist": 984, + "\"S": 985, + " Mad": 986, + "-n": 987, + "ribut": 988, + "ged": 989, + " att": 990, + "fere": 991, + "ither": 992, + " upon": 993, + " tem": 994, + " person": 995, + "ning": 996, + " che": 997, + "arly": 998, + "oney": 999, + " soon": 1000, + "ement": 1001, + " (": 1002, + " trans": 1003, + " exp": 1004, + " ser": 1005, + " reg": 1006, + "ason": 1007, + " saw": 1008, + " next": 1009, + "oot": 1010, + " half": 1011, + " took": 1012, + " bad": 1013, + " hour": 1014, + " salary": 1015, + " began": 1016, + "right": 1017, + 
"onna": 1018, + "-san": 1019, + " works": 1020, + " J": 1021, + "form": 1022, + "ical": 1023, + " tra": 1024, + "man": 1025, + " nothing": 1026, + " still": 1027, + "ears": 1028, + " supp": 1029, + " turn": 1030, + " felt": 1031, + " woman": 1032, + " started": 1033, + "ouble": 1034, + "ura": 1035, + "ishing": 1036, + ":\n": 1037, + "lectron": 1038, + "lectronic": 1039, + "ook": 1040, + " copy": 1041, + " full": 1042, + "cond": 1043, + "mat": 1044, + " middle": 1045, + " look": 1046, + " comm": 1047, + "wered": 1048, + " became": 1049, + " fellows": 1050, + "would": 1051, + " got": 1052, + " gl": 1053, + " gu": 1054, + " keep": 1055, + " ge": 1056, + " Madonna": 1057, + "iter": 1058, + "ished": 1059, + " underst": 1060, + " stra": 1061, + "sid": 1062, + " country": 1063, + "ople": 1064, + " prov": 1065, + " put": 1066, + "no": 1067, + "'ll": 1068, + " sle": 1069, + "range": 1070, + " She": 1071, + "pos": 1072, + " mind": 1073, + " pass": 1074, + " through": 1075, + " quite": 1076, + " ind": 1077, + " boarding": 1078, + "teacher": 1079, + "ple": 1080, + "Porcupine": 1081, + " ple": 1082, + " geisha": 1083, + " ": 1084, + "ost": 1085, + "ense": 1086, + "No": 1087, + "ible": 1088, + " read": 1089, + " red": 1090, + "ention": 1091, + "ened": 1092, + "!\"\n": 1093, + " ref": 1094, + " ad": 1095, + " fl": 1096, + " stay": 1097, + "up": 1098, + " round": 1099, + " cle": 1100, + " open": 1101, + " ob": 1102, + "tend": 1103, + " find": 1104, + " per": 1105, + " called": 1106, + " sur": 1107, + "rew": 1108, + " paper": 1109, + " Badger": 1110, + " meet": 1111, + "iss": 1112, + "\"That": 1113, + "erms": 1114, + "TE": 1115, + "itten": 1116, + "ably": 1117, + "ness": 1118, + " cannot": 1119, + " simp": 1120, + "con": 1121, + " reason": 1122, + "you": 1123, + " home": 1124, + "by": 1125, + " fight": 1126, + "ittle": 1127, + " things": 1128, + " eas": 1129, + " imp": 1130, + "ressed": 1131, + " mean": 1132, + " appeared": 1133, + " nat": 1134, + " hel": 1135, + "ret": 1136, + "aken": 1137, + " straight": 1138, + " affair": 1139, + "iting": 1140, + " ed": 1141, + " since": 1142, + "log": 1143, + " pay": 1144, + " front": 1145, + "my": 1146, + " voice": 1147, + "ready": 1148, + " fool": 1149, + "oundation": 1150, + " electronic": 1151, + " terms": 1152, + " mar": 1153, + "apan": 1154, + "any": 1155, + " resp": 1156, + " end": 1157, + "app": 1158, + "what": 1159, + "str": 1160, + "rap": 1161, + "ial": 1162, + "icul": 1163, + " acc": 1164, + "oth": 1165, + " second": 1166, + " flo": 1167, + " six": 1168, + " feet": 1169, + "br": 1170, + "iet": 1171, + " little": 1172, + "les": 1173, + " money": 1174, + " decl": 1175, + " ey": 1176, + " comp": 1177, + "aring": 1178, + " agre": 1179, + "where": 1180, + " St": 1181, + " stre": 1182, + "ex": 1183, + "ract": 1184, + " int": 1185, + " dire": 1186, + " become": 1187, + " hon": 1188, + " consid": 1189, + "ertain": 1190, + "now": 1191, + " sl": 1192, + "itor": 1193, + "gg": 1194, + " jum": 1195, + " bu": 1196, + " thing": 1197, + " answered": 1198, + "oes": 1199, + "ya": 1200, + " That": 1201, + "ize": 1202, + "ond": 1203, + "act": 1204, + " eff": 1205, + " bang": 1206, + "about": 1207, + " bed": 1208, + "orrow": 1209, + "ung": 1210, + " To": 1211, + " kept": 1212, + " wal": 1213, + " bath": 1214, + " dra": 1215, + "\"A": 1216, + "rings": 1217, + "hopp": 1218, + " resign": 1219, + " din": 1220, + " lady": 1221, + ".E": 1222, + " use": 1223, + "lish": 1224, + "ors": 1225, + " written": 1226, + "ene": 1227, + "iv": 1228, + " dif": 1229, + " ste": 1230, + " story": 
1231, + "com": 1232, + "res": 1233, + "ently": 1234, + " fact": 1235, + "hes": 1236, + "ways": 1237, + " why": 1238, + " though": 1239, + " str": 1240, + "onder": 1241, + "head": 1242, + " cour": 1243, + " mon": 1244, + " sk": 1245, + " belie": 1246, + " let": 1247, + "fer": 1248, + " requ": 1249, + " line": 1250, + "room": 1251, + "-day": 1252, + " done": 1253, + " does": 1254, + " One": 1255, + " dango": 1256, + "asshopp": 1257, + " consider": 1258, + " dinner": 1259, + " Foundation": 1260, + "**": 1261, + "empt": 1262, + "ese": 1263, + " word": 1264, + "rest": 1265, + " enough": 1266, + " great": 1267, + " name": 1268, + " pub": 1269, + " manner": 1270, + "wer": 1271, + "ict": 1272, + "iness": 1273, + " himself": 1274, + " people": 1275, + "ew": 1276, + " cor": 1277, + "estion": 1278, + " big": 1279, + "ee": 1280, + " ri": 1281, + "ides": 1282, + " brother": 1283, + " heart": 1284, + "ected": 1285, + "eed": 1286, + " others": 1287, + "sol": 1288, + "ted": 1289, + " eyes": 1290, + " trouble": 1291, + " teach": 1292, + " boat": 1293, + " four": 1294, + " already": 1295, + "rom": 1296, + "ghed": 1297, + " squ": 1298, + " pol": 1299, + "ces": 1300, + " Hott": 1301, + " leave": 1302, + " distribut": 1303, + "aster": 1304, + "CH": 1305, + "uc": 1306, + " im": 1307, + " however": 1308, + "there": 1309, + "apanese": 1310, + " last": 1311, + " cr": 1312, + "ility": 1313, + " simple": 1314, + " life": 1315, + "-c": 1316, + " regard": 1317, + " fin": 1318, + "ual": 1319, + " means": 1320, + " stand": 1321, + "atch": 1322, + " short": 1323, + "ned": 1324, + " seen": 1325, + " happ": 1326, + "-k": 1327, + " against": 1328, + "him": 1329, + "amed": 1330, + " stood": 1331, + " gra": 1332, + " mother": 1333, + " fish": 1334, + " water": 1335, + "ail": 1336, + "cei": 1337, + " rather": 1338, + " ins": 1339, + " feel": 1340, + " also": 1341, + " ord": 1342, + " coming": 1343, + "ics": 1344, + " either": 1345, + "nce": 1346, + " '": 1347, + " kid": 1348, + " laughed": 1349, + "like": 1350, + " Ar": 1351, + "gr": 1352, + " Hotta": 1353, + " talk": 1354, + "gether": 1355, + " Sir": 1356, + " pun": 1357, + "Pro": 1358, + "ats": 1359, + "most": 1360, + " rep": 1361, + " gi": 1362, + "isf": 1363, + "bably": 1364, + "akes": 1365, + " Not": 1366, + "ny": 1367, + " appear": 1368, + "mp": 1369, + "cha": 1370, + " act": 1371, + "bed": 1372, + "ief": 1373, + "uff": 1374, + " apo": 1375, + " met": 1376, + " returned": 1377, + " sound": 1378, + "usiness": 1379, + " laugh": 1380, + " clear": 1381, + " need": 1382, + "fess": 1383, + "ested": 1384, + " inv": 1385, + " accept": 1386, + "under": 1387, + ";\n": 1388, + " surpr": 1389, + "de": 1390, + " train": 1391, + " hotel": 1392, + " sleep": 1393, + " dr": 1394, + " hold": 1395, + "lock": 1396, + "pura": 1397, + " springs": 1398, + " ......": 1399, + " agreement": 1400, + " Dar": 1401, + " rest": 1402, + "clud": 1403, + "ator": 1404, + "av": 1405, + " orig": 1406, + " origin": 1407, + " el": 1408, + " nor": 1409, + " pres": 1410, + " understand": 1411, + " taken": 1412, + " light": 1413, + "ener": 1414, + "some": 1415, + " brought": 1416, + "raph": 1417, + " most": 1418, + "oke": 1419, + "-w": 1420, + " unt": 1421, + " father": 1422, + " used": 1423, + " eat": 1424, + " years": 1425, + " While": 1426, + " chan": 1427, + " sudd": 1428, + " sudden": 1429, + " apolog": 1430, + " sett": 1431, + " thin": 1432, + " My": 1433, + " ten": 1434, + "imes": 1435, + "for": 1436, + "oud": 1437, + "When": 1438, + " det": 1439, + " live": 1440, + " oc": 1441, + " five": 1442, + " 
cont": 1443, + " help": 1444, + " wa": 1445, + " passed": 1446, + " run": 1447, + " making": 1448, + " strange": 1449, + " taking": 1450, + " each": 1451, + "\"You": 1452, + " another": 1453, + "\"Say": 1454, + "\"The": 1455, + "ates": 1456, + " pleas": 1457, + "asshoppers": 1458, + " mom": 1459, + " moment": 1460, + "entle": 1461, + "nglish": 1462, + "CHA": 1463, + " original": 1464, + "ions": 1465, + "uring": 1466, + " public": 1467, + "uct": 1468, + "uck": 1469, + " question": 1470, + "ai": 1471, + "cy": 1472, + "ek": 1473, + " floor": 1474, + " car": 1475, + "ouse": 1476, + " side": 1477, + "-ya": 1478, + " certain": 1479, + "hys": 1480, + "-d": 1481, + "igh": 1482, + "agin": 1483, + "weet": 1484, + " poor": 1485, + " decid": 1486, + "ually": 1487, + " business": 1488, + "pro": 1489, + "plain": 1490, + " stop": 1491, + "!\n": 1492, + " How": 1493, + "\"What": 1494, + "can": 1495, + " Un": 1496, + "ps": 1497, + "und": 1498, + "-night": 1499, + " meeting": 1500, + "edo": 1501, + " raise": 1502, + "Gutenberg": 1503, + " Darling": 1504, + "ume": 1505, + " English": 1506, + "TER": 1507, + "ading": 1508, + " transl": 1509, + " able": 1510, + "ssible": 1511, + " satisf": 1512, + " wanted": 1513, + " sub": 1514, + " case": 1515, + "ific": 1516, + "iterary": 1517, + " maid": 1518, + " inc": 1519, + " pos": 1520, + " position": 1521, + " pat": 1522, + "ured": 1523, + "orry": 1524, + " account": 1525, + " both": 1526, + " frie": 1527, + " friend": 1528, + "this": 1529, + " always": 1530, + " particul": 1531, + "What": 1532, + " small": 1533, + "enty": 1534, + "ushed": 1535, + " mis": 1536, + "ully": 1537, + " recei": 1538, + "You": 1539, + " yet": 1540, + " gave": 1541, + "But": 1542, + "had": 1543, + " answer": 1544, + " abs": 1545, + "ile": 1546, + "cket": 1547, + " nood": 1548, + " course": 1549, + " form": 1550, + " everything": 1551, + "ection": 1552, + "If": 1553, + "part": 1554, + " sing": 1555, + " sit": 1556, + " pur": 1557, + "ip": 1558, + " fishing": 1559, + " eh": 1560, + " par": 1561, + " together": 1562, + "He": 1563, + " whe": 1564, + " whether": 1565, + " bra": 1566, + "\"Yes": 1567, + " punish": 1568, + "Shirt": 1569, + " Yedo": 1570, + " farew": 1571, + " farewell": 1572, + " dance": 1573, + " less": 1574, + "ural": 1575, + " def": 1576, + " attempt": 1577, + "ween": 1578, + " sign": 1579, + " sy": 1580, + "ferent": 1581, + " least": 1582, + "ser": 1583, + "ob": 1584, + "nding": 1585, + " sorry": 1586, + " jumped": 1587, + " jan": 1588, + " janitor": 1589, + "ized": 1590, + " toward": 1591, + " mor": 1592, + "aving": 1593, + " bit": 1594, + "\"This": 1595, + " remark": 1596, + " fut": 1597, + " wonder": 1598, + " fun": 1599, + "Then": 1600, + " dec": 1601, + " whom": 1602, + " didn": 1603, + " rec": 1604, + "bec": 1605, + "\"If": 1606, + " knew": 1607, + "after": 1608, + " thus": 1609, + " isn": 1610, + " sight": 1611, + "med": 1612, + "[F": 1613, + "uss": 1614, + "cident": 1615, + "them": 1616, + " fif": 1617, + " draw": 1618, + " hear": 1619, + " writing": 1620, + " getting": 1621, + "sh": 1622, + "ference": 1623, + " raised": 1624, + "they": 1625, + "ax": 1626, + " fine": 1627, + "sel": 1628, + " Nobe": 1629, + " Nobeok": 1630, + " Nobeoka": 1631, + "ormal": 1632, + " eB": 1633, + "icense": 1634, + "00": 1635, + " best": 1636, + "wor": 1637, + "fic": 1638, + "terest": 1639, + " remar": 1640, + "bl": 1641, + "arted": 1642, + " dark": 1643, + " young": 1644, + "ush": 1645, + " bet": 1646, + "outh": 1647, + "house": 1648, + "aught": 1649, + " phys": 1650, + " strong": 1651, + " 
fur": 1652, + " roll": 1653, + "cove": 1654, + "chief": 1655, + "awa": 1656, + " followed": 1657, + " fond": 1658, + " future": 1659, + "ird": 1660, + "fully": 1661, + " effort": 1662, + "After": 1663, + "oward": 1664, + " really": 1665, + " among": 1666, + " around": 1667, + " compl": 1668, + " gaz": 1669, + " bow": 1670, + "ater": 1671, + " insist": 1672, + " turned": 1673, + "hel": 1674, + "rem": 1675, + " hours": 1676, + " decided": 1677, + "ys": 1678, + " month": 1679, + "-a": 1680, + " adv": 1681, + " believe": 1682, + " teaching": 1683, + " easy": 1684, + " direction": 1685, + "ooked": 1686, + " war": 1687, + " unless": 1688, + "have": 1689, + " square": 1690, + "vil": 1691, + " quiet": 1692, + " hung": 1693, + " goes": 1694, + " paid": 1695, + " shall": 1696, + "\"No": 1697, + " punishment": 1698, + "pose": 1699, + " sweet": 1700, + "'ve": 1701, + "\"Well": 1702, + " gentle": 1703, + " normal": 1704, + "agraph": 1705, + "chive": 1706, + "chan": 1707, + " includ": 1708, + "ww": 1709, + "org": 1710, + "tem": 1711, + "AR": 1712, + " TH": 1713, + " equ": 1714, + " tone": 1715, + " possible": 1716, + " becom": 1717, + " Japanese": 1718, + "vers": 1719, + " following": 1720, + " pain": 1721, + " whole": 1722, + "wr": 1723, + " serious": 1724, + " nar": 1725, + " tired": 1726, + "In": 1727, + " play": 1728, + " prom": 1729, + " game": 1730, + " Some": 1731, + " happened": 1732, + " cut": 1733, + " twenty": 1734, + " door": 1735, + " morning": 1736, + "hind": 1737, + " bre": 1738, + " inside": 1739, + "ove": 1740, + "alth": 1741, + "uk": 1742, + "arge": 1743, + "amb": 1744, + " dam": 1745, + " worry": 1746, + "ative": 1747, + " expected": 1748, + " fam": 1749, + " pra": 1750, + " pocket": 1751, + "ooks": 1752, + "ched": 1753, + " sil": 1754, + "ol": 1755, + " fav": 1756, + " else": 1757, + " high": 1758, + " real": 1759, + " along": 1760, + " med": 1761, + "hik": 1762, + "hemat": 1763, + "hematics": 1764, + " list": 1765, + " sick": 1766, + "oint": 1767, + "[Foot": 1768, + "[Footnot": 1769, + "[Footnote": 1770, + ".]\n": 1771, + "night": 1772, + "ses": 1773, + "ior": 1774, + " says": 1775, + " mouth": 1776, + "how": 1777, + "ming": 1778, + " clo": 1779, + " cur": 1780, + "ging": 1781, + " suddenly": 1782, + "-ah": 1783, + "amp": 1784, + " black": 1785, + "ross": 1786, + " fac": 1787, + "selves": 1788, + "iew": 1789, + "ission": 1790, + " copyright": 1791, + " paragraph": 1792, + " Archive": 1793, + " donations": 1794, + "Project": 1795, + " cost": 1796, + ".org": 1797, + "LI": 1798, + "uced": 1799, + " suc": 1800, + "yle": 1801, + " force": 1802, + "joy": 1803, + "ouch": 1804, + "tr": 1805, + "It": 1806, + " trad": 1807, + " present": 1808, + " ext": 1809, + "ased": 1810, + "redit": 1811, + " fault": 1812, + "ib": 1813, + "-m": 1814, + "urd": 1815, + " tried": 1816, + "time": 1817, + " pret": 1818, + " spee": 1819, + "ower": 1820, + " words": 1821, + "CHAP": 1822, + "CHAPTER": 1823, + "school": 1824, + " ask": 1825, + " doing": 1826, + "ately": 1827, + " until": 1828, + "bout": 1829, + " tree": 1830, + "call": 1831, + "amash": 1832, + "amashir": 1833, + "amashiro": 1834, + "ste": 1835, + " behind": 1836, + "old": 1837, + " wall": 1838, + "itory": 1839, + " rolled": 1840, + " move": 1841, + " apologize": 1842, + " large": 1843, + "amboo": 1844, + "su": 1845, + " settled": 1846, + "\"He": 1847, + "wo": 1848, + " thinking": 1849, + "used": 1850, + "ified": 1851, + " almost": 1852, + " tre": 1853, + " treat": 1854, + " noodle": 1855, + " note": 1856, + " All": 1857, + " beat": 1858, + " 
object": 1859, + " seems": 1860, + " ide": 1861, + "Yes": 1862, + "ows": 1863, + " remain": 1864, + " begin": 1865, + "ught": 1866, + "ments": 1867, + " alone": 1868, + "spect": 1869, + " mathematics": 1870, + " rough": 1871, + " outside": 1872, + " comes": 1873, + "back": 1874, + " wind": 1875, + "sed": 1876, + " wouldn": 1877, + "eer": 1878, + "inut": 1879, + "from": 1880, + " repl": 1881, + " narrow": 1882, + " incident": 1883, + " air": 1884, + " sea": 1885, + "ts": 1886, + " surprised": 1887, + " tea": 1888, + "Red": 1889, + " talking": 1890, + " boss": 1891, + "que": 1892, + " pict": 1893, + "irty": 1894, + " ce": 1895, + " lim": 1896, + " Why": 1897, + " point": 1898, + " law": 1899, + "ciated": 1900, + " moon": 1901, + "ircu": 1902, + "got": 1903, + " Is": 1904, + " hands": 1905, + " honor": 1906, + "aut": 1907, + "rge": 1908, + " state": 1909, + " Literary": 1910, + ".F": 1911, + "This": 1912, + "line": 1913, + ".g": 1914, + ".gutenberg": 1915, + " OF": 1916, + "EN": 1917, + "racter": 1918, + " bene": 1919, + " Even": 1920, + "oub": 1921, + " makes": 1922, + " interest": 1923, + "ope": 1924, + "ms": 1925, + " respons": 1926, + " fore": 1927, + " somewhat": 1928, + " honest": 1929, + "ock": 1930, + "irit": 1931, + " held": 1932, + " added": 1933, + "fu": 1934, + "aded": 1935, + "als": 1936, + "att": 1937, + "tern": 1938, + " personal": 1939, + " ass": 1940, + " With": 1941, + "tic": 1942, + "Tokyo": 1943, + " shout": 1944, + " pretty": 1945, + "umb": 1946, + " early": 1947, + "opped": 1948, + " further": 1949, + " fre": 1950, + "esides": 1951, + " bamboo": 1952, + " ir": 1953, + "more": 1954, + " living": 1955, + " received": 1956, + " lived": 1957, + " meant": 1958, + " coward": 1959, + "position": 1960, + " loc": 1961, + "iled": 1962, + " tender": 1963, + " ch": 1964, + " After": 1965, + "cer": 1966, + " favor": 1967, + "who": 1968, + " liked": 1969, + "rance": 1970, + " pri": 1971, + "kisha": 1972, + " study": 1973, + " order": 1974, + " afterward": 1975, + " greatly": 1976, + " unable": 1977, + "go": 1978, + " wait": 1979, + "eping": 1980, + "iding": 1981, + " forty": 1982, + " sky": 1983, + " office": 1984, + "will": 1985, + "\"D": 1986, + "wel": 1987, + " station": 1988, + "bo": 1989, + "hot": 1990, + "such": 1991, + " loud": 1992, + " aw": 1993, + "land": 1994, + "?\n": 1995, + " respect": 1996, + "ances": 1997 + }, + "merges": [ + ] + } +} diff --git a/tests/assets/tokenizer/tokenizer_config.json b/tests/assets/tokenizer/tokenizer_config.json new file mode 100644 index 000000000..da6379b3f --- /dev/null +++ b/tests/assets/tokenizer/tokenizer_config.json @@ -0,0 +1,29 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/tests/unit_tests/test_dataset_checkpointing.py b/tests/unit_tests/test_dataset_checkpointing.py index 00998fc49..36dcd8f86 100644 --- a/tests/unit_tests/test_dataset_checkpointing.py +++ b/tests/unit_tests/test_dataset_checkpointing.py @@ -8,9 +8,9 @@ import torch from datasets import load_dataset +from 
torchtitan.components.tokenizer import HuggingFaceTokenizer from torchtitan.config_manager import ConfigManager from torchtitan.datasets.hf_datasets import build_hf_dataloader, DatasetConfig, DATASETS -from torchtitan.datasets.tokenizer.tiktoken import TikTokenizer class TestDatasetCheckpointing(unittest.TestCase): @@ -58,7 +58,7 @@ def test_c4_resumption(self): assert torch.equal(labels, expected_labels) def _build_dataloader(self, dataset_name, batch_size, seq_len, world_size, rank): - tokenizer = TikTokenizer("./tests/assets/test_tiktoken.model") + tokenizer = HuggingFaceTokenizer("./tests/assets/tokenizer") config_manager = ConfigManager() config = config_manager.parse_args( [ diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py index 8efd48167..72fa28a46 100644 --- a/tests/unit_tests/test_tokenizer.py +++ b/tests/unit_tests/test_tokenizer.py @@ -19,7 +19,7 @@ parametrize, ) -from torchtitan.components.tokenizer import build_hf_tokenizer +from torchtitan.components.tokenizer import HuggingFaceTokenizer class TestTokenizerIntegration(unittest.TestCase): @@ -278,7 +278,7 @@ def test_download_and_build_tokenizer(self, test_repo_id): model_name = test_repo_id.split("/")[-1] tokenizer_dir = "tokenizer" if model_name == "FLUX.1-dev" else "." tokenizer_path = os.path.join(self.temp_dir, model_name, tokenizer_dir) - our_tokenizer = build_hf_tokenizer(tokenizer_path) + our_tokenizer = HuggingFaceTokenizer(tokenizer_path) # Step 3: Load tokenizer using official Tokenizer library (if available) official_tokenizer = None @@ -308,101 +308,6 @@ def test_download_and_build_tokenizer(self, test_repo_id): our_tokenizer, transformers_tokenizer, test_repo_id ) - def test_backward_comptability(self): - from torchtitan.datasets.tokenizer.tiktoken import TikTokenizer - - # The existing tokenizer lives under assets/original/tokenizer.model - # This test ensures that the new tokenizer can load the old tokenizer - # and produce the same results - - # Get the base project directory (two levels up from test file) - base_project_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - old_tokenizer_path = os.path.join( - base_project_dir, "assets", "tokenizer", "original", "tokenizer.model" - ) - - # Skip test if the old tokenizer path cannot be found - if not os.path.exists(old_tokenizer_path): - self.skipTest(f"Old tokenizer file not found at {old_tokenizer_path}") - - print(old_tokenizer_path) - old_tokenizer = TikTokenizer(old_tokenizer_path) - - # Download and load a new tokenizer for comparison (using Meta-Llama-3.1-8B) - test_repo_id = "meta-llama/Meta-Llama-3.1-8B" - try: - download_hf_tokenizer_files( - repo_id=test_repo_id, - local_dir=self.temp_dir, - ) - - # Load the new tokenizer - model_name = test_repo_id.split("/")[-1] - new_tokenizer_path = os.path.join(self.temp_dir, model_name) - new_tokenizer = build_hf_tokenizer(new_tokenizer_path) - - # Compare encoding and decoding functionality only (TikTokenizer doesn't support vocab operations) - test_texts = [ - "Hello world!", - "This is a test.", - "The quick brown fox jumps over the lazy dog.", - "Special characters: @#$%^&*()", - "Numbers: 123456789", - "Mixed: Hello123 World!@#", - "", # Empty string - " ", # Single space - " ", # Multiple spaces - ] - - for text in test_texts: - # Encode with both tokenizers - # TikTokenizer requires bos and eos parameters - old_tokens = old_tokenizer.encode(text, bos=True, eos=False) - # HuggingFaceTokenizer has optional add_bos and add_eos parameters - new_tokens = 
new_tokenizer.encode(text) - - self.assertEqual( - old_tokens, - new_tokens, - f"Encoded tokens should match for text '{text}' in backward compatibility test", - ) - - # Test decoding - old_decoded = old_tokenizer.decode(old_tokens) - new_decoded = new_tokenizer.decode( - new_tokens, skip_special_tokens=False - ) - - self.assertEqual( - old_decoded, - new_decoded, - f"Decoded text should match for '{text}' in backward compatibility test", - ) - - # Test edge cases - edge_cases = [ - "🚀🌟✨", # Emojis - "café naïve résumé", # Accented characters - "こんにちは世界", # Non-Latin scripts (Japanese) - "Здравствуй мир", # Cyrillic - "\n\t\r", # Whitespace characters - "a" - * 100, # Long repeated character (reduced from 1000 to avoid tiktoken limits) - ] - - for text in edge_cases: - old_tokens = old_tokenizer.encode(text, bos=True, eos=False) - new_tokens = new_tokenizer.encode(text) - - self.assertEqual( - old_tokens, - new_tokens, - f"Edge case tokens should match for text '{text[:50]}...' in backward compatibility test", - ) - - except HTTPError as e: - self.skipTest(f"Could not download new tokenizer for comparison: {e}") - instantiate_parametrized_tests(TestTokenizerIntegration) diff --git a/tests/unit_tests/test_train_spec.py b/tests/unit_tests/test_train_spec.py index 15780d10a..c364af385 100644 --- a/tests/unit_tests/test_train_spec.py +++ b/tests/unit_tests/test_train_spec.py @@ -12,9 +12,9 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers, OptimizersContainer +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.config_manager import JobConfig from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.models.llama3 import parallelize_llama, pipeline_llama from torchtitan.protocols.train_spec import ( apply_to_train_specs, @@ -67,7 +67,7 @@ def test_register_train_spec(self): build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) register_train_spec(spec) @@ -88,7 +88,7 @@ def test_optim_hook(self): build_optimizers_fn=fake_build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) register_train_spec(spec) diff --git a/torchtitan/components/tokenizer.py b/torchtitan/components/tokenizer.py index def7594ae..45ecf34f9 100644 --- a/torchtitan/components/tokenizer.py +++ b/torchtitan/components/tokenizer.py @@ -6,18 +6,22 @@ import json + +import logging import os from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, Union -from tokenizers import AddedToken, Tokenizer as HfTokenizer +from tokenizers import AddedToken, Tokenizer +from torchtitan.config_manager import JobConfig from typing_extensions import override +logger = logging.getLogger(__name__) + -class Tokenizer(ABC): - # basic tokenizer interface, for typing purpose mainly +class BaseTokenizer(ABC): + # base tokenizer interface, for typing purpose mainly def __init__(self): - self._n_words = 8 self.eos_id = 0 @abstractmethod @@ -28,12 +32,12 @@ def 
encode(self, *args, **kwargs) -> list[int]: def decode(self, *args, **kwargs) -> str: ... - @property - def n_words(self) -> int: - return self._n_words + @abstractmethod + def get_vocab_size(self) -> int: + ... -class HuggingFaceTokenizer(Tokenizer): +class HuggingFaceTokenizer(BaseTokenizer): """ A tokenizer wrapper that handles BOS/EOS token inference and encoding. @@ -49,6 +53,7 @@ def __init__( self, tokenizer_path: str, ): + super().__init__() self.tokenizer_path = tokenizer_path # Initialize BOS/EOS token attributes (frequently used) @@ -76,7 +81,7 @@ def _load_config(self, config_path: str) -> Optional[dict]: return json.load(f) return None - def _load_tokenizer_from_path(self, tokenizer_path: str) -> HfTokenizer: + def _load_tokenizer_from_path(self, tokenizer_path: str) -> Tokenizer: """Load tokenizer from various file formats.""" if not os.path.exists(tokenizer_path): raise FileNotFoundError(f"Tokenizer path '{tokenizer_path}' does not exist") @@ -87,87 +92,79 @@ def _load_tokenizer_from_path(self, tokenizer_path: str) -> HfTokenizer: vocab_json_path = os.path.join(tokenizer_path, "vocab.json") merges_txt_path = os.path.join(tokenizer_path, "merges.txt") - try: - # Strategy 1: Load from tokenizer.json (preferred for modern tokenizers) - if os.path.exists(tokenizer_json_path): - print("Loading tokenizer from tokenizer.json") - return HfTokenizer.from_file(tokenizer_json_path) - # Strategy 2: Load from vocab files (with or without merges.txt) - elif os.path.exists(vocab_json_path) or os.path.exists(vocab_txt_path): - # Load vocabulary - if os.path.exists(vocab_json_path): - print("Loading vocabulary from vocab.json") - with open(vocab_json_path, "r") as f: - vocab = json.load(f) - vocab_source = "vocab.json" - else: - print("Loading vocabulary from vocab.txt") - vocab = {} - with open(vocab_txt_path, "r") as f: - for i, line in enumerate(f): - token = line.strip() - if token: - vocab[token] = i - vocab_source = "vocab.txt" - - # Strategy 2a: Use BPE if merges.txt exists - if os.path.exists(merges_txt_path): - print(f"Loading BPE tokenizer from {vocab_source} + merges.txt") - from tokenizers import decoders, pre_tokenizers, processors - from tokenizers.models import BPE - - # Load merges from file and convert to tuples - merges = [] - with open(merges_txt_path, "r") as f: - for line in f: - line = line.strip() - if line and not line.startswith( - "#" - ): # Skip comments and empty lines - parts = line.split() - if len(parts) >= 2: - merges.append((parts[0], parts[1])) - - # Create BPE model - bpe_model = BPE(vocab=vocab, merges=merges) - tokenizer = HfTokenizer(bpe_model) - - # Configure GPT-2 style components for proper space handling - tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( - add_prefix_space=False - ) - tokenizer.decoder = decoders.ByteLevel() - tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) - - return tokenizer - - # Strategy 2b: Use WordLevel if no merges.txt - else: - print(f"Loading WordLevel tokenizer from {vocab_source}") - from tokenizers.models import WordLevel - - word_level_model = WordLevel(vocab=vocab, unk_token="[UNK]") - return HfTokenizer(word_level_model) - + # Strategy 1: Load from tokenizer.json (preferred for modern tokenizers) + if os.path.exists(tokenizer_json_path): + logger.info("Loading tokenizer from tokenizer.json") + return Tokenizer.from_file(tokenizer_json_path) + # Strategy 2: Load from vocab files (with or without merges.txt) + elif os.path.exists(vocab_json_path) or os.path.exists(vocab_txt_path): + # Load 
vocabulary + if os.path.exists(vocab_json_path): + logger.info("Loading vocabulary from vocab.json") + with open(vocab_json_path, "r") as f: + vocab = json.load(f) + vocab_source = "vocab.json" else: - # List available files for debugging - available_files = [ - f - for f in os.listdir(tokenizer_path) - if os.path.isfile(os.path.join(tokenizer_path, f)) - ] - raise FileNotFoundError( - f"No supported tokenizer files found in '{tokenizer_path}'. " - f"Available files: {available_files}. " - "Looking for: tokenizer.json, tokenizer.model, vocab.txt+merges.txt, or vocab.json+merges.txt" + logger.info("Loading vocabulary from vocab.txt") + vocab = {} + with open(vocab_txt_path, "r") as f: + for i, line in enumerate(f): + token = line.strip() + if token: + vocab[token] = i + vocab_source = "vocab.txt" + + # Strategy 2a: Use BPE if merges.txt exists + if os.path.exists(merges_txt_path): + logger.info(f"Loading BPE tokenizer from {vocab_source} + merges.txt") + from tokenizers import decoders, pre_tokenizers, processors + from tokenizers.models import BPE + + # Load merges from file and convert to tuples + merges = [] + with open(merges_txt_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith( + "#" + ): # Skip comments and empty lines + parts = line.split() + if len(parts) >= 2: + merges.append((parts[0], parts[1])) + + # Create BPE model + bpe_model = BPE(vocab=vocab, merges=merges) + tokenizer = Tokenizer(bpe_model) + + # Configure GPT-2 style components for proper space handling + tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( + add_prefix_space=False ) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) - except Exception as e: - if isinstance(e, FileNotFoundError): - raise e - raise Exception( - f"Failed to load tokenizer from '{tokenizer_path}': {e}" - ) from e + return tokenizer + + # Strategy 2b: Use WordLevel if no merges.txt + else: + logger.info(f"Loading WordLevel tokenizer from {vocab_source}") + from tokenizers.models import WordLevel + + word_level_model = WordLevel(vocab=vocab, unk_token="[UNK]") + return Tokenizer(word_level_model) + + else: + # List available files for debugging + available_files = [ + f + for f in os.listdir(tokenizer_path) + if os.path.isfile(os.path.join(tokenizer_path, f)) + ] + raise FileNotFoundError( + f"No supported tokenizer files found in '{tokenizer_path}'. " + f"Available files: {available_files}. " + "Looking for: tokenizer.json, tokenizer.model, vocab.txt+merges.txt, or vocab.json+merges.txt" + ) def _get_token_from_config(self, config: dict[str, Any], key: str) -> Optional[str]: """ @@ -387,11 +384,11 @@ def decode(self, *args, **kwargs) -> str: @property def vocab_size(self) -> int: """Get the vocabulary size.""" - return len(self.tokenizer.get_vocab()) + return self.tokenizer.get_vocab_size() def get_vocab_size(self) -> int: """Get the vocabulary size.""" - return len(self.tokenizer.get_vocab()) + return self.tokenizer.get_vocab_size() def get_vocab(self) -> dict[str, int]: """Get the vocabulary as a dictionary.""" @@ -406,7 +403,9 @@ def id_to_token(self, token_id: int) -> Optional[str]: return self.tokenizer.id_to_token(token_id) -def build_hf_tokenizer(tokenizer_path: str) -> HuggingFaceTokenizer: +def build_hf_tokenizer( + job_config: JobConfig, +) -> Union[HuggingFaceTokenizer, BaseTokenizer]: """ Builds a HuggingFaceTokenizer from the specified path. 
@@ -415,11 +414,10 @@ def build_hf_tokenizer(tokenizer_path: str) -> HuggingFaceTokenizer: from various file formats and infers special token behavior. Args: - tokenizer_path (str): Path to the directory containing tokenizer files. - Should contain one or more of the supported file types. + job_config (JobConfig): A JobConfig object containing the path to the tokenizer directory. Returns: tokenizer (HuggingFaceTokenizer): Loaded tokenizer instance with intelligent BOS/EOS handling """ - tokenizer = HuggingFaceTokenizer(tokenizer_path) + tokenizer = HuggingFaceTokenizer(job_config.model.tokenizer_path) return tokenizer diff --git a/torchtitan/components/validate.py b/torchtitan/components/validate.py index 77d89c454..904c65ca5 100644 --- a/torchtitan/components/validate.py +++ b/torchtitan/components/validate.py @@ -11,7 +11,7 @@ from torch.distributed.fsdp import FSDPModule from torchtitan.components.dataloader import BaseDataLoader from torchtitan.components.loss import LossFunction -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.datasets.hf_datasets import build_hf_validation_dataloader from torchtitan.distributed import ParallelDims, utils as dist_utils @@ -48,7 +48,7 @@ def __init__( self, job_config: JobConfig, dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, parallel_dims: ParallelDims, world_mesh: torch.distributed.DeviceMesh, loss_fn: LossFunction, @@ -142,7 +142,7 @@ def build_validator( job_config: JobConfig, dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, parallel_dims: ParallelDims, world_mesh: torch.distributed.DeviceMesh, loss_fn: LossFunction, diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py index d5e60bf23..8a2ebe434 100644 --- a/torchtitan/config_manager.py +++ b/torchtitan/config_manager.py @@ -884,7 +884,15 @@ def _validate_config(self) -> None: self.config.model.tokenizer_path = old_tokenizer_path logger.warning( f"Temporarily switching to previous default tokenizer path {old_tokenizer_path}. " - "Please update your config." + "Please download the new tokenizer model (python scripts/download_tokenizer.py) and update your config."
+ ) + else: + # If the config still points to the old tokenizer.model, alert users to redownload the tokenizer + if self.config.model.tokenizer_path.endswith("tokenizer.model"): + raise Exception( + "You are using the old tokenizer.model. Please redownload the tokenizer " + "(python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B) " + "and update your config to the directory of the downloaded tokenizer." ) @staticmethod diff --git a/torchtitan/datasets/hf_datasets.py b/torchtitan/datasets/hf_datasets.py index 9f692d81d..dbef80a6e 100644 --- a/torchtitan/datasets/hf_datasets.py +++ b/torchtitan/datasets/hf_datasets.py @@ -17,7 +17,7 @@ from torch.utils.data import IterableDataset from torchtitan.components.dataloader import ParallelAwareDataloader -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.tools.logging import logger @@ -80,7 +80,7 @@ def __init__( self, dataset_name: str, dataset_path: str | None, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, seq_len: int = 2048, dp_rank: int = 0, dp_world_size: int = 1, @@ -123,7 +123,9 @@ def __iter__(self): for sample in self._get_data_iter(): # Use the dataset-specific text processor sample_text = self._text_processor(sample) - sample_tokens = self._tokenizer.encode(sample_text, bos=True, eos=True) + sample_tokens = self._tokenizer.encode( + sample_text, add_bos=True, add_eos=True + ) self._token_buffer.extend(sample_tokens) self._sample_idx += 1 @@ -174,7 +176,7 @@ def state_dict(self): def build_hf_dataloader( dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, job_config: JobConfig, infinite: bool = True, ) -> ParallelAwareDataloader: @@ -205,7 +207,7 @@ def build_hf_validation_dataloader( dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, job_config: JobConfig, ) -> ParallelAwareDataloader: """Build a validation data loader for HuggingFace datasets.""" diff --git a/torchtitan/datasets/tokenizer/tiktoken.py b/torchtitan/datasets/tokenizer/tiktoken.py deleted file mode 100644 index 401757a93..000000000 --- a/torchtitan/datasets/tokenizer/tiktoken.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. - -import os -from collections.abc import Collection, Iterator, Sequence, Set as AbstractSet -from pathlib import Path -from typing import cast, Literal - -import tiktoken -from tiktoken.load import load_tiktoken_bpe - -from torchtitan.components.tokenizer import Tokenizer -from torchtitan.config_manager import JobConfig -from torchtitan.tools.logging import logger - - -class TikTokenizer(Tokenizer): - """ - Tokenizing and encoding/decoding text using the Tiktoken tokenizer. - - Args: - model_path (str): The path to the Tiktoken model file.
- """ - - special_tokens: dict[str, int] - - num_reserved_special_tokens = 256 - - pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501, B950 - - def __init__(self, model_path: str): - super().__init__() - assert os.path.exists( - model_path - ), f"The tokenizer path does not exist: {model_path}" - assert os.path.isfile(model_path), model_path - - mergeable_ranks = load_tiktoken_bpe(model_path) - num_base_tokens = len(mergeable_ranks) - special_tokens = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] + [ - f"<|reserved_special_token_{i}|>" - for i in range(5, self.num_reserved_special_tokens - 5) - ] - self.special_tokens = { - token: num_base_tokens + i for i, token in enumerate(special_tokens) - } - self.model = tiktoken.Encoding( - name=Path(model_path).name, - pat_str=self.pat_str, - mergeable_ranks=mergeable_ranks, - special_tokens=self.special_tokens, - ) - - self._n_words: int = self.model.n_vocab - # BOS / EOS token IDs - self.bos_id: int = self.special_tokens["<|begin_of_text|>"] - self.eos_id: int = self.special_tokens["<|end_of_text|>"] - self.pad_id: int = -1 - self.stop_tokens = { - self.special_tokens["<|end_of_text|>"], - self.special_tokens["<|eot_id|>"], - } - logger.info( - f"TikTokenizer built: #words {self.n_words}, BOS ID {self.bos_id}, EOS ID {self.eos_id}" - ) - - def encode( - self, - s: str, - *, - bos: bool, - eos: bool, - allowed_special: Literal["all"] | AbstractSet[str] | None = None, - disallowed_special: Literal["all"] | Collection[str] | None = None, - ) -> list[int]: - """ - Encodes a string into a list of token IDs. - - Args: - s (str): The input string to be encoded. - bos (bool): Whether to prepend the beginning-of-sequence token. - eos (bool): Whether to append the end-of-sequence token. - allowed_tokens ("all"|set[str]): allowed special tokens in string - disallowed_tokens ("all"|set[str]): special tokens that raise an error when in string - - Returns: - list[int]: A list of token IDs. - - By default, setting disallowed_special=() encodes a string by ignoring - special tokens. Specifically: - - Setting `disallowed_special` to () will cause all text corresponding - to special tokens to be encoded as natural text (insteading of raising - an error). - - Setting `allowed_special` to "all" will treat all text corresponding - to special tokens to be encoded as special tokens. - """ - assert type(s) is str - allowed_special = allowed_special or set() - disallowed_special = disallowed_special or () - - # The tiktoken tokenizer can handle <=400k chars without - # pyo3_runtime.PanicException. - TIKTOKEN_MAX_ENCODE_CHARS = 400_000 - - # https://github.com/openai/tiktoken/issues/195 - # Here we iterate over subsequences and split if we exceed the limit - # of max consecutive non-whitespace or whitespace characters. 
- MAX_NO_WHITESPACES_CHARS = 25_000 - - substrs = ( - substr - for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) - for substr in self._split_whitespaces_or_nonwhitespaces( - s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS - ) - ) - t: list[int] = [] - for substr in substrs: - t.extend( - self.model.encode( - substr, - allowed_special=allowed_special, - disallowed_special=disallowed_special, - ) - ) - if bos: - t.insert(0, self.bos_id) - if eos: - t.append(self.eos_id) - return t - - def decode(self, t: Sequence[int]) -> str: - """ - Decodes a list of token IDs into a string. - - Args: - t (List[int]): The list of token IDs to be decoded. - - Returns: - str: The decoded string. - """ - # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence. - return self.model.decode(cast(list[int], t)) - - @staticmethod - def _split_whitespaces_or_nonwhitespaces( - s: str, max_consecutive_slice_len: int - ) -> Iterator[str]: - """ - Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len` - consecutive whitespaces or consecutive non-whitespaces. - """ - current_slice_len = 0 - current_slice_is_space = s[0].isspace() if len(s) > 0 else False - slice_start = 0 - - for i in range(len(s)): - is_now_space = s[i].isspace() - - if current_slice_is_space ^ is_now_space: - current_slice_len = 1 - current_slice_is_space = is_now_space - else: - current_slice_len += 1 - if current_slice_len > max_consecutive_slice_len: - yield s[slice_start:i] - slice_start = i - current_slice_len = 1 - yield s[slice_start:] - - -def build_tiktoken_tokenizer(job_config: JobConfig) -> TikTokenizer: - return TikTokenizer(job_config.model.tokenizer_path) diff --git a/torchtitan/experiments/deepseek_v3/model_args.py b/torchtitan/experiments/deepseek_v3/model_args.py index 21e2dbd95..b7fd7f1a7 100644 --- a/torchtitan/experiments/deepseek_v3/model_args.py +++ b/torchtitan/experiments/deepseek_v3/model_args.py @@ -9,7 +9,7 @@ from torch import nn -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs @@ -59,8 +59,10 @@ class TransformerModelArgs(BaseModelArgs): use_grouped_mm: bool = True # grouped mm or for-loop for the experts computation load_balance_coeff: float | None = 1e-3 - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: - self.vocab_size = tokenizer.n_words + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: + self.vocab_size = tokenizer.get_vocab_size() self.max_seq_len = job_config.training.seq_len self.eos_id = tokenizer.eos_id diff --git a/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml b/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml index 1402e57e1..6b8390178 100644 --- a/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml +++ b/torchtitan/experiments/deepseek_v3/train_configs/deepseek_v2.toml @@ -22,7 +22,7 @@ enable_wandb = false name = "deepseek_v2" flavor = "deepseek-ai/DeepSeek-V2-Lite" # test tokenizer.model, for debug purpose only -tokenizer_path = "./tests/assets/test_tiktoken.model" +tokenizer_path = "./tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/flux/dataset/flux_dataset.py b/torchtitan/experiments/flux/dataset/flux_dataset.py index fcdac0b9d..83aa7ae06 100644 --- 
a/torchtitan/experiments/flux/dataset/flux_dataset.py +++ b/torchtitan/experiments/flux/dataset/flux_dataset.py @@ -20,7 +20,7 @@ from torch.utils.data import IterableDataset from torchtitan.components.dataloader import ParallelAwareDataloader -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.experiments.flux.dataset.tokenizer import ( build_flux_tokenizer, @@ -161,8 +161,8 @@ def __init__( self, dataset_name: str, dataset_path: Optional[str], - t5_tokenizer: Tokenizer, - clip_tokenizer: Tokenizer, + t5_tokenizer: BaseTokenizer, + clip_tokenizer: BaseTokenizer, job_config: Optional[JobConfig] = None, dp_rank: int = 0, dp_world_size: int = 1, diff --git a/torchtitan/experiments/flux/dataset/tokenizer.py b/torchtitan/experiments/flux/dataset/tokenizer.py index 3903c8a17..3d69b0ac5 100644 --- a/torchtitan/experiments/flux/dataset/tokenizer.py +++ b/torchtitan/experiments/flux/dataset/tokenizer.py @@ -11,20 +11,19 @@ from typing import List import torch -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer, HuggingFaceTokenizer from torchtitan.config_manager import JobConfig -from torchtitan.datasets.tokenizer.tiktoken import TikTokenizer from transformers import CLIPTokenizer, T5Tokenizer -class FluxTestTokenizer(Tokenizer): +class FluxTestTokenizer(BaseTokenizer): """ Flux Tokenizer for test purpose. This is a simple wrapper around the TikTokenizer, to make it has same interface as the T5 and CLIP tokenizer used for Flux. """ def __init__(self, model_path: str = "t5-small", max_length: int = 77, **hf_kwargs): - self.tiktokenizer = TikTokenizer(model_path, **hf_kwargs) + self.tiktokenizer = HuggingFaceTokenizer(model_path, **hf_kwargs) self._max_length = max_length self.pad_id = 0 @@ -43,11 +42,14 @@ def _pad_and_chunk_tokens( return tokens + def get_vocab_size(self) -> int: + return self.tiktokenizer.vocab_size + def encode(self, text: str) -> torch.Tensor: """ Use TikTokenizer to encode the text into tokens, and then pad and chunk the tokens to max_length. """ - tokens = self.tiktokenizer.encode(text, bos=True, eos=True) + tokens = self.tiktokenizer.encode(text, add_bos=True, add_eos=True) tokens = self._pad_and_chunk_tokens(tokens, self._max_length, self.pad_id) return torch.tensor(tokens) @@ -58,7 +60,7 @@ def decode(self, t: List[int]) -> str: return self.tiktokenizer.decode(t) -class FluxTokenizer(Tokenizer): +class FluxTokenizer(BaseTokenizer): """ Tokenizing and encoding/decoding text using the T5 or Clip tokenizer. @@ -83,6 +85,9 @@ def __init__(self, model_path: str = "t5-small", max_length: int = 77, **hf_kwar model_path, max_length=max_length, **hf_kwargs ) + def get_vocab_size(self) -> int: + return self._tokenizer.vocab_size + def encode( self, s: str, @@ -108,7 +113,7 @@ def decode(self, t: List[int]) -> str: return self._tokenizer.decode(t) -def build_flux_tokenizer(job_config: JobConfig) -> tuple[Tokenizer, Tokenizer]: +def build_flux_tokenizer(job_config: JobConfig) -> tuple[BaseTokenizer, BaseTokenizer]: """ Build the tokenizer for Flux. 
""" diff --git a/torchtitan/experiments/flux/sampling.py b/torchtitan/experiments/flux/sampling.py index 382832a0c..f9f1b9086 100644 --- a/torchtitan/experiments/flux/sampling.py +++ b/torchtitan/experiments/flux/sampling.py @@ -14,7 +14,7 @@ from torch import Tensor -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.tools.logging import logger @@ -78,8 +78,8 @@ def generate_image( model: FluxModel, prompt: str, autoencoder: AutoEncoder, - t5_tokenizer: Tokenizer, - clip_tokenizer: Tokenizer, + t5_tokenizer: BaseTokenizer, + clip_tokenizer: BaseTokenizer, t5_encoder: FluxEmbedder, clip_encoder: FluxEmbedder, ) -> torch.Tensor: diff --git a/torchtitan/experiments/flux/tests/integration_tests.py b/torchtitan/experiments/flux/tests/integration_tests.py index 4bb588a0a..9ba7ee378 100755 --- a/torchtitan/experiments/flux/tests/integration_tests.py +++ b/torchtitan/experiments/flux/tests/integration_tests.py @@ -106,7 +106,7 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str): t5_encoder_version_arg = ( "--encoder.t5_encoder torchtitan/experiments/flux/tests/assets/t5-v1_1-xxl/" ) - tokenzier_path_arg = "--model.tokenizer_path tests/assets/test_tiktoken.model" + tokenzier_path_arg = "--model.tokenizer_path tests/assets/tokenizer" all_ranks = ",".join(map(str, range(test_flavor.ngpu))) diff --git a/torchtitan/experiments/llama4/README.md b/torchtitan/experiments/llama4/README.md index 4b42f7c3f..23b75b859 100644 --- a/torchtitan/experiments/llama4/README.md +++ b/torchtitan/experiments/llama4/README.md @@ -12,7 +12,7 @@ https://github.com/pytorch/torchtitan/issues/1118 #### Download Llama 4 tokenizer ```bash # Llama 4 tokenizer.model -python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E --tokenizer_path "" --hf_token=... +python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E --hf_token=... 
``` #### To be added diff --git a/torchtitan/experiments/llama4/__init__.py b/torchtitan/experiments/llama4/__init__.py index 329c4e9d7..9f7affc09 100644 --- a/torchtitan/experiments/llama4/__init__.py +++ b/torchtitan/experiments/llama4/__init__.py @@ -6,8 +6,8 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.models.llama3 import pipeline_llama from torchtitan.protocols.train_spec import register_train_spec, TrainSpec @@ -101,7 +101,7 @@ build_optimizers_fn=build_llama4_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) ) diff --git a/torchtitan/experiments/llama4/model/args.py b/torchtitan/experiments/llama4/model/args.py index 96168fcaf..a7f99e732 100644 --- a/torchtitan/experiments/llama4/model/args.py +++ b/torchtitan/experiments/llama4/model/args.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from torch import nn -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs @@ -58,8 +58,10 @@ class TransformerModelArgs(BaseModelArgs): use_grouped_mm: bool = True # grouped mm or for-loop for the experts computation load_balance_coeff: float | None = 1e-3 - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: - self.vocab_size = tokenizer.n_words + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: + self.vocab_size = tokenizer.get_vocab_size() self.max_seq_len = job_config.training.seq_len self.eos_id = tokenizer.eos_id diff --git a/torchtitan/experiments/llama4/train_configs/debug_model.toml b/torchtitan/experiments/llama4/train_configs/debug_model.toml index 7fbe95e19..d72406d8c 100644 --- a/torchtitan/experiments/llama4/train_configs/debug_model.toml +++ b/torchtitan/experiments/llama4/train_configs/debug_model.toml @@ -22,7 +22,7 @@ enable_wandb = false name = "llama4" flavor = "debugmodel" # test tokenizer.model, for debug purpose only -tokenizer_path = "./tests/assets/test_tiktoken.model" +tokenizer_path = "./tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml b/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml index 4b9fc4d4f..707fea92e 100644 --- a/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml +++ b/torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml @@ -17,7 +17,7 @@ save_tb_folder = "tb" [model] name = "llama4" flavor = "17bx128e" -tokenizer_path = "./assets/tokenizer/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-4-Scout-17B-16E" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml b/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml index 0f9402456..b4b14358c 100644 --- a/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml +++ b/torchtitan/experiments/llama4/train_configs/llama4_17bx16e.toml @@ -17,7 +17,7 @@ save_tb_folder = "tb" 
[model] name = "llama4" flavor = "17bx16e" -tokenizer_path = "./assets/tokenizer/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-4-Scout-17B-16E" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/multimodal/__init__.py b/torchtitan/experiments/multimodal/__init__.py index fe08681bb..f3ba2a2d4 100644 --- a/torchtitan/experiments/multimodal/__init__.py +++ b/torchtitan/experiments/multimodal/__init__.py @@ -9,7 +9,7 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.models.llama3 import parallelize_llama, pipeline_llama from torchtitan.protocols.train_spec import register_train_spec, TrainSpec @@ -31,7 +31,7 @@ build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_mm_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) ) diff --git a/torchtitan/experiments/multimodal/mm_dataset.py b/torchtitan/experiments/multimodal/mm_dataset.py index 519272c74..5daf1d0ea 100644 --- a/torchtitan/experiments/multimodal/mm_dataset.py +++ b/torchtitan/experiments/multimodal/mm_dataset.py @@ -13,7 +13,7 @@ from datasets.distributed import split_dataset_by_node from mm_collator import MultiModalCollator -from tokenizer.tiktoken import IGNORE_INDEX, Tokenizer +from tokenizer.tiktoken import BaseTokenizer, IGNORE_INDEX from torch.distributed.checkpoint.stateful import Stateful from torch.utils.data import IterableDataset from transform import CLIPTransform @@ -110,7 +110,7 @@ def __init__( self, dataset_name: str, dataset_path: Optional[str], - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, image_token: str = "<|image|>", tile_size: int = 448, max_num_tiles: int = 4, @@ -178,8 +178,8 @@ def __iter__(self): # Tokenize tokens = self._tokenizer.encode( sample["text"], - bos=True, - eos=True, + add_bos=True, + add_eos=True, allowed_special=set(["<|image|>"]), ) sample["input_ids"] = torch.LongTensor(tokens[:-1]) @@ -233,7 +233,7 @@ def state_dict(self): def build_mm_dataloader( dp_world_size: int, dp_rank: int, - tokenizer: Tokenizer, + tokenizer: BaseTokenizer, job_config: JobConfig, infinite: bool = True, ) -> ParallelAwareDataloader: diff --git a/torchtitan/experiments/multimodal/tokenizer/tiktoken.py b/torchtitan/experiments/multimodal/tokenizer/tiktoken.py index 9d494a06f..b6de11e52 100644 --- a/torchtitan/experiments/multimodal/tokenizer/tiktoken.py +++ b/torchtitan/experiments/multimodal/tokenizer/tiktoken.py @@ -7,6 +7,9 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. 
+# TODO: Refactor this file since we have updated the tokenizer to +# depend on Hugging Face Tokenizer (https://github.com/pytorch/torchtitan/pull/1333) + import os from pathlib import Path from typing import ( @@ -28,7 +31,7 @@ import torch from tiktoken.load import load_tiktoken_bpe -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.tools.logging import logger @@ -36,7 +39,7 @@ IGNORE_INDEX = -100 -class TikTokenizer(Tokenizer): +class TikTokenizer(BaseTokenizer): """ Tokenizing and encoding/decoding text using the Tiktoken tokenizer. diff --git a/torchtitan/experiments/simple_fsdp/__init__.py b/torchtitan/experiments/simple_fsdp/__init__.py index 9ed592326..80a2b3c3a 100644 --- a/torchtitan/experiments/simple_fsdp/__init__.py +++ b/torchtitan/experiments/simple_fsdp/__init__.py @@ -9,8 +9,8 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.models.llama3 import llama3_configs, pipeline_llama from torchtitan.protocols.train_spec import register_train_spec, TrainSpec @@ -27,7 +27,7 @@ build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, ) ) diff --git a/torchtitan/models/llama3/__init__.py b/torchtitan/models/llama3/__init__.py index eec35cbf1..2e9a11d47 100644 --- a/torchtitan/models/llama3/__init__.py +++ b/torchtitan/models/llama3/__init__.py @@ -9,9 +9,9 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers +from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.components.validate import build_validator from torchtitan.datasets.hf_datasets import build_hf_dataloader -from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize import parallelize_llama @@ -80,7 +80,7 @@ build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_tiktoken_tokenizer, + build_tokenizer_fn=build_hf_tokenizer, build_loss_fn=build_cross_entropy_loss, build_validator_fn=build_validator, ) diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index 20e3bcbcd..38f7e3321 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -11,7 +11,7 @@ from torch import nn -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.config_manager import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs @@ -37,8 +37,10 @@ class TransformerModelArgs(BaseModelArgs): attn_mask_type: str = "causal" eos_id: int = 0 - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: - self.vocab_size = 
tokenizer.n_words + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: + self.vocab_size = tokenizer.get_vocab_size() self.max_seq_len = job_config.training.seq_len self.eos_id = tokenizer.eos_id diff --git a/torchtitan/models/llama3/train_configs/debug_model.toml b/torchtitan/models/llama3/train_configs/debug_model.toml index 3710c689f..b9d26c7d9 100644 --- a/torchtitan/models/llama3/train_configs/debug_model.toml +++ b/torchtitan/models/llama3/train_configs/debug_model.toml @@ -23,8 +23,8 @@ enable_wandb = false [model] name = "llama3" flavor = "debugmodel" -# test tokenizer.model, for debug purpose only -tokenizer_path = "./tests/assets/test_tiktoken.model" +# test folder with tokenizer.json, for debug purpose only +tokenizer_path = "./tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/models/llama3/train_configs/llama3_405b.toml b/torchtitan/models/llama3/train_configs/llama3_405b.toml index 61203e985..8b12113c5 100644 --- a/torchtitan/models/llama3/train_configs/llama3_405b.toml +++ b/torchtitan/models/llama3/train_configs/llama3_405b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama3" flavor = "405B" -tokenizer_path = "./assets/tokenizer/original/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-3.1-8B" converters = ["float8"] [optimizer] diff --git a/torchtitan/models/llama3/train_configs/llama3_70b.toml b/torchtitan/models/llama3/train_configs/llama3_70b.toml index 55386f929..e65d7a1ad 100644 --- a/torchtitan/models/llama3/train_configs/llama3_70b.toml +++ b/torchtitan/models/llama3/train_configs/llama3_70b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama3" flavor = "70B" -tokenizer_path = "./assets/tokenizer/original/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-3.1-8B" # converters = ["float8"] [optimizer] diff --git a/torchtitan/models/llama3/train_configs/llama3_8b.toml b/torchtitan/models/llama3/train_configs/llama3_8b.toml index 63b4ce6da..553017779 100644 --- a/torchtitan/models/llama3/train_configs/llama3_8b.toml +++ b/torchtitan/models/llama3/train_configs/llama3_8b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama3" flavor = "8B" -tokenizer_path = "./assets/tokenizer/original/tokenizer.model" +tokenizer_path = "./assets/tokenizer/Llama-3.1-8B" # converters = ["float8"] [optimizer] diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 2cabd698a..e7caa89f0 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -22,7 +22,7 @@ from torchtitan.components.lr_scheduler import LRSchedulersContainer from torchtitan.components.metrics import MetricsProcessor from torchtitan.components.optimizer import OptimizersContainer -from torchtitan.components.tokenizer import Tokenizer +from torchtitan.components.tokenizer import BaseTokenizer from torchtitan.components.validate import BaseValidator from torchtitan.config_manager import JobConfig from torchtitan.distributed import ParallelDims @@ -41,7 +41,9 @@ class BaseModelArgs: _enforced: str = "This field is used to enforce all fields have defaults." 
@abstractmethod - def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None: + def update_from_config( + self, job_config: JobConfig, tokenizer: BaseTokenizer + ) -> None: pass @abstractmethod @@ -71,7 +73,7 @@ def init_weights(self, buffer_device: torch.device | None = None) -> None: ..., tuple[_PipelineSchedule, list[nn.Module], bool, bool] ] DataLoaderBuilder: TypeAlias = Callable[..., BaseDataLoader] -TokenizerBuilder: TypeAlias = Callable[..., Tokenizer] +TokenizerBuilder: TypeAlias = Callable[..., BaseTokenizer] MetricsProcessorBuilder: TypeAlias = Callable[..., MetricsProcessor] OptimizersBuilder: TypeAlias = Callable[ [list[nn.Module], JobConfig, ParallelDims, DeviceMesh, FTManager],