109 changes: 109 additions & 0 deletions bionemo-recipes/models/llama3/create_tokenizer.py
@@ -0,0 +1,109 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2

"""Script to create the HuggingFace PreTrainedTokenizerFast for nucleotide sequences.

This script creates a tokenizer that:
1. Maps each single-byte character (code points 0-255) to its ord() value
2. Uses special tokens following the NeMo convention (EOS=0, PAD=1, BOS=2, UNK=3)
3. Works with AutoTokenizer.from_pretrained()

Run this script to regenerate the tokenizer files if needed.
"""

import logging
import os

from tokenizers import Regex, Tokenizer, processors
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Split
from transformers import PreTrainedTokenizerFast


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def create_nucleotide_tokenizer(
eos_id: int = 0,
pad_id: int = 1,
bos_id: int = 2,
unk_id: int = 3,
) -> PreTrainedTokenizerFast:
"""Create a PreTrainedTokenizerFast for nucleotide sequences.

Uses special token IDs for causal language modeling:
- BOS = 2 (beginning of sequence)
- EOS = 0 (end of sequence)
- PAD = 1 (padding)
- UNK = 3 (unknown)

Args:
eos_id: End-of-sequence token ID (default: 0)
pad_id: Padding token ID (default: 1)
bos_id: Beginning-of-sequence token ID (default: 2)
unk_id: Unknown token ID (default: 3)

Returns:
PreTrainedTokenizerFast ready to use and save
"""
# Define special tokens
special_tokens = {
"<BOS>": bos_id,
"<EOS>": eos_id,
"<PAD>": pad_id,
"<UNK>": unk_id,
}

    # Build vocab: map each single-byte character (code points 0-255) to its ord() value
# IMPORTANT: Exclude reserved IDs for special tokens
reserved_ids = set(special_tokens.values())
vocab = {chr(i): i for i in range(256) if i not in reserved_ids}
vocab = {**vocab, **special_tokens}

# Create Rust tokenizer backend with WordLevel model
tokenizer = Tokenizer(WordLevel(vocab, unk_token="<UNK>"))

    # Configure pre-tokenizer: split into individual characters.
    # Note: an empty-string pattern is treated as "no match" by the Rust backend,
    # which would leave the whole sequence as a single (unknown) word, so use a
    # regex that matches each character instead.
    tokenizer.pre_tokenizer = Split(pattern=Regex("."), behavior="isolated")

# Configure post-processor: Add BOS/EOS tokens automatically
tokenizer.post_processor = processors.TemplateProcessing(
single="<BOS> $A <EOS>",
pair="<BOS> $A <EOS> <BOS> $B <EOS>",
special_tokens=[
("<BOS>", bos_id),
("<EOS>", eos_id),
],
)

# Wrap in HuggingFace PreTrainedTokenizerFast
hf_tokenizer = PreTrainedTokenizerFast(
tokenizer_object=tokenizer,
unk_token="<UNK>",
pad_token="<PAD>",
eos_token="<EOS>",
bos_token="<BOS>",
)

return hf_tokenizer


def main():
"""Create and save the nucleotide tokenizer."""
logger.info("Creating nucleotide tokenizer")

# Create tokenizer with default settings (BOS=2, EOS=0, PAD=1, UNK=3)
tokenizer = create_nucleotide_tokenizer()

logger.info(f"Vocab size: {tokenizer.vocab_size}")
logger.info(f"Special tokens: BOS={tokenizer.bos_token_id}, EOS={tokenizer.eos_token_id}, PAD={tokenizer.pad_token_id}, UNK={tokenizer.unk_token_id}")

# Save to default location
save_path = os.path.join(os.path.dirname(__file__), "nucleotide_fast_tokenizer")
tokenizer.save_pretrained(save_path)
logger.info(f"Tokenizer saved to: {save_path}")


if __name__ == "__main__":
main()
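
For a quick sanity check of the generated artifacts, something like the following should work (a sketch; the path assumes the script's default save location inside bionemo-recipes/models/llama3):

from transformers import AutoTokenizer

# Load the files written by create_tokenizer.py (path is assumed, not part of this diff).
tok = AutoTokenizer.from_pretrained("bionemo-recipes/models/llama3/nucleotide_fast_tokenizer")

# The post-processor wraps each sequence in <BOS> ... <EOS>; bases map to their ord() values.
ids = tok("ACGT")["input_ids"]
print(ids)  # expected: [2, 65, 67, 71, 84, 0]
assert ids[0] == tok.bos_token_id and ids[-1] == tok.eos_token_id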
@@ -0,0 +1,6 @@
{
"bos_token": "<BOS>",
"eos_token": "<EOS>",
"pad_token": "<PAD>",
"unk_token": "<UNK>"
}
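
This JSON (presumably the special_tokens_map.json that save_pretrained writes) records the special-token strings; the IDs themselves come from the vocab. A minimal padding check, under the same path assumption as above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bionemo-recipes/models/llama3/nucleotide_fast_tokenizer")

# padding=True right-pads the shorter sequence with <PAD> (id 1).
batch = tok(["ACGT", "AC"], padding=True)
print(batch["input_ids"])       # expected: [[2, 65, 67, 71, 84, 0], [2, 65, 67, 0, 1, 1]]
print(batch["attention_mask"])  # pad positions are masked out with 0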