diff --git a/tests/test_gpt_dataset.py b/tests/test_gpt_dataset.py
new file mode 100644
index 0000000..3b54103
--- /dev/null
+++ b/tests/test_gpt_dataset.py
@@ -0,0 +1,38 @@
+from types import SimpleNamespace
+import torch
+
+from toynlp.gpt.dataset import split_text_into_contexts
+
+
+class DummyTokenizer:
+    def __init__(self) -> None:
+        self._vocab: dict[str, int] = {"<pad>": 0, "<eos>": 1}
+
+    def encode(self, text: str) -> SimpleNamespace:
+        ids = [self._vocab.setdefault(char, len(self._vocab)) for char in text]
+        return SimpleNamespace(ids=ids)
+
+    def token_to_id(self, token: str) -> int | None:
+        return self._vocab.get(token)
+
+
+def test_split_text_includes_eos_and_pads_last_chunk() -> None:
+    tokenizer = DummyTokenizer()
+    contexts = split_text_into_contexts(["abcd"], max_length=3, tokenizer=tokenizer)
+
+    assert len(contexts) == 2
+    expected_first = torch.tensor([2, 3, 4], dtype=torch.long)
+    expected_second = torch.tensor([5, 1, 0], dtype=torch.long)
+    assert torch.equal(contexts[0], expected_first)
+    assert torch.equal(contexts[1], expected_second)
+
+
+def test_split_text_inserts_single_eos_per_document() -> None:
+    tokenizer = DummyTokenizer()
+    texts = ["alpha", "should_be_literal"]
+    contexts = split_text_into_contexts(texts, max_length=4, tokenizer=tokenizer)
+
+    eos_id = tokenizer.token_to_id("<eos>")
+    stacked = torch.stack(contexts)
+    eos_count = int((stacked == eos_id).sum().item())
+    assert eos_count == len(texts)
diff --git a/toynlp/gpt/README.md b/toynlp/gpt/README.md
index 062bf68..b755792 100644
--- a/toynlp/gpt/README.md
+++ b/toynlp/gpt/README.md
@@ -11,14 +11,14 @@ The differences with the original GPT model:
 
 | Aspect | Original GPT | This Implementation |
 |:--------:|:---------------:|:-------------------:|
-| Training Epochs | 100 | 45 |
+| Training Epochs | 100 | 22 |
 
 Performance comparison:
 
 | Metric | Original GPT | This Implementation |
 |:--------:|:---------------:|:-------------------:|
-| Perplexity| 18.4 | 24.3|
-| SST2 Accuracy | 91.3% | **92.69%** |
+| Perplexity| 18.4 | 25.6|
+| SST2 Accuracy | 91.3% | **x%** |
 
 
 ### The dataset is around 800M words(1B tokens)
diff --git a/toynlp/gpt/config.py b/toynlp/gpt/config.py
index 7e8e495..d4e4ad8 100644
--- a/toynlp/gpt/config.py
+++ b/toynlp/gpt/config.py
@@ -19,7 +19,7 @@ class GPTConfig:
     # model configs
     vocab_size: int = 40478  # paper: (BPE) vocabulary with 40,478 merges
     special_tokens: list[str] = field(
-        default_factory=lambda: ["<unk>", "<pad>"],
+        default_factory=lambda: ["<unk>", "<pad>", "<eos>"],
     )
     # model arch configs
     max_seq_length: int = 512  # paper setting: 128, 512
diff --git a/toynlp/gpt/dataset.py b/toynlp/gpt/dataset.py
index ca6444f..44695cb 100644
--- a/toynlp/gpt/dataset.py
+++ b/toynlp/gpt/dataset.py
@@ -9,16 +9,22 @@
 
 def split_text_into_contexts(texts: list[str], max_length: int, tokenizer: Tokenizer) -> list[torch.Tensor]:
     contexts = []
-    # print(f"len texts: {len(texts)}")
+    eos_id = tokenizer.token_to_id("<eos>")
+    pad_id = tokenizer.token_to_id("<pad>")
+    if eos_id is None or pad_id is None:
+        msg = "Missing required special tokens <eos> or <pad> in tokenizer vocabulary"
+        raise ValueError(msg)
+
     for text in texts:
-        # print(f"Processing text of length {len(text)}")
         token_ids = tokenizer.encode(text).ids
-        for i in range(len(token_ids) // max_length + 1):
-            start_idx = i * max_length
-            end_idx = (i + 1) * max_length
-            # print(f"i: {i}, start_idx: {start_idx}, end_idx: {end_idx}, len(token_ids): {len(token_ids)}")
-            if end_idx < len(token_ids):
-                contexts.append(torch.tensor(token_ids[start_idx:end_idx], dtype=torch.long))
+        token_ids.append(eos_id)
+
+        for start_idx in range(0, len(token_ids), max_length):
+            chunk = token_ids[start_idx : start_idx + max_length]
+            if len(chunk) < max_length:
+                chunk.extend([pad_id] * (max_length - len(chunk)))
+            contexts.append(torch.tensor(chunk, dtype=torch.long))
+
     return contexts
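
For reference, a minimal usage sketch of the new chunking behaviour. The `<eos>`/`<pad>` special-token names follow the reconstruction above, and the character-level `CharTokenizer` stub is a stand-in for the real BPE tokenizer, not part of this change:

```python
# Sketch: how split_text_into_contexts chunks, terminates, and pads a document.
from types import SimpleNamespace

from toynlp.gpt.dataset import split_text_into_contexts


class CharTokenizer:
    """Hypothetical character-level stub mirroring the tokenizer interface used above."""

    def __init__(self) -> None:
        self._vocab = {"<pad>": 0, "<eos>": 1}

    def encode(self, text: str) -> SimpleNamespace:
        # Assign ids on first sight so the example is deterministic.
        return SimpleNamespace(ids=[self._vocab.setdefault(c, len(self._vocab)) for c in text])

    def token_to_id(self, token: str) -> int | None:
        return self._vocab.get(token)


contexts = split_text_into_contexts(["abcd"], max_length=3, tokenizer=CharTokenizer())
# "abcd" encodes to [2, 3, 4, 5]; <eos> (id 1) is appended per document and the
# final short chunk is padded with <pad> (id 0), giving [2, 3, 4] and [5, 1, 0].
print(contexts)
```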