
Commit 12995a1

Remove casts in nanoGPT's layernorms (#662)
1 parent 6c3162a commit 12995a1

File tree: 2 files changed (+8, -12 lines)

tripy/examples/nanogpt/model.py

Lines changed: 6 additions & 8 deletions
@@ -117,16 +117,14 @@ def forward(self, x):
 class Block(tp.Module):
     def __init__(self, config):
         super().__init__()
-        self.ln_1 = tp.LayerNorm(config.embedding_size)
+        self.ln_1 = tp.LayerNorm(config.embedding_size, dtype=config.dtype)
         self.attn = CausalSelfAttention(config)
-        self.ln_2 = tp.LayerNorm(config.embedding_size)
+        self.ln_2 = tp.LayerNorm(config.embedding_size, dtype=config.dtype)
         self.mlp = MLP(config)

     def forward(self, x):
-        x_ln1 = tp.cast(self.ln_1(tp.cast(x, self.ln_1.dtype)), x.dtype)
-        x = x + self.attn(x_ln1)
-        x_ln2 = tp.cast(self.ln_2(tp.cast(x, self.ln_2.dtype)), x.dtype)
-        x = x + self.mlp(x_ln2)
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
         return x

@@ -137,15 +135,15 @@ def __init__(self, config):
         self.wte = tp.Embedding(config.vocab_size, config.embedding_size, dtype=config.dtype)
         self.wpe = tp.Embedding(config.block_size, config.embedding_size, dtype=config.dtype)
         self.h = tp.Sequential(*[Block(config) for _ in range(config.num_layers)])
-        self.ln_f = tp.LayerNorm(config.embedding_size)
+        self.ln_f = tp.LayerNorm(config.embedding_size, dtype=config.dtype)

     def forward(self, idx):
         tok_emb = self.wte(idx)  # token embeddings of shape (batch_size, seq_len, embedding_size)
         pos = tp.unsqueeze(tp.arange(self.seq_len, dtype=tp.int32)[: idx.shape[1]], 0)
         pos_emb = self.wpe(pos)  # position embeddings of shape (seq_len, embedding_size)
         x = tok_emb + pos_emb  # (batch_size, seq_len, embedding_size)
         x = self.h(x)
-        x = tp.cast(self.ln_f(tp.cast(x, self.ln_f.dtype)), x.dtype)
+        x = self.ln_f(x)
         return x
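The net effect in model.py: each tp.LayerNorm is now constructed in the model's dtype (config.dtype), so the forward passes no longer need tp.cast round-trips on the activations. A minimal sketch of the pattern, assuming the example's `tp` alias for Tripy; the class name NormSketch is illustrative, not part of the example:

import tripy as tp  # assumed import alias; matches the "tp." prefix used in the diff

class NormSketch(tp.Module):
    """Illustrative only: the cast-free LayerNorm pattern this commit adopts."""

    def __init__(self, embedding_size, dtype):
        super().__init__()
        # The norm is built directly in the model dtype (e.g. tp.float16),
        # so its parameters and output already match the activations.
        self.ln = tp.LayerNorm(embedding_size, dtype=dtype)

    def forward(self, x):
        # Old pattern removed by this commit:
        #   tp.cast(self.ln(tp.cast(x, self.ln.dtype)), x.dtype)
        # New pattern: call the norm directly, no casts.
        return self.ln(x)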

tripy/examples/nanogpt/weight_loader.py

Lines changed: 2 additions & 4 deletions
@@ -51,8 +51,7 @@ def load_weights_from_hf(model, model_type, dtype):
         if any(key.endswith(w) for w in transposed):
             with torch.no_grad():
                 weight = hf_state_dict[key].t().contiguous()
-            if "ln" not in key:
-                weight = weight.to(torch_dtype)
+            weight = weight.to(torch_dtype)
             param = tp.Tensor(weight)
         tripy_state_dict[key] = param

@@ -112,8 +111,7 @@ def get_submodule(module, attr_name):
         key, _ = key.split("quantizer._amax")
         key += "scale"

-        if "ln" not in key:
-            weight = weight.to(torch_dtype)
+        weight = weight.to(torch_dtype)
         param = tp.Tensor(weight.contiguous())
         assert key in expected_keys
         tripy_state_dict[key] = param
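In weight_loader.py the old `if "ln" not in key` guard skipped the dtype conversion for layernorm weights, which the cast-based forward pass relied on; with the norms now built in config.dtype, every weight is converted uniformly. A small sketch of the resulting conversion step, assuming the same torch/Tripy objects the loader already uses; the helper name convert_weight is illustrative:

import torch
import tripy as tp  # assumed import alias

def convert_weight(weight: torch.Tensor, torch_dtype: torch.dtype) -> tp.Tensor:
    # After this commit, all weights, including the "ln" (layernorm) parameters,
    # go through the same dtype conversion before being wrapped in a tp.Tensor.
    weight = weight.to(torch_dtype)
    return tp.Tensor(weight.contiguous())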
