import torch.nn as nn
from torch.nn import functional as F

+
+@dataclass
+class GPTConfig:
+    block_size: int = 1024
+    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    n_layer: int = 12
+    n_head: int = 12
+    n_embd: int = 768
+    dropout: float = 0.0
+    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+
+
+class LayerState:
+    # the recurrent neural network (RNN) state for a layer of RWKV5.2
+    def __init__(self, x, cfg: GPTConfig):
+        B, T, C, H, K = x.size(0), x.size(1), cfg.n_embd, cfg.n_head, cfg.n_embd // cfg.n_head
+        V = K
+        # a (B,C) tensor holding the last token embedding processed by the time mixer
+        self.time_mixer_x_state = torch.zeros(B, C, dtype=x.dtype, device=x.device)
+        # a (B,H,K,V) tensor holding a decaying token embedding memory for each head, where H=number_of_heads, K=key_dim_per_head, V=value_dim_per_head
+        self.kv_state = torch.zeros(B, H, K, V, dtype=x.dtype, device=x.device)
+        # a (B,C) tensor holding the last token embedding processed by the channel mixer
+        self.channel_mixer_x_state = torch.zeros(B, C, dtype=x.dtype, device=x.device)
+
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

@@ -138,6 +163,52 @@ def forward(self, x):
        y = self.dropout(self.output(y))
        return y

+    def forward_step(self, x, state, kv_state):
+        B, T = x.size(0), 1
+        C = x.size(-1) # n_embd
+        H, N = self.n_head, self.head_size
+        #
+        # unlike forward(), this processes a single incoming token embedding
+        # against the running recurrent state, so there is no chunking here
+        # and the recurrence loop below runs exactly once.
+        #
+
+        xx = state - x
+        xk = x + xx * self.time_maa_k
+        xv = x + xx * self.time_maa_v
+        xr = x + xx * self.time_maa_r
+        xg = x + xx * self.time_maa_g
+        r = self.receptance(xr).view(B, T, H, 1, N)
+        k = self.key(xk).view(B, T, H, N, 1)
+        v = self.value(xv).view(B, T, H, 1, N)
+        g = F.silu(self.gate(xg)) # extra gate
+
+        w = torch.exp(-torch.exp(self.time_decay.float())).unsqueeze(-1) # time_decay
+        u = self.time_faaaa.float().unsqueeze(-1) # time_first
+
+        y = torch.empty(B, T, H, N, dtype=x.dtype, device=x.device)
+        for t in range(T):
+            y[:, t], kv_state = self.single_timestep(r[:, t], k[:, t], v[:, t], u, w, kv_state)
+
+        y = y.transpose(1, 2).contiguous().view(B * T, C)
+        y = self.ln_x(y).view(B, T, C) * g
+
+        # output projection
+        y = self.dropout(self.output(y))
+        return y, x, kv_state
+
+    @staticmethod
+    def single_timestep(r, k, v, u, w, kv_state):
+        y = kv_state                   # BHKV
+        y = y + (k @ v) * u            # BHKV * HK1 + BHKV = BHKV
+        out = r @ y                    # BH1K @ BHKV = BH1V
+
+        kv_state = kv_state * w       # BHKV
+        kv_state = kv_state + (k @ v) # BHKV + BHKV = BHKV
+
+        return out.squeeze(-2), kv_state # BHV, BHKV
+
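For reference, `single_timestep` implements the RWKV-5 per-head recurrence. Writing $S$ for `kv_state`, $r_t$ as a $1\times K$ row vector, and $k_t v_t^\top$ for the $K\times V$ outer product formed by `k @ v`, the step above computes

$$
\mathrm{out}_t = r_t\left(S_{t-1} + \operatorname{diag}(u)\,k_t v_t^\top\right),
\qquad
S_t = \operatorname{diag}(w)\,S_{t-1} + k_t v_t^\top,
$$

where $u$ (`time_faaaa`) is the per-key-channel bonus applied only to the current token, and $w = \exp(-\exp(\texttt{time\_decay}))$ is the per-key-channel decay applied to the memory.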
class RWKV_ChannelMix_x051a(nn.Module):

    def __init__(self, config, layer_id):
@@ -169,6 +240,19 @@ def forward(self, x):
        x = self.dropout(x)
        return x

+    def forward_step(self, x, state):
+        xx = state - x
+        xk = x + xx * self.time_maa_k
+        xr = x + xx * self.time_maa_r
+
+        out = self.key(xk)
+        out = torch.relu(out) ** 2
+        out = self.value(out)
+        out = torch.sigmoid(self.receptance(xr)) * out
+        out = self.dropout(out)
+        return out, x
+
+
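In the same spirit, the single-token channel-mix step above is, with $x_{\text{prev}}$ denoting the incoming `state` and $\mu_k, \mu_r$ the learned `time_maa_k`, `time_maa_r` mixing factors:

$$
\mathrm{out} = \sigma\!\big(W_r\,\mathrm{lerp}(x, x_{\text{prev}}, \mu_r)\big)\odot W_v\,\mathrm{ReLU}\!\big(W_k\,\mathrm{lerp}(x, x_{\text{prev}}, \mu_k)\big)^2,
$$

where $\mathrm{lerp}(x, x_{\text{prev}}, \mu) = x + (x_{\text{prev}} - x)\,\mu$; the current $x$ is returned so that it becomes the state for the next token.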
class Block(nn.Module):

    def __init__(self, config, layer_id):
@@ -183,15 +267,15 @@ def forward(self, x):
        x = x + self.cmix(self.ln_2(x))
        return x

-@dataclass
-class GPTConfig:
-    block_size: int = 1024
-    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
-    n_layer: int = 12
-    n_head: int = 12
-    n_embd: int = 768
-    dropout: float = 0.0
-    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+    def forward_step(self, x, s: LayerState):
+        out, s.time_mixer_x_state, s.kv_state = \
+            self.tmix.forward_step(self.ln_1(x), s.time_mixer_x_state, s.kv_state)
+        x = x + out
+        out, s.channel_mixer_x_state = \
+            self.cmix.forward_step(self.ln_2(x), s.channel_mixer_x_state)
+        x = x + out
+        return x, s
+

class GPT(nn.Module):

@@ -253,11 +337,13 @@ def forward(self, idx, targets=None):

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
-        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        x = self.transformer.drop(tok_emb + pos_emb)
+        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
+        x = self.transformer.drop(tok_emb) # + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
+        return self.lm_head(x), None # early return: the targets/loss path below is now unreachable
+

        if targets is not None:
            # if we are given some desired targets also calculate the loss
@@ -270,6 +356,16 @@ def forward(self, idx, targets=None):

        return logits, loss

+    def forward_step(self, x, s):
+        tok_emb = self.transformer.wte(x) # token embeddings of shape (b, n_embd)
+        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (n_embd)
+        x = self.transformer.drop(tok_emb) # + pos_emb)
+        for layer_id, block in enumerate(self.transformer.h): # run each rwkv block
+            x, s[layer_id] = block.forward_step(x, s[layer_id])
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x)
+        return logits, s
+
    def crop_block_size(self, block_size):
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
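Below is a minimal, hypothetical usage sketch (not part of this commit) showing how the new recurrent path could be driven for batch-size-1 generation. `GPTConfig`, `GPT`, `LayerState`, and `forward_step` come from the diff above; the starting token id, the greedy decoding loop, and the untrained weights are illustrative assumptions only.

```python
import torch

cfg = GPTConfig()                    # defaults: 12 layers, 12 heads, 768 dims
model = GPT(cfg).eval()              # weights are random here; load a checkpoint in practice

B = 1                                # the step path is simplest with batch size 1
idx = torch.tensor([0], dtype=torch.long)   # (B,) current token id; 0 is an arbitrary start
dummy = torch.zeros(B, 1, cfg.n_embd)       # LayerState only reads shape/dtype/device from this
states = [LayerState(dummy, cfg) for _ in range(cfg.n_layer)]

generated = []
with torch.no_grad():
    for _ in range(16):
        # one token in, one next-token distribution out; per-layer states carried forward
        logits, states = model.forward_step(idx, states)
        idx = logits.view(B, -1).argmax(dim=-1)   # greedy pick; sampling also works
        generated.append(idx.item())
print(generated)
```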