
Commit 26857a1

Author: Christian Newman (committed)

Tested tree- and LM-based run and train. Added thorough documentation of how the NN code works.
1 parent f067186 commit 26857a1

File tree

6 files changed: +349 −141 lines


main

Lines changed: 1 addition & 2 deletions
@@ -67,12 +67,11 @@ if __name__ == "__main__":
             download_files()
             train_tree(config)
         elif args.model_type == "lm_based":
-            download_files()
             train_lm(SCRIPT_DIR)
 
     elif args.mode == "run":
         if args.model_type == "tree_based":
-            config = load_config_tree()
+            config = load_config_tree(SCRIPT_DIR)
             # Inject overrides
             download_files()
             config["model_type"] = args.model_type

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -6,5 +6,6 @@ pytorch-crf==0.7.2
 scikit-learn==1.6.1
 spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee
 torch==2.7.1
-transformers==4.52.4
 waitress==3.0.2
+gensim==4.3.3
+transformers[torch]

src/lm_based_tagger/distilbert_crf.py

Lines changed: 90 additions & 31 deletions
@@ -6,16 +6,39 @@
 
 class DistilBertCRFForTokenClassification(nn.Module):
     """
-    DistilBERT ➜ dropout ➜ linear projection ➜ CRF.
-    The CRF layer models label-to-label transitions, so the model
-    is optimised at *sequence* level rather than *token* level.
+    Token-level classifier that combines DistilBERT with a CRF layer for structured prediction.
+
+    Architecture:
+        input_ids, attention_mask
+
+        DistilBERT (pretrained encoder)
+
+        Dropout
+
+        Linear layer (projects hidden size → num_labels)
+
+        CRF layer (models sequence-level transitions)
+
+    Training:
+        - Uses negative log-likelihood from CRF as loss.
+        - Learns both emission scores (token-level confidence) and
+          transition scores (label-to-label sequence consistency).
+
+    Inference:
+        - Uses Viterbi decoding to predict the most likely sequence of labels.
+
+    Output:
+        During training:
+            {"loss": ..., "logits": ...}
+        During inference:
+            {"logits": ..., "predictions": List[List[int]]}
+
+    Example input shape:
+        input_ids: [B, T] — e.g. [16, 128]
+        attention_mask: [B, T] — 1 for real tokens, 0 for padding
+        logits: [B, T, C] — C = number of label classes
     """
-    def __init__(self,
-                 num_labels: int,
-                 id2label: dict,
-                 label2id: dict,
-                 pretrained_name: str = "distilbert-base-uncased",
-                 dropout_prob: float = 0.1):
+    def __init__(self, num_labels: int, id2label: dict, label2id: dict, pretrained_name: str = "distilbert-base-uncased", dropout_prob: float = 0.1):
         super().__init__()
 
         self.config = DistilBertConfig.from_pretrained(

@@ -29,11 +52,34 @@ def __init__(self,
         self.classifier = nn.Linear(self.config.hidden_size, num_labels)
         self.crf = CRF(num_labels, batch_first=True)
 
-    def forward(self,
-                input_ids=None,
-                attention_mask=None,
-                labels=None,
-                **kwargs):
+    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
+        """
+        Forward pass for training or inference.
+
+        Args:
+            input_ids (Tensor): Token IDs of shape [B, T]
+            attention_mask (Tensor): Attention mask of shape [B, T]
+            labels (Tensor, optional): Ground-truth labels of shape [B, T]. Required during training.
+            kwargs: Any additional DistilBERT-compatible inputs (e.g., head_mask, position_ids, etc.)
+
+        Returns:
+            If labels are provided (training mode):
+                dict with:
+                    - loss (Tensor): scalar negative log-likelihood from CRF
+                    - logits (Tensor): emission scores of shape [B, T, C]
+
+            If labels are not provided (inference mode):
+                dict with:
+                    - logits (Tensor): emission scores of shape [B, T, C]
+                    - predictions (List[List[int]]): decoded label IDs from CRF,
+                      one list per sequence,
+                      each of length T-2 (excluding [CLS] and [SEP])
+
+        Notes:
+            - logits: [B, T, C], where B = batch size, T = sequence length, C = number of label classes
+            - predictions: List[List[int]], where each inner list has length T-2
+              (i.e., excludes [CLS] and [SEP]) and contains Viterbi-decoded label IDs
+        """
 
         # Hugging Face occasionally injects helper fields (e.g. num_items_in_batch)
         # Filter `kwargs` down to what DistilBertModel.forward actually accepts.

@@ -48,36 +94,49 @@ def forward(self,
             attention_mask=attention_mask,
             **bert_kwargs,
         )
-        # —— Build emissions once ——————————————————————————————
-        sequence_output = self.dropout(outputs[0])          # [B, T, H]
-        emission_scores = self.classifier(sequence_output)  # [B, T, C]
+        # 1) Compute per-token emission scores
+        #    Applies dropout to the BERT hidden states, then projects them to label logits.
+        #    Shape: [B, T, C], where B=batch size, T=sequence length, C=number of classes
+        sequence_output = self.dropout(outputs[0])
+        emission_scores = self.classifier(sequence_output)
 
-        # ============================== TRAINING ==============================
         if labels is not None:
-            # 1. Drop [CLS] (idx 0) and [SEP] (idx -1)
-            emissions = emission_scores[:, 1:-1, :]  # [B, T-2, C]
-            tags = labels[:, 1:-1].clone()           # [B, T-2]
-            crf_mask = (tags != -100)                # True = keep
+            # 2) Remove [CLS] and [SEP] special tokens from emissions and labels
+            #    These tokens were added by the tokenizer but are not part of the identifier
+            emissions = emission_scores[:, 1:-1, :]  # [B, T-2, C]
+            tags = labels[:, 1:-1].clone()           # [B, T-2]
 
-            # 2. For any position that's masked-off ➜ set tag to a valid id (0)
+            # 3) Create a mask: True where label is valid, False where label == -100
+            #    The CRF will use this to ignore special/padded tokens
+            crf_mask = (tags != -100)
+
+            # 4) Replace invalid label positions (-100) with a dummy label (e.g., 0)
+            #    This is required because CRF expects a label at every position, even if masked
             tags[~crf_mask] = 0
 
-            # 3. Guarantee first timestep is ON for every sequence
+            # 5) Ensure the first token of every sequence is active in the CRF mask
+            #    This avoids CRF errors when the first token is masked out (which breaks decoding)
             first_off = (~crf_mask[:, 0]).nonzero(as_tuple=True)[0]
             if len(first_off):
-                crf_mask[first_off, 0] = True  # flip mask to ON
-                tags[first_off, 0] = 0         # give it tag 0
+                crf_mask[first_off, 0] = True
+                tags[first_off, 0] = 0  # assign a dummy label
 
+            # 6) Compute CRF negative log-likelihood loss
             loss = -self.crf(emissions, tags, mask=crf_mask, reduction="mean")
            return {"loss": loss, "logits": emission_scores}
 
-        # ============================= INFERENCE ==============================
         else:
-            crf_mask = attention_mask[:, 1:-1].bool()  # [B, T-2]
-            emissions = emission_scores[:, 1:-1, :]    # [B, T-2, C]
+            # INFERENCE MODE
+
+            # 2) Remove [CLS] and [SEP] from emissions and build CRF mask from attention
+            #    Only use the inner content of the input sequence
+            crf_mask = attention_mask[:, 1:-1].bool()  # [B, T-2]
+            emissions = emission_scores[:, 1:-1, :]    # [B, T-2, C]
+
+            # 3) Run Viterbi decoding to get best label sequence for each input
             best_paths = self.crf.decode(emissions, mask=crf_mask)
-            return {"logits": emission_scores,
-                    "predictions": best_paths}
+            return {"logits": emission_scores, "predictions": best_paths}
+
     @classmethod
     def from_pretrained(cls, ckpt_dir, local=False, **kw):
         from safetensors.torch import load_file as load_safe_file
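
Note (editorial): the docstrings added above define the model's training and inference contracts. The snippet below is a minimal inference sketch, not part of the commit; it assumes the class is importable from src/lm_based_tagger/distilbert_crf.py with the repository root on PYTHONPATH, and the three-label id2label map is a made-up placeholder rather than the project's real grammar-tag set.

import torch
from transformers import DistilBertTokenizerFast

# Assumes the repo root is on PYTHONPATH; adjust the import to your layout.
from src.lm_based_tagger.distilbert_crf import DistilBertCRFForTokenClassification

# Placeholder label mapping for illustration only; the real tag set lives in the project config.
id2label = {0: "N", 1: "V", 2: "NM"}
label2id = {v: k for k, v in id2label.items()}

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertCRFForTokenClassification(
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model.eval()  # pretrained encoder plus randomly initialised classifier/CRF; a real run loads a fine-tuned checkpoint

# One identifier, pre-split into tokens (as prepare_dataset would produce).
tokens = ["get", "Employee", "Name"]
enc = tokenizer([tokens], is_split_into_words=True, return_tensors="pt")

with torch.no_grad():
    out = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])

# out["logits"] has shape [B, T, C]; out["predictions"] holds Viterbi-decoded label ids,
# each inner list of length T-2 because forward() strips [CLS] and [SEP] before the CRF.
print(out["predictions"])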

src/lm_based_tagger/distilbert_preprocessing.py

Lines changed: 61 additions & 12 deletions
@@ -1,14 +1,8 @@
 import re
-from nltk import pos_tag
-import nltk
 from difflib import SequenceMatcher
 import pandas as pd
 from datasets import Dataset
 
-# Download once (we'll just do it quietly here)
-nltk.download('averaged_perceptron_tagger_eng', quiet=True)
-nltk.download('universal_tagset', quiet=True)
-
 # === Constants ===
 VOWELS = set("aeiou")
 LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"}

@@ -27,15 +21,13 @@
     "hungarian",
     "cvr",
     "digit",
-    #"nltk"
 ]
 
 FEATURE_FUNCTIONS = {
     "context": lambda row, tokens: CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown"),
     "hungarian": lambda row, tokens: detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none",
     "cvr": lambda row, tokens: consonant_vowel_ratio_bucket(tokens),
     "digit": lambda row, tokens: detect_digit_feature(tokens),
-    "nltk": lambda row, tokens: "@nltk_" + '-'.join(tag.lower() for _, tag in pos_tag(tokens, tagset="universal"))
 }
 
 def get_feature_tokens(row, tokens):

@@ -99,6 +91,38 @@ def normalize_language(lang_str):
     return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp")
 
 def prepare_dataset(df: pd.DataFrame, label2id: dict):
+    """
+    Converts a DataFrame of identifier tokens and grammar tags into a HuggingFace Dataset
+    formatted for NER training with feature and position tokens.
+
+    Each row in the input DataFrame should contain:
+        - tokens: List[str] (e.g., ['get', 'Employee', 'Name'])
+        - tags: List[str] (e.g., ['V', 'NM', 'N'])
+        - CONTEXT: str (e.g., 'function')
+
+    The function adds:
+        - Feature tokens: ['@hung_get', '@no_digit', '@cvr_mid', '@func']
+        - Interleaved position and real tokens:
+            ['@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name']
+
+    The NER tags are aligned so that:
+        - Feature tokens and position markers get label -100 (ignored in loss)
+        - Real tokens are converted from grammar tags using `label2id`
+
+    Example Input:
+        df = pd.DataFrame([{
+            "tokens": ["get", "Employee", "Name"],
+            "tags": ["V", "NM", "N"],
+            "CONTEXT": "function"
+        }])
+
+    Example Output:
+        Dataset with:
+            tokens: ['@hung_get', '@no_digit', '@cvr_mid', '@func',
+                     '@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name']
+            ner_tags: [-100, -100, -100, -100,
+                       -100, 1, -100, 2, -100, 3]  # assuming label2id = {"V": 1, "NM": 2, "N": 3}
+    """
     rows = []
     for _, row in df.iterrows():
         tokens = row["tokens"]
@@ -123,9 +147,34 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict):
         "ner_tags": [r["ner_tags"] for r in rows]
     })
 
-def tokenize_and_align_labels(example, tokenizer):
+def tokenize_and_align_labels(sample, tokenizer):
+    """
+    Tokenizes an example and aligns NER labels with subword tokens.
+
+    The input `sample` comes from `prepare_dataset()` and contains:
+        - tokens: List[str], including feature and position tokens
+        - ner_tags: List[int], aligned with `tokens`, with -100 for ignored tokens
+
+    This function:
+        - Uses `is_split_into_words=True` to tokenize each item in `tokens`
+        - Uses `tokenizer.word_ids()` to map each subword back to its original token index
+        - Assigns the corresponding label (or -100) for each subword token
+
+    Example Input:
+        sample = {
+            "tokens": ['@hung_get', '@no_digit', '@cvr_mid', '@func',
+                       '@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name'],
+            "ner_tags": [-100, -100, -100, -100,
+                         -100, 1, -100, 2, -100, 3]
+        }
+
+    Assuming 'Employee' is tokenized to ['Em', '##ployee'],
+    Example Output:
+        tokenized["labels"] = [-100, -100, -100, -100,
+                               -100, 1, -100, 2, 2, -100, 3]
+    """
     tokenized = tokenizer(
-        example["tokens"],
+        sample["tokens"],
         truncation=True,
         is_split_into_words=True
     )

@@ -136,8 +185,8 @@ def tokenize_and_align_labels(example, tokenizer):
     for word_id in word_ids:
         if word_id is None:
             labels.append(-100)
-        elif word_id < len(example["ner_tags"]):
-            labels.append(example["ner_tags"][word_id])
+        elif word_id < len(sample["ner_tags"]):
+            labels.append(sample["ner_tags"][word_id])
         else:
             labels.append(-100)
 