From 259dbb3453882f67e4ea80b1f94ffe5a3b9f58a4 Mon Sep 17 00:00:00 2001 From: SyreenBan Date: Wed, 5 Feb 2025 20:23:58 -0500 Subject: [PATCH 01/51] add split_by_capitals --- tag_identifier.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tag_identifier.py b/tag_identifier.py index 8195c4e..d781b1c 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -8,6 +8,7 @@ from waitress import serve from spiral import ronin import json +import re from create_models import createModel, stable_features, mutable_feature_list app = Flask(__name__) @@ -159,6 +160,11 @@ def start_server(temp_config = {}): serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) data.close() +def split_by_capitals(name: str): + matches = re.finditer(r'[A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+', name) + words = [match.group() for match in matches] + return words + def dictionary_lookup(word): #return true if the word exists in the dictionary (the nltk words corpus) #or if the word is in the list of approved words @@ -218,6 +224,9 @@ def listen(student, identifier_name: str, identifier_context: str) -> List[dict] # Split identifier_name into words words = identifier_name.split('_') + if (len(words) == 1 and identifier_name == words[0]): + words = split_by_capitals(identifier_name) + # # Create initial data frame data = pd.DataFrame({ 'WORD': words, From a8b4ba36a1906b886ed8d5420f1c6b5eb045f3c0 Mon Sep 17 00:00:00 2001 From: SyreenBan Date: Thu, 13 Feb 2025 12:28:33 -0500 Subject: [PATCH 02/51] fix the Splitter --- tag_identifier.py | 726 +++++++++++++++++++++++----------------------- 1 file changed, 359 insertions(+), 367 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index d781b1c..bb89017 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -1,367 +1,359 @@ -import os -import time -import joblib -import nltk -import pandas as pd -from feature_generator import * -from flask import Flask -from waitress import serve -from spiral import ronin -import json -import re -from create_models import createModel, stable_features, mutable_feature_list -app = Flask(__name__) - -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -class ModelData: - def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: - """ - Initialize an instance of the ModelData class with word vector models. - - Args: - ModelTokens: Word vectors model for tokens. - ModelMethods: Word vectors model for methods. - ModelGensimEnglish: Word vectors model for general English words. 
- """ - - self.ModelTokens = modelTokens - self.ModelMethods = modelMethods - self.ModelGensimEnglish = modelGensimEnglish - self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') - -class AppCache: - def __init__(self, Path, Filename) -> None: - self.Cache = {} - self.Path = Path - self.Filename = Filename - - def load(self): - if not os.path.isdir(self.Path): - raise Exception("Cannot load path: "+self.Path) - else: - if not os.path.isfile(self.Path+"/"+self.Filename): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump({}, JSONcache) - JSONcache.close() - JSONcache = open(self.Path+"/"+self.Filename, 'r') - self.Cache = json.load(JSONcache) - JSONcache.close() - - def add(self, identifier, result): - info = result - info.update({"firstEncounter": time.time()}) - info.update({"lastEncounter": time.time()}) - info.update({"count": 1}) - info.update({"version": "SCANL 1.0"}) - self.Cache.update({identifier : info}) - - def encounter(self, identifier): - self.Cache[identifier].update({"lastEncounter": time.time()}) - self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) - self.Cache[identifier].update({"version": "SCANL 1.0"}) - - def save(self): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump(self.Cache, JSONcache) - JSONcache.close() - -class WordList: - def __init__(self, Path): - self.Words = set() - self.Path = Path - - def load(self): - if not os.path.isfile(self.Path): - print("Could not find word list file!") - return - with open(self.Path) as file: - for line in file: - self.Words.add(line[:line.find(',')]) #stop at comma - - def find(self, item): - return item in self.Words - -def initialize_model(): - """ - Initialize and load word vectors for the application, and load a word count DataFrame. - - This function initializes and loads word vectors using the 'createModel' function, and loads word counts - from a JSON file into a Pandas DataFrame for use in the application. - - Returns: - tuple: (ModelData, WORD_COUNT DataFrame) - """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) - -def start_server(temp_config = {}): - """ - Initialize the model and start the server. - - This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using - the waitress `serve` method, allowing incoming HTTP requests to be handled. - - The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to - listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). 
- - Returns: - None - """ - print('initializing model...') - initialize_model() - - print("loading cache...") - if not os.path.isdir("cache"): os.mkdir("cache") - app.cache = AppCache("cache", "cache.json") - app.studentCache = AppCache("cache", "student_cache.json") - app.cache.load() - - print("loading dictionary...") - nltk.download("words") - app.english_words = set(w.lower() for w in nltk.corpus.words.words()) - #insert english words from words/en.txt - if not os.path.exists("words/en.txt"): - print("could not find English words, using WordNet only!") - else: - with open("words/en.txt") as words: - for word in words: - app.english_words.add(word[:-1]) - - print('retrieving server configuration...') - data = open('serve.json') - config = json.load(data) - - server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] - server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] - server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] - - print("loading word list...") - wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] - app.words = WordList(wordListPath) - app.words.load() - - print("Starting server...") - serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) - data.close() - -def split_by_capitals(name: str): - matches = re.finditer(r'[A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+', name) - words = [match.group() for match in matches] - return words - -def dictionary_lookup(word): - #return true if the word exists in the dictionary (the nltk words corpus) - #or if the word is in the list of approved words - dictionaryType = "" - dictionary = word.lower() in app.english_words - acceptable = app.words.find(word) - digit = word.isnumeric() - if (dictionary): - dictionaryType = "DW" - elif (acceptable): - dictionaryType = "AW" - elif (digit): - dictionaryType = "DD" - else: - dictionaryType = "UC" - - return dictionaryType - -#TODO: this is not an intuitive way to save cache -@app.route('/') -def save(): - app.cache.save() - app.studentCache.save() - return "successfully saved cache" - -#TODO: use a query string instead for specifying student cache -@app.route('///') -def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: - #check if identifier name has already been used - cache = None; - - if (student == "student"): - cache = app.studentCache - else: - cache = app.cache - - if (identifier_name in cache.Cache.keys()): - cache.encounter(identifier_name) - return cache.Cache[identifier_name] - - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. 
- """ - print(f"INPUT: {identifier_name} {identifier_context}") - - # Split identifier_name into words - words = identifier_name.split('_') - - if (len(words) == 1 and identifier_name == words[0]): - words = split_by_capitals(identifier_name) - - # # Create initial data frame - data = pd.DataFrame({ - 'WORD': words, - 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number - }) - - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data - data = createFeatures( - data, - mutable_feature_list, - modelGensimEnglish=app.model_data.ModelGensimEnglish, - ) - - categorical_features = ['NLTK_POS'] - category_variables = [] - - for category_column in categorical_features: - if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier - clf = joblib.load(os.path.join(SCRIPT_DIR, 'output', 'model_GradientBoostingClassifier.pkl')) - predicted_tags = annotate_identifier(clf, data) - - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] - dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) - - # append result to cache - cache.add(identifier_name, result) - - return result - -def context_to_number(context): - """ - Convert a textual context description to a numerical representation. - - This function takes a context description as a string and maps it to a numerical representation according to a - predefined mapping. - - Args: - context (str): The textual context description. - - Returns: - int: The numerical representation of the context. - - Raises: - ValueError: If the provided context is not one of the predefined values. - - Example: - numeric_context = context_to_number("CLASS") - """ - if context == "ATTRIBUTE": - return 1 - elif context == "CLASS": - return 2 - elif context == "DECLARATION": - return 3 - elif context == "FUNCTION": - return 4 - elif context == "PARAMETER": - return 5 - -def annotate_identifier(clf, data): - """ - Annotate identifier tokens using a trained classifier. - - This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the - classifier to predict labels for the identifier tokens. - - Args: - clf (Classifier): The trained classifier model. - data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should - match the feature names used during training. - - Returns: - np.array: An array of predicted labels for the identifier tokens. 
- """ - # Drop unnecessary columns - data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') - - # Ensure only the features used during training are included - trained_features = clf.feature_names_in_ # Features expected by the classifier - missing_features = set(trained_features) - set(data.columns) - extra_features = set(data.columns) - set(trained_features) - - if missing_features: - raise ValueError(f"The following expected features are missing: {missing_features}") - if extra_features: - print(f"Warning: The following unused features are being ignored: {extra_features}") - data = data[trained_features] - - # Ensure feature order matches the trained model - df_features = data[trained_features] - - print("THESE") - print(df_features) - - print("THOSE") - print(clf.feature_names_in_) - - # Make predictions - y_pred = clf.predict(df_features) - return y_pred +import os +import time +import joblib +import nltk +import pandas as pd +from feature_generator import * +from flask import Flask +from waitress import serve +from spiral import ronin +import json +from create_models import createModel, stable_features, mutable_feature_list + +app = Flask(__name__) + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +class ModelData: + def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: + """ + Initialize an instance of the ModelData class with word vector models. + + Args: + ModelTokens: Word vectors model for tokens. + ModelMethods: Word vectors model for methods. + ModelGensimEnglish: Word vectors model for general English words. + """ + + self.ModelTokens = modelTokens + self.ModelMethods = modelMethods + self.ModelGensimEnglish = modelGensimEnglish + self.wordCount = wordCount + # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') + +class AppCache: + def __init__(self, Path, Filename) -> None: + self.Cache = {} + self.Path = Path + self.Filename = Filename + + def load(self): + if not os.path.isdir(self.Path): + raise Exception("Cannot load path: "+self.Path) + else: + if not os.path.isfile(self.Path+"/"+self.Filename): + JSONcache = open(self.Path+"/"+self.Filename, 'w') + json.dump({}, JSONcache) + JSONcache.close() + JSONcache = open(self.Path+"/"+self.Filename, 'r') + self.Cache = json.load(JSONcache) + JSONcache.close() + + def add(self, identifier, result): + info = result + info.update({"firstEncounter": time.time()}) + info.update({"lastEncounter": time.time()}) + info.update({"count": 1}) + info.update({"version": "SCANL 1.0"}) + self.Cache.update({identifier : info}) + + def encounter(self, identifier): + self.Cache[identifier].update({"lastEncounter": time.time()}) + self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) + self.Cache[identifier].update({"version": "SCANL 1.0"}) + + def save(self): + JSONcache = open(self.Path+"/"+self.Filename, 'w') + json.dump(self.Cache, JSONcache) + JSONcache.close() + +class WordList: + def __init__(self, Path): + self.Words = set() + self.Path = Path + + def load(self): + if not os.path.isfile(self.Path): + print("Could not find word list file!") + return + with open(self.Path) as file: + for line in file: + self.Words.add(line[:line.find(',')]) #stop at comma + + def find(self, item): + return item in self.Words + +def initialize_model(): + """ + Initialize and load word vectors for the application, and load a word count DataFrame. 
+ + This function initializes and loads word vectors using the 'createModel' function, and loads word counts + from a JSON file into a Pandas DataFrame for use in the application. + + Returns: + tuple: (ModelData, WORD_COUNT DataFrame) + """ + print("Loading word vectors!!") + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) + print("Word vectors loaded!!") + + # Load the word count JSON file into a DataFrame + word_count_path = os.path.join("input", "word_count.json") + if os.path.exists(word_count_path): + print(f"Loading word count data from {word_count_path}...") + word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() + word_count_df.columns = ['word', 'log_frequency'] + print("Word count data loaded!") + else: + print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") + word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) + + # Create and store model data + app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + +def start_server(temp_config = {}): + """ + Initialize the model and start the server. + + This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using + the waitress `serve` method, allowing incoming HTTP requests to be handled. + + The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to + listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). + + Returns: + None + """ + print('initializing model...') + initialize_model() + + print("loading cache...") + if not os.path.isdir("cache"): os.mkdir("cache") + app.cache = AppCache("cache", "cache.json") + app.studentCache = AppCache("cache", "student_cache.json") + app.cache.load() + + print("loading dictionary...") + nltk.download("words") + app.english_words = set(w.lower() for w in nltk.corpus.words.words()) + #insert english words from words/en.txt + if not os.path.exists("words/en.txt"): + print("could not find English words, using WordNet only!") + else: + with open("words/en.txt") as words: + for word in words: + app.english_words.add(word[:-1]) + + print('retrieving server configuration...') + data = open('serve.json') + config = json.load(data) + + server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] + server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] + server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] + + print("loading word list...") + wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] + app.words = WordList(wordListPath) + app.words.load() + + print("Starting server...") + serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) + data.close() + +def dictionary_lookup(word): + #return true if the word exists in the dictionary (the nltk words corpus) + #or if the word is in the list of approved words + dictionaryType = "" + dictionary = word.lower() in app.english_words + acceptable = app.words.find(word) + digit = word.isnumeric() + if (dictionary): + dictionaryType = "DW" + elif (acceptable): + dictionaryType = "AW" + elif (digit): + dictionaryType = "DD" + else: + dictionaryType = "UC" + + return dictionaryType + +#TODO: this is not an intuitive way to save cache +@app.route('/') +def save(): + app.cache.save() + app.studentCache.save() + return "successfully 
saved cache" + +#TODO: use a query string instead for specifying student cache +@app.route('///') +def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: + #check if identifier name has already been used + cache = None; + + if (student == "student"): + cache = app.studentCache + else: + cache = app.cache + + if (identifier_name in cache.Cache.keys()): + cache.encounter(identifier_name) + return cache.Cache[identifier_name] + + """ + Process a web request to analyze an identifier within a specific context. + + This route function takes two URL parameters (identifier_name, and identifier_context) from an + incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. + It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. + + Args: + identifier_name (str): The name of the identifier to be analyzed. + identifier_context (str): The context in which the identifier appears. + + Returns: + List[dict]: A list of dictionaries containing words and their predicted POS tags. + """ + print(f"INPUT: {identifier_name} {identifier_context}") + + # Split identifier_name into words + words = ronin.split(identifier_name) + + # # Create initial data frame + data = pd.DataFrame({ + 'WORD': words, + 'SPLIT_IDENTIFIER': ' '.join(words), + 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number + }) + + # create response JSON + # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) + result = { + "words" : [] + } + + # Add features to the data + data = createFeatures( + data, + mutable_feature_list, + modelGensimEnglish=app.model_data.ModelGensimEnglish, + ) + + categorical_features = ['NLTK_POS'] + category_variables = [] + + for category_column in categorical_features: + if category_column in data.columns: + category_variables.append(category_column) + data.loc[:, category_column] = data[category_column].astype(str) + + for category_column in category_variables: + # Explicitly handle categorical conversion + unique_values = data[category_column].unique() + category_map = {} + for value in unique_values: + if value in universal_to_custom: + category_map[value] = custom_to_numeric[universal_to_custom[value]] + else: + category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories + + data.loc[:, category_column] = data[category_column].map(category_map) + + # Convert categorical variables to numeric + # Load and apply the classifier + clf = joblib.load(os.path.join(SCRIPT_DIR, 'output', 'model_GradientBoostingClassifier.pkl')) + predicted_tags = annotate_identifier(clf, data) + + # Combine words and their POS tags into a parseable format + #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] + + for i in range(len(words)): + #check dictionary + dictionary = "UC" #uncategorized + word = words[i] + dictionary = dictionary_lookup(word) + result["words"].append( + { + words[i] : { + "tag" : predicted_tags[i], + "dictionary" : dictionary + } + } + ) + + # append result to cache + cache.add(identifier_name, result) + + return result + +def context_to_number(context): + """ + Convert a textual context description to a numerical representation. + + This function takes a context description as a string and maps it to a numerical representation according to a + predefined mapping. + + Args: + context (str): The textual context description. 
+ + Returns: + int: The numerical representation of the context. + + Raises: + ValueError: If the provided context is not one of the predefined values. + + Example: + numeric_context = context_to_number("CLASS") + """ + if context == "ATTRIBUTE": + return 1 + elif context == "CLASS": + return 2 + elif context == "DECLARATION": + return 3 + elif context == "FUNCTION": + return 4 + elif context == "PARAMETER": + return 5 + +def annotate_identifier(clf, data): + """ + Annotate identifier tokens using a trained classifier. + + This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the + classifier to predict labels for the identifier tokens. + + Args: + clf (Classifier): The trained classifier model. + data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should + match the feature names used during training. + + Returns: + np.array: An array of predicted labels for the identifier tokens. + """ + # Drop unnecessary columns + data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') + + # Ensure only the features used during training are included + trained_features = clf.feature_names_in_ # Features expected by the classifier + missing_features = set(trained_features) - set(data.columns) + extra_features = set(data.columns) - set(trained_features) + + if missing_features: + raise ValueError(f"The following expected features are missing: {missing_features}") + if extra_features: + print(f"Warning: The following unused features are being ignored: {extra_features}") + data = data[trained_features] + + # Ensure feature order matches the trained model + df_features = data[trained_features] + + print("THESE") + print(df_features) + + print("THOSE") + print(clf.feature_names_in_) + + # Make predictions + y_pred = clf.predict(df_features) + return y_pred From d91ed0d654d07af3eff4809f9b881ee0bcdbf92d Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 16 Feb 2025 01:09:30 -0500 Subject: [PATCH 03/51] Change ports in the readme --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1686629..7a5cd64 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,7 @@ options: `./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: -http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context} - -**NOTE: ** On docker, the port is 8080 instead of 5000. +http://127.0.0.1:8080/{cache_selection}/{identifier_name}/{code_context} "cache selection" will save results to a separate cache if it is set to "student" @@ -69,11 +67,11 @@ http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context} For example: -Tag a declaration: ``http://127.0.0.1:5000/cache/numberArray/DECLARATION`` +Tag a declaration: ``http://127.0.0.1:8080/cache/numberArray/DECLARATION`` -Tag a function: ``http://127.0.0.1:5000/cache/GetNumberArray/FUNCTION`` +Tag a function: ``http://127.0.0.1:8080/cache/GetNumberArray/FUNCTION`` -Tag an class: ``http://127.0.0.1:5000/cache/PersonRecord/CLASS`` +Tag an class: ``http://127.0.0.1:8080/cache/PersonRecord/CLASS`` #### Note Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun. 
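A minimal Python client sketch for the route documented in the patch above, assuming the server is reachable on the Docker default of 127.0.0.1:8080 and that responses carry the `words` structure built by `listen()` in `tag_identifier.py`:

```python
import json
from urllib.request import urlopen

# Tag a function name; the path is {cache_selection}/{identifier_name}/{code_context}.
with urlopen("http://127.0.0.1:8080/cache/GetNumberArray/FUNCTION") as response:
    result = json.load(response)

# Each entry maps a word to its predicted POS tag and its dictionary category
# (e.g. "DW" for dictionary word, "UC" for uncategorized).
for entry in result["words"]:
    for word, info in entry.items():
        print(word, info["tag"], info["dictionary"])
```
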
From cfea42ddc88c0a4c8232ff84376a30f88d7e5c16 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 16 Feb 2025 01:10:29 -0500 Subject: [PATCH 04/51] Forgot Git uses master and not main. Updated .yml to master. --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e57f85f..1586a95 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,9 +2,9 @@ name: SCALAR Tagger CI on: push: - branches: [ main, develop ] + branches: [ master, develop ] pull_request: - branches: [ main, develop ] + branches: [ master, develop ] jobs: test-docker: @@ -112,4 +112,4 @@ jobs: uses: actions/cache@v3 with: path: ~/.cache/gensim-data/fasttext-wiki-news-subwords-300* - key: ${{ runner.os }}-fasttext-model \ No newline at end of file + key: ${{ runner.os }}-fasttext-model From d730df7773a0e7c5406fe74aa26be94b14c38cdd Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 16 Feb 2025 11:17:24 -0500 Subject: [PATCH 05/51] Change dockerhub link for now --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a5cd64..83ead68 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ There are two ways to run the tagger. This document describes both ways. ## Getting Started with Docker -To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `srcml/scanl_tagger:latest` +To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` Make sure you have Docker and Docker Compose installed: https://docs.docker.com/engine/install/ From 40a72da0557e87d2ad289f8f2606d3e47915f60a Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 16 Mar 2025 00:11:07 -0400 Subject: [PATCH 06/51] Rewrite AppCache to use sqlite --- requirements.txt | 1 + tag_identifier.py | 143 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 113 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index 74c39c3..f8846a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ scikit_learn==1.3.0 scipy==1.10.1 git+https://github.com/cnewman/spiral.git waitress==2.1.2 +sqlite3 diff --git a/tag_identifier.py b/tag_identifier.py index bb89017..bb6b778 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -8,6 +8,7 @@ from waitress import serve from spiral import ronin import json +import sqlite3 from create_models import createModel, stable_features, mutable_feature_list app = Flask(__name__) @@ -30,41 +31,119 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.wordCount = wordCount # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') +#TODO: rewrite to use an SQL lite database +# class AppCache: +# def __init__(self, Path, Filename) -> None: +# self.Cache = {} +# self.Path = Path +# self.Filename = Filename + +# def load(self): +# if not os.path.isdir(self.Path): +# raise Exception("Cannot load path: "+self.Path) +# else: +# if not os.path.isfile(self.Path+"/"+self.Filename): +# JSONcache = open(self.Path+"/"+self.Filename, 'w') +# json.dump({}, JSONcache) +# JSONcache.close() +# JSONcache = open(self.Path+"/"+self.Filename, 'r') +# self.Cache = json.load(JSONcache) +# JSONcache.close() + +# def add(self, identifier, result): +# info = result +# info.update({"firstEncounter": time.time()}) +# 
info.update({"lastEncounter": time.time()}) +# info.update({"count": 1}) +# info.update({"version": "SCANL 1.0"}) +# self.Cache.update({identifier : info}) + +# def encounter(self, identifier): +# self.Cache[identifier].update({"lastEncounter": time.time()}) +# self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) +# self.Cache[identifier].update({"version": "SCANL 1.0"}) + +# def save(self): +# JSONcache = open(self.Path+"/"+self.Filename, 'w') +# json.dump(self.Cache, JSONcache) +# JSONcache.close() + +#TODO: context should probably be considered when saving tagged names class AppCache: - def __init__(self, Path, Filename) -> None: - self.Cache = {} - self.Path = Path - self.Filename = Filename - - def load(self): - if not os.path.isdir(self.Path): - raise Exception("Cannot load path: "+self.Path) - else: - if not os.path.isfile(self.Path+"/"+self.Filename): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump({}, JSONcache) - JSONcache.close() - JSONcache = open(self.Path+"/"+self.Filename, 'r') - self.Cache = json.load(JSONcache) - JSONcache.close() + def __init__(self, Path) -> None: + self.Path = Path #path to an SQL lite database + + def load(self): + #create connection to database + conn = sqlite3.connect(self.Path) + #create the table of names if it doesn't exist + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS names ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + words TEXT, -- this is a JSON string + firstEncounter INTEGER, + lastEncounter INTEGER, + count INTEGER + ) + ''') + #close the database connection + conn.commit() + conn.close() def add(self, identifier, result): - info = result - info.update({"firstEncounter": time.time()}) - info.update({"lastEncounter": time.time()}) - info.update({"count": 1}) - info.update({"version": "SCANL 1.0"}) - self.Cache.update({identifier : info}) + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #add identifier to table + record = { + "name": identifier, + "words": json.dumps(result["words"]), + "firstEncounter": time.time(), + "lastEncounter": time.time(), + "count": 1 + } + cursor.execute(''' + INSERT INTO names (name, words, firstEncounter, lastEncounter, count) + VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) + ''', record) + #close the database connection + conn.commit() + conn.close() + + def retrieve(self, identifier): + #return a dictionary of the name, or false if not in database + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", identifier) + row = cursor.fetchone() + + if row: + return { + "name": row[0], + "words": json.loads(rows[1]), + "firstEncounter": row[2], + "lastEncounter": row[3], + "count": row[4] + } + else: + return False def encounter(self, identifier): - self.Cache[identifier].update({"lastEncounter": time.time()}) - self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) - self.Cache[identifier].update({"version": "SCANL 1.0"}) - - def save(self): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump(self.Cache, JSONcache) - JSONcache.close() + currentCount = self.retrieve()["count"] + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #update record + cursor.execute(''' + UPDATE names + SET lastEncounter = ?, count = ? + WHERE name = ? 
+ ''', time.time(), currentCount+1, identifier) + #close connection + conn.commit() + conn.close() class WordList: def __init__(self, Path): @@ -186,10 +265,12 @@ def save(): return "successfully saved cache" #TODO: use a query string instead for specifying student cache +#TODO: update to save data to SQL lite instead of updating a JSON +# responses should still be sent in the JSON format @app.route('///') def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used - cache = None; + cache = None if (student == "student"): cache = app.studentCache From fb2ab83bccfe3f614d0eb46ee8eeedfd95615fcc Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 17 Mar 2025 12:54:19 -0400 Subject: [PATCH 07/51] Switch to sqlite --- tag_identifier.py | 123 ++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 63 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index bb6b778..2cf5325 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -29,44 +29,41 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.ModelMethods = modelMethods self.ModelGensimEnglish = modelGensimEnglish self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') - -#TODO: rewrite to use an SQL lite database -# class AppCache: -# def __init__(self, Path, Filename) -> None: -# self.Cache = {} -# self.Path = Path -# self.Filename = Filename - -# def load(self): -# if not os.path.isdir(self.Path): -# raise Exception("Cannot load path: "+self.Path) -# else: -# if not os.path.isfile(self.Path+"/"+self.Filename): -# JSONcache = open(self.Path+"/"+self.Filename, 'w') -# json.dump({}, JSONcache) -# JSONcache.close() -# JSONcache = open(self.Path+"/"+self.Filename, 'r') -# self.Cache = json.load(JSONcache) -# JSONcache.close() - -# def add(self, identifier, result): -# info = result -# info.update({"firstEncounter": time.time()}) -# info.update({"lastEncounter": time.time()}) -# info.update({"count": 1}) -# info.update({"version": "SCANL 1.0"}) -# self.Cache.update({identifier : info}) - -# def encounter(self, identifier): -# self.Cache[identifier].update({"lastEncounter": time.time()}) -# self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) -# self.Cache[identifier].update({"version": "SCANL 1.0"}) - -# def save(self): -# JSONcache = open(self.Path+"/"+self.Filename, 'w') -# json.dump(self.Cache, JSONcache) -# JSONcache.close() + +class CacheIndex: + def __init__(self, Path) -> None: + self.Path = Path + #create a table that just has a single column of cache IDs + conn = sqlite3.connect(Path) + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS caches ( + cache_id TEXT NOT NULL + ) + ''') + conn.commit() + conn.close() + + def add(self, cache_id): + #add cache_id to the table + conn = sqlite3(self.Path) + cursor = conn.cursor() + cursor.execute(''' + INSERT INTO caches (cache_id) VALUES (?) + ''', cache_id) + conn.commit() + conn.close() + + def isCacheExistent(self, cache_id): + conn = sqlite3(self.Path) + cursor = conn.cursor() + cursor.execute(''' + SELECT cache_id FROM caches WHERE cache_id = ? 
+ ''') + row = cursor.fetchone() + if row: return True + else: return False + #TODO: context should probably be considered when saving tagged names class AppCache: @@ -118,6 +115,7 @@ def retrieve(self, identifier): cursor = conn.cursor() cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", identifier) row = cursor.fetchone() + conn.close() if row: return { @@ -205,11 +203,8 @@ def start_server(temp_config = {}): print('initializing model...') initialize_model() - print("loading cache...") - if not os.path.isdir("cache"): os.mkdir("cache") - app.cache = AppCache("cache", "cache.json") - app.studentCache = AppCache("cache", "student_cache.json") - app.cache.load() + print("setting up cache...") + app.caches = {} print("loading dictionary...") nltk.download("words") @@ -258,28 +253,30 @@ def dictionary_lookup(word): return dictionaryType #TODO: this is not an intuitive way to save cache -@app.route('/') -def save(): - app.cache.save() - app.studentCache.save() - return "successfully saved cache" - -#TODO: use a query string instead for specifying student cache -#TODO: update to save data to SQL lite instead of updating a JSON -# responses should still be sent in the JSON format -@app.route('///') -def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: +# @app.route('/') +# def save(): +# app.cache.save() +# app.studentCache.save() +# return "successfully saved cache" + +#TODO: caches should be saved in an SQL lite database +@app.route('///') +def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used cache = None - - if (student == "student"): - cache = app.studentCache - else: - cache = app.cache - - if (identifier_name in cache.Cache.keys()): - cache.encounter(identifier_name) - return cache.Cache[identifier_name] + + #find the existing cache in app.caches or create a new one if it doesn't exist + if cache_id in app.caches: + cache = app.caches[cache_id] + #check if the identifier name is in this cache and return it if so + data = cache.retrieve(identifier_name) + if data != False: + return data + else: + #create the cache and add it to the dictionary of caches + cache = AppCache("cache/"+cache_id+".db") + cache.load() + app.caches[cache_id] = cache """ Process a web request to analyze an identifier within a specific context. From 6c36a6a2005630cac73fc72ff707444aeb8a5d33 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Fri, 21 Mar 2025 18:24:30 -0400 Subject: [PATCH 08/51] Finish initial sqlite implementation --- tag_identifier.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index 2cf5325..6692356 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -59,7 +59,7 @@ def isCacheExistent(self, cache_id): cursor = conn.cursor() cursor.execute(''' SELECT cache_id FROM caches WHERE cache_id = ? 
- ''') + ''', cache_id) row = cursor.fetchone() if row: return True else: return False @@ -204,7 +204,7 @@ def start_server(temp_config = {}): initialize_model() print("setting up cache...") - app.caches = {} + app.cacheIndex = CacheIndex('index.db') print("loading dictionary...") nltk.download("words") @@ -252,23 +252,15 @@ def dictionary_lookup(word): return dictionaryType -#TODO: this is not an intuitive way to save cache -# @app.route('/') -# def save(): -# app.cache.save() -# app.studentCache.save() -# return "successfully saved cache" - -#TODO: caches should be saved in an SQL lite database +#caches should be saved in an SQL lite database @app.route('///') def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used cache = None - #find the existing cache in app.caches or create a new one if it doesn't exist - if cache_id in app.caches: - cache = app.caches[cache_id] + if app.cacheIndex.isCacheExistent(cache_id): #check if the identifier name is in this cache and return it if so + cache = AppCache("cache/"+cache_id+".db") data = cache.retrieve(identifier_name) if data != False: return data @@ -276,7 +268,7 @@ def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict #create the cache and add it to the dictionary of caches cache = AppCache("cache/"+cache_id+".db") cache.load() - app.caches[cache_id] = cache + app.cacheIndex.add(cache_id) """ Process a web request to analyze an identifier within a specific context. From 38cbb03651fac60019b44dd9ddc7a3bf5aeab4be Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Fri, 21 Mar 2025 19:44:39 -0400 Subject: [PATCH 09/51] Remove sqlite3 from requirements --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f8846a2..74c39c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,3 @@ scikit_learn==1.3.0 scipy==1.10.1 git+https://github.com/cnewman/spiral.git waitress==2.1.2 -sqlite3 From 4596af05c41bd39659d9b809cc8635a966cf1bac Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 23 Mar 2025 15:43:50 -0400 Subject: [PATCH 10/51] Fix bugs --- .gitignore | 2 +- requirements.txt | 1 + tag_identifier.py | 18 ++++++++++-------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index cb28750..700916a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ output/ __pycache__/ code2vec/ cache/ -input.txt \ No newline at end of file +input.txt diff --git a/requirements.txt b/requirements.txt index 74c39c3..4caddc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ scikit_learn==1.3.0 scipy==1.10.1 git+https://github.com/cnewman/spiral.git waitress==2.1.2 +protobuf==3.20.3 diff --git a/tag_identifier.py b/tag_identifier.py index 6692356..7db152b 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -46,20 +46,21 @@ def __init__(self, Path) -> None: def add(self, cache_id): #add cache_id to the table - conn = sqlite3(self.Path) + conn = sqlite3.connect(self.Path) cursor = conn.cursor() + #cache_id needs to be by itself in a tuple for some reason? otherwise sqlite freaks out idk cursor.execute(''' INSERT INTO caches (cache_id) VALUES (?) - ''', cache_id) + ''', (cache_id,)) conn.commit() conn.close() def isCacheExistent(self, cache_id): - conn = sqlite3(self.Path) + conn = sqlite3.connect(self.Path) cursor = conn.cursor() cursor.execute(''' SELECT cache_id FROM caches WHERE cache_id = ? 
- ''', cache_id) + ''', (cache_id,)) row = cursor.fetchone() if row: return True else: return False @@ -113,14 +114,14 @@ def retrieve(self, identifier): #return a dictionary of the name, or false if not in database conn = sqlite3.connect(self.Path) cursor = conn.cursor() - cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", identifier) + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) row = cursor.fetchone() conn.close() if row: return { "name": row[0], - "words": json.loads(rows[1]), + "words": json.loads(row[1]), "firstEncounter": row[2], "lastEncounter": row[3], "count": row[4] @@ -204,7 +205,8 @@ def start_server(temp_config = {}): initialize_model() print("setting up cache...") - app.cacheIndex = CacheIndex('index.db') + if not os.path.exists('cache'): os.mkdir('cache') + app.cacheIndex = CacheIndex('cache/index.db') print("loading dictionary...") nltk.download("words") @@ -254,7 +256,7 @@ def dictionary_lookup(word): #caches should be saved in an SQL lite database @app.route('///') -def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict]: +def listen(cache_id: str, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used cache = None #find the existing cache in app.caches or create a new one if it doesn't exist From 455d5b57e8f9b2c3edbabd4d4e56794ec1db15be Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 23 Mar 2025 15:48:14 -0400 Subject: [PATCH 11/51] Add restart always to compose.yml --- compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/compose.yml b/compose.yml index 63cd5f1..30a003a 100644 --- a/compose.yml +++ b/compose.yml @@ -20,3 +20,4 @@ services: - words:/words ports: - "${PORT-8080}:5000" + restart: always From bec20b9675d8c6d4704a2815c23e7836ac6cfad0 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 23 Mar 2025 19:47:41 -0400 Subject: [PATCH 12/51] Attempt at optional cache, broke everything --- tag_identifier.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index 7db152b..570e57d 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -9,7 +9,7 @@ from spiral import ronin import json import sqlite3 -from create_models import createModel, stable_features, mutable_feature_list +from create_models import createModel, mutable_feature_list app = Flask(__name__) @@ -255,22 +255,24 @@ def dictionary_lookup(word): return dictionaryType #caches should be saved in an SQL lite database -@app.route('///') -def listen(cache_id: str, identifier_name: str, identifier_context: str) -> List[dict]: +@app.route('//') +@app.route('///') +def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> List[dict]: #check if identifier name has already been used cache = None #find the existing cache in app.caches or create a new one if it doesn't exist - if app.cacheIndex.isCacheExistent(cache_id): - #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db") - data = cache.retrieve(identifier_name) - if data != False: - return data - else: - #create the cache and add it to the dictionary of caches - cache = AppCache("cache/"+cache_id+".db") - cache.load() - app.cacheIndex.add(cache_id) + if cache_id != None: + if app.cacheIndex.isCacheExistent(cache_id): + #check if the identifier name is in this cache 
and return it if so + cache = AppCache("cache/"+cache_id+".db") + data = cache.retrieve(identifier_name) + if data != False: + return data + else: + #create the cache and add it to the dictionary of caches + cache = AppCache("cache/"+cache_id+".db") + cache.load() + app.cacheIndex.add(cache_id) """ Process a web request to analyze an identifier within a specific context. @@ -354,7 +356,8 @@ def listen(cache_id: str, identifier_name: str, identifier_context: str) -> List ) # append result to cache - cache.add(identifier_name, result) + if cache_id != None: + cache.add(identifier_name, result) return result From f982cf42f8555a30ea8cec2e26f70df860d23bb6 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Wed, 26 Mar 2025 06:45:53 -0400 Subject: [PATCH 13/51] Fix count --- tag_identifier.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index 570e57d..897d09a 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -130,7 +130,7 @@ def retrieve(self, identifier): return False def encounter(self, identifier): - currentCount = self.retrieve()["count"] + currentCount = self.retrieve(identifier)["count"] #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() @@ -139,7 +139,7 @@ def encounter(self, identifier): UPDATE names SET lastEncounter = ?, count = ? WHERE name = ? - ''', time.time(), currentCount+1, identifier) + ''', (time.time(), currentCount+1, identifier)) #close connection conn.commit() conn.close() @@ -265,6 +265,7 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) if app.cacheIndex.isCacheExistent(cache_id): #check if the identifier name is in this cache and return it if so cache = AppCache("cache/"+cache_id+".db") + cache.encounter(identifier_name) data = cache.retrieve(identifier_name) if data != False: return data From 9e62aa7c37ab16e6de19b86689a40861d435d343 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Wed, 26 Mar 2025 10:23:56 -0400 Subject: [PATCH 14/51] Fix encounter --- tag_identifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tag_identifier.py b/tag_identifier.py index 897d09a..c60adf2 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -265,9 +265,9 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) if app.cacheIndex.isCacheExistent(cache_id): #check if the identifier name is in this cache and return it if so cache = AppCache("cache/"+cache_id+".db") - cache.encounter(identifier_name) data = cache.retrieve(identifier_name) if data != False: + cache.encounter(identifier_name) return data else: #create the cache and add it to the dictionary of caches From bf8d0e500283a629f9ae4ac2f33551592f5c0552 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 29 Mar 2025 13:16:48 -0400 Subject: [PATCH 15/51] Remove use of CacheIndex, add probe route --- tag_identifier.py | 56 +++++++++++------------------------------------ 1 file changed, 13 insertions(+), 43 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index c60adf2..2f94e3c 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -30,42 +30,6 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.ModelGensimEnglish = modelGensimEnglish self.wordCount = wordCount -class CacheIndex: - def __init__(self, Path) -> None: - self.Path = Path - #create a table that just has a single column of cache IDs - conn = sqlite3.connect(Path) - cursor = conn.cursor() - cursor.execute(''' - CREATE TABLE IF NOT 
EXISTS caches ( - cache_id TEXT NOT NULL - ) - ''') - conn.commit() - conn.close() - - def add(self, cache_id): - #add cache_id to the table - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - #cache_id needs to be by itself in a tuple for some reason? otherwise sqlite freaks out idk - cursor.execute(''' - INSERT INTO caches (cache_id) VALUES (?) - ''', (cache_id,)) - conn.commit() - conn.close() - - def isCacheExistent(self, cache_id): - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - cursor.execute(''' - SELECT cache_id FROM caches WHERE cache_id = ? - ''', (cache_id,)) - row = cursor.fetchone() - if row: return True - else: return False - - #TODO: context should probably be considered when saving tagged names class AppCache: def __init__(self, Path) -> None: @@ -206,7 +170,6 @@ def start_server(temp_config = {}): print("setting up cache...") if not os.path.exists('cache'): os.mkdir('cache') - app.cacheIndex = CacheIndex('cache/index.db') print("loading dictionary...") nltk.download("words") @@ -254,7 +217,15 @@ def dictionary_lookup(word): return dictionaryType -#caches should be saved in an SQL lite database +#route to check for and create a database if it does not exist already +@app.route('/probe/') +def probe(cache_id: str): + if os.path.exists("cache/"+cache_id+".db3"): + return "Opening existing identifier database..." + else: + return "First request will create identifier database: "+cache_id+"..." + +#route to tag an identifier name @app.route('//') @app.route('///') def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> List[dict]: @@ -262,18 +233,17 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) cache = None #find the existing cache in app.caches or create a new one if it doesn't exist if cache_id != None: - if app.cacheIndex.isCacheExistent(cache_id): + if os.path.exists("cache/"+cache_id+".db3"): #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db") + cache = AppCache("cache/"+cache_id+".db3") data = cache.retrieve(identifier_name) if data != False: cache.encounter(identifier_name) return data else: - #create the cache and add it to the dictionary of caches - cache = AppCache("cache/"+cache_id+".db") + #create the cache + cache = AppCache("cache/"+cache_id+".db3") cache.load() - app.cacheIndex.add(cache_id) """ Process a web request to analyze an identifier within a specific context. From 0e5df4ef54bc89caf628a68fd1935e9ed70eb006 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 21 Apr 2025 13:20:44 -0400 Subject: [PATCH 16/51] Update documentation --- Dockerfile | 4 ++-- README.md | 12 +++++++----- serve.json | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index b9d7ed1..3c31e07 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,8 @@ FROM python:3.10-slim # Install (and build) requirements COPY requirements.txt /requirements.txt -RUN apt-get update && \ - apt-get install -y git curl && \ +RUN apt-get update --fix-missing && \ + apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ rm -rf /var/lib/apt/lists/* diff --git a/README.md b/README.md index 859dd10..f7f235b 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ Conosider configuring `PYTHONPATH` as well: export PYTHONPATH=~/path/to/scanl_tagger -Finally, you need to install Spiral, which we use for identifier splitting. 
The current version of Spiral on the official repo has a [problem](https://github.com/casics/spiral/issues/4), so consider installing the one from the link below: - - sudo pip3 install git+https://github.com/cnewman/spiral.git +Install dependencies by running `pip3 install -r requirements.txt` in the root of the repository. Finally, we require the `token` and `target` vectors from [code2vec](https://github.com/tech-srl/code2vec). The tagger will attempt to automatically download them if it doesn't find them, but you could download them yourself if you like. It will place them in your local directory under `./code2vec/*` @@ -50,9 +48,13 @@ options: `./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: -http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context} +http://127.0.0.1:5000/{identifier_name}/{code_context}/{database_name (optional)} + +"database name" specifies an sqlite database to be used for result caching and data collection. If the database specified does not exist, one will be created. + +You can check wehther or not a database exists by using the `/probe` route by sending an HTTP request like this: -"cache selection" will save results to a separate cache if it is set to "student" +http://127.0.0.1:5000/probe/{database_name} "code context" is one of: - FUNCTION diff --git a/serve.json b/serve.json index 84e15c0..261db0b 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", "port": 5000, - "protocol": "https", + "protocol": "http", "words":"" -} \ No newline at end of file +} From 11e45a78d7dc7019e6414c22ff71e4286990c720 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 23 Apr 2025 21:03:04 -0400 Subject: [PATCH 17/51] Create LICENSE --- LICENSE | 674 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 674 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. 
Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. 
In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. From 041c103064e4bf6738461b2a3fd37c3f0b439848 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 23 Apr 2025 21:03:50 -0400 Subject: [PATCH 18/51] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 83ead68..f7794ce 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,12 @@ There are two ways to run the tagger. This document describes both ways. 
## Getting Started with Docker -To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` +To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` Make sure you have Docker and Docker Compose installed: + https://docs.docker.com/engine/install/ + https://docs.docker.com/compose/install/ ``` From 23d28e67f053f062f0f08c37316155d37790f82e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 27 Apr 2025 22:58:03 -0400 Subject: [PATCH 19/51] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index f7794ce..9c8ee5e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,14 @@ # SCALAR Part-of-speech tagger This the official release of the SCALAR Part-of-speech tagger +# Current Metrics (this will be updated every time we update/change the model!) +| | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | +|------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| +| **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | +| Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | +| Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | + + There are two ways to run the tagger. This document describes both ways. 1. Using Docker compose (which runs the tagger's built-in server for you) From 5a398c54b19e98cd8d0a8bb75b4a5ac1199787e0 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 27 Apr 2025 22:59:11 -0400 Subject: [PATCH 20/51] Update README.md --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9c8ee5e..99dcb9d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # SCALAR Part-of-speech tagger This the official release of the SCALAR Part-of-speech tagger +There are two ways to run the tagger. This document describes both ways. + +1. Using Docker compose (which runs the tagger's built-in server for you) +2. Running the tagger's built-in server without Docker + # Current Metrics (this will be updated every time we update/change the model!) | | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | |------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| @@ -8,12 +13,6 @@ This the official release of the SCALAR Part-of-speech tagger | Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | | Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | - -There are two ways to run the tagger. This document describes both ways. - -1. Using Docker compose (which runs the tagger's built-in server for you) -2. 
Running the tagger's built-in server without Docker - ## Getting Started with Docker To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` From 76933c7a64457d5db60d50d22e899d7d0ea64ecb Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 27 Apr 2025 22:59:48 -0400 Subject: [PATCH 21/51] Update header level --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 99dcb9d..c384bae 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ There are two ways to run the tagger. This document describes both ways. 1. Using Docker compose (which runs the tagger's built-in server for you) 2. Running the tagger's built-in server without Docker -# Current Metrics (this will be updated every time we update/change the model!) +## Current Metrics (this will be updated every time we update/change the model!) | | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | |------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| | **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | From 3fce0f676c581313d8b13f674922b30a933a4bb6 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 3 May 2025 17:20:26 -0400 Subject: [PATCH 22/51] Resolve merge conflicts --- README.md | 95 +++++++++++--- requirements.txt | 118 +++++++++++++++--- serve.json | 4 +- .../classifier_multiclass.py | 0 create_models.py => src/create_models.py | 0 .../download_code2vec_vectors.py | 0 .../feature_generator.py | 0 tag_identifier.py => src/tag_identifier.py | 0 8 files changed, 183 insertions(+), 34 deletions(-) rename classifier_multiclass.py => src/classifier_multiclass.py (100%) rename create_models.py => src/create_models.py (100%) rename download_code2vec_vectors.py => src/download_code2vec_vectors.py (100%) rename feature_generator.py => src/feature_generator.py (100%) rename tag_identifier.py => src/tag_identifier.py (100%) diff --git a/README.md b/README.md index f7f235b..79c0453 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,45 @@ # SCALAR Part-of-speech tagger This the official release of the SCALAR Part-of-speech tagger -**NOTE** -There is a fork of SCALAR which was designed to handle parallel http requests and cache SCALAR's output to increase its speed. You can find this version here: https://github.com/brandonscholten/scanl_tagger. These will be combined into a single application in the *very* near future. +There are two ways to run the tagger. This document describes both ways. + +1. Using Docker compose (which runs the tagger's built-in server for you) +2. Running the tagger's built-in server without Docker + +## Current Metrics (this will be updated every time we update/change the model!) 
+| | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | +|------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| +| **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | +| Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | +| Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | ## Getting Started with Docker -To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `srcml/scanl_tagger:latest` +To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` + +Make sure you have Docker and Docker Compose installed: + +https://docs.docker.com/engine/install/ + +https://docs.docker.com/compose/install/ ``` -git clone https://github.com/brandonscholten/scanl_tagger.git +git clone git@github.com:SCANL/scanl_tagger.git cd scanl_tagger docker compose pull docker compose up ``` -## Setup and Run -You will need `python3.10` installed. +## Getting Started without Docker +You will need `python3.12` installed. -You'll need to install `pip3` +You'll need to install `pip` -- https://pip.pypa.io/en/stable/installation/ -Conosider configuring `PYTHONPATH` as well: +Set up a virtual environtment: `python -m venv /tmp/tagger` -- feel free to put it somewhere else (change /tmp/tagger) if you prefer - export PYTHONPATH=~/path/to/scanl_tagger +Activate the virtual environment: `source /tmp/tagger/bin/activate` (you can find how to activate it here if `source` does not work for you -- https://docs.python.org/3/library/venv.html#how-venvs-work) -Install dependencies by running `pip3 install -r requirements.txt` in the root of the repository. +After it's installed and your virtual environment is activated, in the root of the repo, run `pip install -r requirements.txt` Finally, we require the `token` and `target` vectors from [code2vec](https://github.com/tech-srl/code2vec). The tagger will attempt to automatically download them if it doesn't find them, but you could download them yourself if you like. It will place them in your local directory under `./code2vec/*` @@ -48,7 +63,7 @@ options: `./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: -http://127.0.0.1:5000/{identifier_name}/{code_context}/{database_name (optional)} +http://127.0.0.1:8080/{identifier_name}/{code_context}/{database_name (optional)} "database name" specifies an sqlite database to be used for result caching and data collection. If the database specified does not exist, one will be created. @@ -65,26 +80,73 @@ http://127.0.0.1:5000/probe/{database_name} For example: -Tag a declaration: ``http://127.0.0.1:5000/cache/numberArray/DECLARATION`` +Tag a declaration: ``http://127.0.0.1:8000/cache/numberArray/DECLARATION`` -Tag a function: ``http://127.0.0.1:5000/cache/GetNumberArray/FUNCTION`` +Tag a function: ``http://127.0.0.1:8000/cache/GetNumberArray/FUNCTION`` -Tag an class: ``http://127.0.0.1:5000/cache/PersonRecord/CLASS`` +Tag an class: ``http://127.0.0.1:8000/cache/PersonRecord/CLASS`` #### Note Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun. 
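
As a rough illustration of what the splitter does (a minimal sketch, assuming the Spiral fork listed in `requirements.txt` is installed; exact splits depend on ronin's internal frequency tables), the tagger relies on `ronin.split` to break identifiers into words before tagging:

```
from spiral import ronin

print(ronin.split("GetNumberArray"))  # e.g. ['Get', 'Number', 'Array']
print(ronin.split("employee_name"))   # e.g. ['employee', 'name']
```

Kebab-case identifiers (e.g. `number-array`) are the case the note above warns about: they are not split into separate words and come back as a single entry.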
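
If you want to script requests against a running server, a minimal client sketch is below. It assumes the server is reachable on the address and port configured in `serve.json` (8080 by default in this repository) and uses the `requests` package from `requirements.txt`; the function and variable names here are illustrative, not part of the tagger itself.

```
import requests

BASE_URL = "http://127.0.0.1:8080"  # match the address/port in serve.json

def tag_identifier(name, context, database=None):
    """Ask the running tagger to tag one identifier in a given code context."""
    url = f"{BASE_URL}/{name}/{context}"
    if database is not None:
        # optional third segment: an sqlite database used for caching/data collection
        url = f"{url}/{database}"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()

print(tag_identifier("GetNumberArray", "FUNCTION"))
```

You can check for an existing database in the same way before reusing a name, e.g. `requests.get(f"{BASE_URL}/probe/mydb")`, where `mydb` stands in for whatever database name you plan to use.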
You will need to have a way to parse code and filter out identifier names if you want to do some on-the-fly analysis of source code. We recommend [srcML](https://www.srcml.org/). Since the actual tagger is a web server, you don't have to use srcML. You could always use other AST-based code representations, or any other method of obtaining identifier information. + +## Tagset + +**Supported Tagset** +| Abbreviation | Expanded Form | Examples | +|:------------:|:--------------------------------------------:|:--------------------------------------------:| +| N | noun | Disneyland, shoe, faucet, mother | +| DT | determiner | the, this, that, these, those, which | +| CJ | conjunction | and, for, nor, but, or, yet, so | +| P | preposition | behind, in front of, at, under, above | +| NPL | noun plural | Streets, cities, cars, people, lists | +| NM | noun modifier (**noun-adjunct**, adjective) | red, cold, hot, **bit**Set, **employee**Name | +| V | verb | Run, jump, spin, | +| VM | verb modifier (adverb) | Very, loudly, seriously, impatiently | +| D | digit | 1, 2, 10, 4.12, 0xAF | +| PRE | preamble | Gimp, GLEW, GL, G, p, m, b | + +**Penn Treebank to SCALAR tagset** + +| Penn Treebank Annotation | SCALAR Tagset | +|:---------------------------:|:------------------------:| +| Conjunction (CC) | Conjunction (CJ) | +| Digit (CD) | Digit (D) | +| Determiner (DT) | Determiner (DT) | +| Foreign Word (FW) | Noun (N) | +| Preposition (IN) | Preposition (P) | +| Adjective (JJ) | Noun Modifier (NM) | +| Comparative Adjective (JJR) | Noun Modifier (NM) | +| Superlative Adjective (JJS) | Noun Modifier (NM) | +| List Item (LS) | Noun (N) | +| Modal (MD) | Verb (V) | +| Noun Singular (NN) | Noun (N) | +| Proper Noun (NNP) | Noun (N) | +| Proper Noun Plural (NNPS) | Noun Plural (NPL) | +| Noun Plural (NNS) | Noun Plural (NPL) | +| Adverb (RB) | Verb Modifier (VM) | +| Comparative Adverb (RBR) | Verb Modifier (VM) | +| Particle (RP) | Verb Modifier (VM) | +| Symbol (SYM) | Noun (N) | +| To Preposition (TO) | Preposition (P) | +| Verb (VB) | Verb (V) | +| Verb (VBD) | Verb (V) | +| Verb (VBG) | Verb (V) | +| Verb (VBN) | Verb (V) | +| Verb (VBP) | Verb (V) | +| Verb (VBZ) | Verb (V) | + ## Training the tagger You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This will potentially change in the future. ## Errors? Please make an issue if you run into errors -# Please Cite the Paper! +# Please Cite the Paper(s)! -No paper for now however the current tagger is based on our previous, so you could cite the previous one for now: +Newman, Christian, Scholten , Brandon, Testa, Sophia, Behler, Joshua, Banabilah, Syreen, Collard, Michael L., Decker, Michael, Mkaouer, Mohamed Wiem, Zampieri, Marcos, Alomar, Eman Abdullah, Alsuhaibani, Reem, Peruma, Anthony, Maletic, Jonathan I., (2025), “SCALAR: A Part-of-speech Tagger for Identifiers”, in the Proceedings of the 33rd IEEE/ACM International Conference on Program Comprehension - Tool Demonstrations Track (ICPC), Ottawa, ON, Canada, April 27 -28, 5 pages TO APPEAR. Christian D. Newman, Michael J. Decker, Reem S. AlSuhaibani, Anthony Peruma, Satyajit Mohapatra, Tejal Vishnoi, Marcos Zampieri, Mohamed W. Mkaouer, Timothy J. 
Sheldon, and Emily Hill, "An Ensemble Approach for Annotating Source Code Identifiers with Part-of-speech Tags," in IEEE Transactions on Software Engineering, doi: 10.1109/TSE.2021.3098242. @@ -98,4 +160,3 @@ Find our other research [at our webpage](https://www.scanl.org/) and check out t This project uses WordNet to perform a dictionary lookup on the individual words in each identifier: Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010 - diff --git a/requirements.txt b/requirements.txt index 4caddc9..51e31b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,104 @@ -utils==1.0.1 -flair==0.14.0 +accelerate==1.3.0 +attrs==25.1.0 +beautifulsoup4==4.12.3 +bioc==2.1 +blinker==1.9.0 +boto3==1.36.6 +botocore==1.36.6 +certifi==2024.12.14 +charset-normalizer==3.4.1 +click==8.1.8 +conllu==4.5.3 +contourpy==1.3.1 +cycler==0.12.1 +Deprecated==1.2.17 +docopt==0.6.2 +filelock==3.17.0 +flair==0.15.0 Flask==3.1.0 -gensim==4.3.1 -imbalanced_learn==0.12.2 -imblearn==0.0 -joblib==1.3.1 -nltk==3.8.1 -numpy==1.25.1 -pandas==2.0.3 -Requests==2.32.3 -scikit_learn==1.3.0 -scipy==1.10.1 -git+https://github.com/cnewman/spiral.git -waitress==2.1.2 -protobuf==3.20.3 +fonttools==4.55.6 +fsspec==2024.12.0 +ftfy==6.3.1 +gdown==5.2.0 +gensim==4.3.3 +huggingface-hub==0.27.1 +humanize==4.11.0 +idna==3.10 +iniconfig==2.0.0 +intervaltree==3.1.0 +itsdangerous==2.2.0 +Jinja2==3.1.5 +jmespath==1.0.1 +joblib==1.4.2 +jsonlines==4.0.0 +kiwisolver==1.4.8 +langdetect==1.0.9 +lxml==5.3.0 +MarkupSafe==3.0.2 +matplotlib==3.10.0 +more-itertools==10.6.0 +mpld3==0.5.10 +mpmath==1.3.0 +networkx==3.4.2 +nltk==3.9.1 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +packaging==24.2 +pandas==2.2.3 +pillow==11.1.0 +plac==1.4.3 +pluggy==1.5.0 +pptree==3.1 +protobuf==5.29.3 +psutil==6.1.1 +pyparsing==3.2.1 +PySocks==1.7.1 +pytest==8.3.4 +python-dateutil==2.9.0.post0 +pytorch_revgrad==0.2.0 +pytz==2024.2 +PyYAML==6.0.2 +regex==2024.11.6 +requests==2.32.3 +s3transfer==0.11.2 +safetensors==0.5.2 +scikit-learn==1.6.1 +scipy==1.13.1 +segtok==1.5.11 +sentencepiece==0.2.0 +setuptools==75.8.0 +six==1.17.0 +smart-open==7.1.0 +sortedcontainers==2.4.0 +soupsieve==2.6 +spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee +sqlitedict==2.1.0 +sympy==1.13.1 +tabulate==0.9.0 +termcolor==2.5.0 +threadpoolctl==3.5.0 +tokenizers==0.21.0 +torch==2.5.1 +tqdm==4.67.1 +transformer-smaller-training-vocab==0.4.0 +transformers==4.48.1 +triton==3.1.0 +typing_extensions==4.12.2 +tzdata==2025.1 +urllib3==2.3.0 +waitress==3.0.2 +wcwidth==0.2.13 +Werkzeug==3.1.3 +Wikipedia-API==0.8.1 +wrapt==1.17.2 \ No newline at end of file diff --git a/serve.json b/serve.json index 261db0b..3eeb486 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", - "port": 5000, - "protocol": "http", + "port": 8080, + "protocol": "https", "words":"" } diff --git a/classifier_multiclass.py b/src/classifier_multiclass.py similarity index 100% rename from classifier_multiclass.py rename to src/classifier_multiclass.py diff --git a/create_models.py b/src/create_models.py similarity index 100% rename from create_models.py rename to 
src/create_models.py diff --git a/download_code2vec_vectors.py b/src/download_code2vec_vectors.py similarity index 100% rename from download_code2vec_vectors.py rename to src/download_code2vec_vectors.py diff --git a/feature_generator.py b/src/feature_generator.py similarity index 100% rename from feature_generator.py rename to src/feature_generator.py diff --git a/tag_identifier.py b/src/tag_identifier.py similarity index 100% rename from tag_identifier.py rename to src/tag_identifier.py From 81d140a15308689db1e3946d7d61971bea18767c Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 3 May 2025 19:28:02 -0400 Subject: [PATCH 23/51] Resolve merge conflicts --- src/tag_identifier.py | 1158 ++++++++++++++--------------------------- 1 file changed, 401 insertions(+), 757 deletions(-) diff --git a/src/tag_identifier.py b/src/tag_identifier.py index f690d8f..0af80c1 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -1,757 +1,401 @@ -#original ====================================================================== -import os -import time -import joblib -import nltk -import pandas as pd -from feature_generator import * -from flask import Flask -from waitress import serve -from spiral import ronin -import json -import sqlite3 -from create_models import createModel, mutable_feature_list - -app = Flask(__name__) - -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -class ModelData: - def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: - """ - Initialize an instance of the ModelData class with word vector models. - - Args: - ModelTokens: Word vectors model for tokens. - ModelMethods: Word vectors model for methods. - ModelGensimEnglish: Word vectors model for general English words. - """ - - self.ModelTokens = modelTokens - self.ModelMethods = modelMethods - self.ModelGensimEnglish = modelGensimEnglish - self.wordCount = wordCount - -#TODO: context should probably be considered when saving tagged names -class AppCache: - def __init__(self, Path) -> None: - self.Path = Path #path to an SQL lite database - - def load(self): - #create connection to database - conn = sqlite3.connect(self.Path) - #create the table of names if it doesn't exist - cursor = conn.cursor() - cursor.execute(''' - CREATE TABLE IF NOT EXISTS names ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT NOT NULL, - words TEXT, -- this is a JSON string - firstEncounter INTEGER, - lastEncounter INTEGER, - count INTEGER - ) - ''') - #close the database connection - conn.commit() - conn.close() - - def add(self, identifier, result): - #connection setup - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - #add identifier to table - record = { - "name": identifier, - "words": json.dumps(result["words"]), - "firstEncounter": time.time(), - "lastEncounter": time.time(), - "count": 1 - } - cursor.execute(''' - INSERT INTO names (name, words, firstEncounter, lastEncounter, count) - VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) - ''', record) - #close the database connection - conn.commit() - conn.close() - - def retrieve(self, identifier): - #return a dictionary of the name, or false if not in database - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) - row = cursor.fetchone() - conn.close() - - if row: - return { - "name": row[0], - "words": json.loads(row[1]), - "firstEncounter": row[2], - "lastEncounter": row[3], - "count": 
row[4] - } - else: - return False - - def encounter(self, identifier): - currentCount = self.retrieve(identifier)["count"] - #connection setup - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - #update record - cursor.execute(''' - UPDATE names - SET lastEncounter = ?, count = ? - WHERE name = ? - ''', (time.time(), currentCount+1, identifier)) - #close connection - conn.commit() - conn.close() - -class WordList: - def __init__(self, Path): - self.Words = set() - self.Path = Path - - def load(self): - if not os.path.isfile(self.Path): - print("Could not find word list file!") - return - with open(self.Path) as file: - for line in file: - self.Words.add(line[:line.find(',')]) #stop at comma - - def find(self, item): - return item in self.Words - -def initialize_model(): - """ - Initialize and load word vectors for the application, and load a word count DataFrame. - - This function initializes and loads word vectors using the 'createModel' function, and loads word counts - from a JSON file into a Pandas DataFrame for use in the application. - - Returns: - tuple: (ModelData, WORD_COUNT DataFrame) - """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) - -def start_server(temp_config = {}): - """ - Initialize the model and start the server. - - This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using - the waitress `serve` method, allowing incoming HTTP requests to be handled. - - The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to - listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). 
- - Returns: - None - """ - print('initializing model...') - initialize_model() - - print("setting up cache...") - if not os.path.exists('cache'): os.mkdir('cache') - - print("loading dictionary...") - nltk.download("words") - app.english_words = set(w.lower() for w in nltk.corpus.words.words()) - #insert english words from words/en.txt - if not os.path.exists("words/en.txt"): - print("could not find English words, using WordNet only!") - else: - with open("words/en.txt") as words: - for word in words: - app.english_words.add(word[:-1]) - - print('retrieving server configuration...') - data = open('serve.json') - config = json.load(data) - - server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] - server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] - server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] - - print("loading word list...") - wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] - app.words = WordList(wordListPath) - app.words.load() - - print("Starting server...") - serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) - data.close() - -def dictionary_lookup(word): - #return true if the word exists in the dictionary (the nltk words corpus) - #or if the word is in the list of approved words - dictionaryType = "" - dictionary = word.lower() in app.english_words - acceptable = app.words.find(word) - digit = word.isnumeric() - if (dictionary): - dictionaryType = "DW" - elif (acceptable): - dictionaryType = "AW" - elif (digit): - dictionaryType = "DD" - else: - dictionaryType = "UC" - - return dictionaryType - -#route to check for and create a database if it does not exist already -@app.route('/probe/') -def probe(cache_id: str): - if os.path.exists("cache/"+cache_id+".db3"): - return "Opening existing identifier database..." - else: - return "First request will create identifier database: "+cache_id+"..." - -#route to tag an identifier name -@app.route('//') -@app.route('///') -def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> List[dict]: - #check if identifier name has already been used - cache = None - #find the existing cache in app.caches or create a new one if it doesn't exist - if cache_id != None: - if os.path.exists("cache/"+cache_id+".db3"): - #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db3") - data = cache.retrieve(identifier_name) - if data != False: - cache.encounter(identifier_name) - return data - else: - #create the cache - cache = AppCache("cache/"+cache_id+".db3") - cache.load() - - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. 
- """ - print(f"INPUT: {identifier_name} {identifier_context}") - - # Split identifier_name into words - words = ronin.split(identifier_name) - - # # Create initial data frame - data = pd.DataFrame({ - 'WORD': words, - 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number - }) - - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data - data = createFeatures( - data, - mutable_feature_list, - modelGensimEnglish=app.model_data.ModelGensimEnglish, - ) - - categorical_features = ['NLTK_POS'] - category_variables = [] - - for category_column in categorical_features: - if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier - clf = joblib.load(os.path.join(SCRIPT_DIR, 'output', 'model_GradientBoostingClassifier.pkl')) - predicted_tags = annotate_identifier(clf, data) - - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] - dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) - - # append result to cache - if cache_id != None: - cache.add(identifier_name, result) - - return result - -def context_to_number(context): - """ - Convert a textual context description to a numerical representation. - - This function takes a context description as a string and maps it to a numerical representation according to a - predefined mapping. - - Args: - context (str): The textual context description. - - Returns: - int: The numerical representation of the context. - - Raises: - ValueError: If the provided context is not one of the predefined values. - - Example: - numeric_context = context_to_number("CLASS") - """ - if context == "ATTRIBUTE": - return 1 - elif context == "CLASS": - return 2 - elif context == "DECLARATION": - return 3 - elif context == "FUNCTION": - return 4 - elif context == "PARAMETER": - return 5 - -def annotate_identifier(clf, data): - """ - Annotate identifier tokens using a trained classifier. - - This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the - classifier to predict labels for the identifier tokens. - - Args: - clf (Classifier): The trained classifier model. - data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should - match the feature names used during training. - - Returns: - np.array: An array of predicted labels for the identifier tokens. 
- """ - # Drop unnecessary columns - data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') - - # Ensure only the features used during training are included - trained_features = clf.feature_names_in_ # Features expected by the classifier - missing_features = set(trained_features) - set(data.columns) - extra_features = set(data.columns) - set(trained_features) - - if missing_features: - raise ValueError(f"The following expected features are missing: {missing_features}") - if extra_features: - print(f"Warning: The following unused features are being ignored: {extra_features}") - data = data[trained_features] - - # Ensure feature order matches the trained model - df_features = data[trained_features] - - print("THESE") - print(df_features) - - print("THOSE") - print(clf.feature_names_in_) - - # Make predictions - y_pred = clf.predict(df_features) - return y_pred -#new ========================================================================================== -import os -import time -import joblib -import nltk -import pandas as pd -from src.feature_generator import createFeatures, universal_to_custom, custom_to_numeric -from flask import Flask -from waitress import serve -from spiral import ronin -import json -from src.create_models import createModel, stable_features, mutable_feature_list -app = Flask(__name__) - -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -class ModelData: - def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: - """ - Initialize an instance of the ModelData class with word vector models. - - Args: - ModelTokens: Word vectors model for tokens. - ModelMethods: Word vectors model for methods. - ModelGensimEnglish: Word vectors model for general English words. - """ - - self.ModelTokens = modelTokens - self.ModelMethods = modelMethods - self.ModelGensimEnglish = modelGensimEnglish - self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') - -class AppCache: - def __init__(self, Path, Filename) -> None: - self.Cache = {} - self.Path = Path - self.Filename = Filename - - def load(self): - if not os.path.isdir(self.Path): - raise Exception("Cannot load path: "+self.Path) - else: - if not os.path.isfile(self.Path+"/"+self.Filename): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump({}, JSONcache) - JSONcache.close() - JSONcache = open(self.Path+"/"+self.Filename, 'r') - self.Cache = json.load(JSONcache) - JSONcache.close() - - def add(self, identifier, result): - info = result - info.update({"firstEncounter": time.time()}) - info.update({"lastEncounter": time.time()}) - info.update({"count": 1}) - info.update({"version": "SCANL 1.0"}) - self.Cache.update({identifier : info}) - - def encounter(self, identifier): - self.Cache[identifier].update({"lastEncounter": time.time()}) - self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) - self.Cache[identifier].update({"version": "SCANL 1.0"}) - - def save(self): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump(self.Cache, JSONcache) - JSONcache.close() - -class WordList: - def __init__(self, Path): - self.Words = set() - self.Path = Path - - def load(self): - if not os.path.isfile(self.Path): - print("Could not find word list file!") - return - with open(self.Path) as file: - for line in file: - self.Words.add(line[:line.find(',')]) #stop at comma - - def find(self, item): - return item in self.Words - -def initialize_model(): - """ - Initialize and load word vectors for the 
application, and load a word count DataFrame. - - This function initializes and loads word vectors using the 'createModel' function, and loads word counts - from a JSON file into a Pandas DataFrame for use in the application. - - Returns: - tuple: (ModelData, WORD_COUNT DataFrame) - """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) - -def start_server(temp_config = {}): - """ - Initialize the model and start the server. - - This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using - the waitress `serve` method, allowing incoming HTTP requests to be handled. - - The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to - listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). - - Returns: - None - """ - print('initializing model...') - initialize_model() - - print("loading cache...") - if not os.path.isdir("cache"): os.mkdir("cache") - app.cache = AppCache("cache", "cache.json") - app.studentCache = AppCache("cache", "student_cache.json") - app.cache.load() - - app.english_words = set(w.lower() for w in nltk.corpus.words.words()) - #insert english words from words/en.txt - if not os.path.exists("words/en.txt"): - print("could not find English words, using WordNet only!") - else: - with open("words/en.txt") as words: - for word in words: - app.english_words.add(word[:-1]) - - print('retrieving server configuration...') - data = open(os.path.join(SCRIPT_DIR, '..', 'serve.json')) - config = json.load(data) - - server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] - server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] - server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] - - print("loading word list...") - wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] - app.words = WordList(wordListPath) - app.words.load() - - print("Starting server...") - serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) - data.close() - -def dictionary_lookup(word): - #return true if the word exists in the dictionary (the nltk words corpus) - #or if the word is in the list of approved words - dictionaryType = "" - dictionary = word.lower() in app.english_words - acceptable = app.words.find(word) - digit = word.isnumeric() - if (dictionary): - dictionaryType = "DW" - elif (acceptable): - dictionaryType = "AW" - elif (digit): - dictionaryType = "DD" - else: - dictionaryType = "UC" - - return dictionaryType - -#TODO: this is not an intuitive way to save cache -@app.route('/') -def save(): - app.cache.save() - app.studentCache.save() - 
return "successfully saved cache" - -#TODO: use a query string instead for specifying student cache -@app.route('///') -def listen(student, identifier_name: str, identifier_context: str) -> list[dict]: - #check if identifier name has already been used - cache = None - - if (student == "student"): - cache = app.studentCache - else: - cache = app.cache - - if (identifier_name in cache.Cache.keys()): - cache.encounter(identifier_name) - return cache.Cache[identifier_name] - - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. - """ - print(f"INPUT: {identifier_name} {identifier_context}") - - # Split identifier_name into words - words = ronin.split(identifier_name) - - # # Create initial data frame - data = pd.DataFrame({ - 'WORD': words, - 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number - }) - - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data - data = createFeatures( - data, - mutable_feature_list, - modelGensimEnglish=app.model_data.ModelGensimEnglish, - ) - - categorical_features = ['NLTK_POS','PREV_POS', 'NEXT_POS'] - category_variables = [] - - for category_column in categorical_features: - if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier - clf = joblib.load(os.path.join(SCRIPT_DIR, '..', 'models', 'model_GradientBoostingClassifier.pkl')) - predicted_tags = annotate_identifier(clf, data) - - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] - dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) - - # append result to cache - cache.add(identifier_name, result) - - return result - -def context_to_number(context): - """ - Convert a textual context description to a numerical representation. - - This function takes a context description as a string and maps it to a numerical representation according to a - predefined mapping. - - Args: - context (str): The textual context description. 
- - Returns: - int: The numerical representation of the context. - - Raises: - ValueError: If the provided context is not one of the predefined values. - - Example: - numeric_context = context_to_number("CLASS") - """ - if context == "ATTRIBUTE": - return 1 - elif context == "CLASS": - return 2 - elif context == "DECLARATION": - return 3 - elif context == "FUNCTION": - return 4 - elif context == "PARAMETER": - return 5 - -def annotate_identifier(clf, data): - """ - Annotate identifier tokens using a trained classifier. - - This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the - classifier to predict labels for the identifier tokens. - - Args: - clf (Classifier): The trained classifier model. - data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should - match the feature names used during training. - - Returns: - np.array: An array of predicted labels for the identifier tokens. - """ - # Drop unnecessary columns - data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') - - # Ensure only the features used during training are included - trained_features = clf.feature_names_in_ # Features expected by the classifier - missing_features = set(trained_features) - set(data.columns) - extra_features = set(data.columns) - set(trained_features) - - if missing_features: - raise ValueError(f"The following expected features are missing: {missing_features}") - if extra_features: - print(f"Warning: The following unused features are being ignored: {extra_features}") - data = data[trained_features] - - # Ensure feature order matches the trained model - df_features = data[trained_features] - - # Make predictions - y_pred = clf.predict(df_features) - return y_pred +import os +import time +import joblib +import nltk +import pandas as pd +from src.feature_generator import createFeatures, universal_to_custom, custom_to_numeric +from flask import Flask +from waitress import serve +from spiral import ronin +import json +import sqlite3 +from src.create_models import createModel, stable_features, mutable_feature_list +app = Flask(__name__) + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +class ModelData: + def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: + """ + Initialize an instance of the ModelData class with word vector models. + + Args: + ModelTokens: Word vectors model for tokens. + ModelMethods: Word vectors model for methods. + ModelGensimEnglish: Word vectors model for general English words. 
+ """ + + self.ModelTokens = modelTokens + self.ModelMethods = modelMethods + self.ModelGensimEnglish = modelGensimEnglish + self.wordCount = wordCount + # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') + +class AppCache: + def __init__(self, Path) -> None: + self.Path = Path + + def load(self): + #create connection to database + conn = sqlite3.connect(self.Path) + #create the table of names if it doesn't exist + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS names ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + words TEXT, -- this is a JSON string + firstEncounter INTEGER, + lastEncounter INTEGER, + count INTEGER + ) + ''') + #close the database connection + conn.commit() + conn.close() + + def add(self, identifier, result): + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #add identifier to table + record = { + "name": identifier, + "words": json.dumps(result["words"]), + "firstEncounter": time.time(), + "lastEncounter": time.time(), + "count": 1 + } + cursor.execute(''' + INSERT INTO names (name, words, firstEncounter, lastEncounter, count) + VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) + ''', record) + #close the database connection + conn.commit() + conn.close() + + def retrieve(self, identifier): + #return a dictionary of the name, or false if not in database + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) + row = cursor.fetchone() + conn.close() + + if row: + return { + "name": row[0], + "words": json.loads(row[1]), + "firstEncounter": row[2], + "lastEncounter": row[3], + "count": row[4] + } + else: + return False + + def encounter(self, identifier): + currentCount = self.retrieve(identifier)["count"] + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #update record + cursor.execute(''' + UPDATE names + SET lastEncounter = ?, count = ? + WHERE name = ? + ''', (time.time(), currentCount+1, identifier)) + #close connection + conn.commit() + conn.close() + +class WordList: + def __init__(self, Path): + self.Words = set() + self.Path = Path + + def load(self): + if not os.path.isfile(self.Path): + print("Could not find word list file!") + return + with open(self.Path) as file: + for line in file: + self.Words.add(line[:line.find(',')]) #stop at comma + + def find(self, item): + return item in self.Words + +def initialize_model(): + """ + Initialize and load word vectors for the application, and load a word count DataFrame. + + This function initializes and loads word vectors using the 'createModel' function, and loads word counts + from a JSON file into a Pandas DataFrame for use in the application. + + Returns: + tuple: (ModelData, WORD_COUNT DataFrame) + """ + print("Loading word vectors!!") + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) + print("Word vectors loaded!!") + + # Load the word count JSON file into a DataFrame + word_count_path = os.path.join("input", "word_count.json") + if os.path.exists(word_count_path): + print(f"Loading word count data from {word_count_path}...") + word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() + word_count_df.columns = ['word', 'log_frequency'] + print("Word count data loaded!") + else: + print(f"Word count file not found at {word_count_path}. 
Initializing empty DataFrame.") + word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) + + # Create and store model data + app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + +def start_server(temp_config = {}): + """ + Initialize the model and start the server. + + This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using + the waitress `serve` method, allowing incoming HTTP requests to be handled. + + The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to + listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). + + Returns: + None + """ + print('initializing model...') + initialize_model() + + print("loading cache...") + if not os.path.isdir("cache"): os.mkdir("cache") + + print("laoding dictionary") + #TODO: if there's issues with uncateogorized words, it's porbably because this is commented out + #nltk.download("words") + app.english_words = set(w.lower() for w in nltk.corpus.words.words()) + #insert english words from words/en.txt + if not os.path.exists("words/en.txt"): + print("could not find English words, using WordNet only!") + else: + with open("words/en.txt") as words: + for word in words: + app.english_words.add(word[:-1]) + + print('retrieving server configuration...') + data = open(os.path.join(SCRIPT_DIR, '..', 'serve.json')) + config = json.load(data) + + server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] + server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] + server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] + + print("loading word list...") + wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] + app.words = WordList(wordListPath) + app.words.load() + + print("Starting server...") + serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) + data.close() + +def dictionary_lookup(word): + #return true if the word exists in the dictionary (the nltk words corpus) + #or if the word is in the list of approved words + dictionaryType = "" + dictionary = word.lower() in app.english_words + acceptable = app.words.find(word) + digit = word.isnumeric() + if (dictionary): + dictionaryType = "DW" + elif (acceptable): + dictionaryType = "AW" + elif (digit): + dictionaryType = "DD" + else: + dictionaryType = "UC" + + return dictionaryType + +#route to check for and create a database if it does not exist already +@app.route('/probe/') +def probe(cache_id: str): + if os.path.exists("cache/"+cache_id+".db3"): + return "Opening existing identifier database..." + else: + return "First request will create identifier database: "+cache_id+"..." 
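
#Illustrative usage (comments only, not executed): assuming the server is
#reachable at http://localhost:8080 (scheme, address and port are placeholders;
#the real values come from serve.json), a client could first probe for a
#per-project cache database and then request tags for an identifier in one of
#the contexts accepted by context_to_number() below (ATTRIBUTE, CLASS,
#DECLARATION, FUNCTION or PARAMETER):
#
#    curl http://localhost:8080/probe/myproject
#    curl http://localhost:8080/numberArray/DECLARATION/myproject
#
#The tagging route responds with JSON of the form
#    {"words": [{"number": {"tag": ..., "dictionary": "DW"}},
#               {"Array": {"tag": ..., "dictionary": "DW"}}]}
#where "tag" is the part-of-speech tag predicted by the classifier and
#"dictionary" is the DW/AW/DD/UC code assigned by dictionary_lookup() above;
#"myproject" and "numberArray" are placeholder values used only for illustration.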
+ +#route to tag an identifier name +@app.route('//') +@app.route('///') +def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> list[dict]: + #check if identifier name has already been used + cache = None + #find the existing cache in app.caches or create a new one if it doesn't exist + if cache_id != None: + if os.path.exists("cache/"+cache_id+".db3"): + #check if the identifier name is in this cache and return it if so + cache = AppCache("cache/"+cache_id+".db3") + data = cache.retrieve(identifier_name) + if data != False: + cache.encounter(identifier_name) + return data + else: + #create the cache + cache = AppCache("cache/"+cache_id+".db3") + cache.load() + + #TODO: update this documentation + """ + Process a web request to analyze an identifier within a specific context. + + This route function takes two URL parameters (identifier_name, and identifier_context) from an + incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. + It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. + + Args: + identifier_name (str): The name of the identifier to be analyzed. + identifier_context (str): The context in which the identifier appears. + + Returns: + List[dict]: A list of dictionaries containing words and their predicted POS tags. + """ + print(f"INPUT: {identifier_name} {identifier_context}") + + # Split identifier_name into words + words = ronin.split(identifier_name) + + # # Create initial data frame + data = pd.DataFrame({ + 'WORD': words, + 'SPLIT_IDENTIFIER': ' '.join(words), + 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number + }) + + # create response JSON + # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) + result = { + "words" : [] + } + + # Add features to the data + data = createFeatures( + data, + mutable_feature_list, + modelGensimEnglish=app.model_data.ModelGensimEnglish, + ) + + categorical_features = ['NLTK_POS','PREV_POS', 'NEXT_POS'] + category_variables = [] + + for category_column in categorical_features: + if category_column in data.columns: + category_variables.append(category_column) + data.loc[:, category_column] = data[category_column].astype(str) + + for category_column in category_variables: + # Explicitly handle categorical conversion + unique_values = data[category_column].unique() + category_map = {} + for value in unique_values: + if value in universal_to_custom: + category_map[value] = custom_to_numeric[universal_to_custom[value]] + else: + category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories + + data.loc[:, category_column] = data[category_column].map(category_map) + + # Convert categorical variables to numeric + # Load and apply the classifier + clf = joblib.load(os.path.join(SCRIPT_DIR, '..', 'models', 'model_GradientBoostingClassifier.pkl')) + predicted_tags = annotate_identifier(clf, data) + + # Combine words and their POS tags into a parseable format + #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] + + for i in range(len(words)): + #check dictionary + dictionary = "UC" #uncategorized + word = words[i] + dictionary = dictionary_lookup(word) + result["words"].append( + { + words[i] : { + "tag" : predicted_tags[i], + "dictionary" : dictionary + } + } + ) + + # append result to cache + if cache_id != None: + cache.add(identifier_name, result) + + return result + +def 
context_to_number(context): + """ + Convert a textual context description to a numerical representation. + + This function takes a context description as a string and maps it to a numerical representation according to a + predefined mapping. + + Args: + context (str): The textual context description. + + Returns: + int: The numerical representation of the context. + + Raises: + ValueError: If the provided context is not one of the predefined values. + + Example: + numeric_context = context_to_number("CLASS") + """ + if context == "ATTRIBUTE": + return 1 + elif context == "CLASS": + return 2 + elif context == "DECLARATION": + return 3 + elif context == "FUNCTION": + return 4 + elif context == "PARAMETER": + return 5 + +def annotate_identifier(clf, data): + """ + Annotate identifier tokens using a trained classifier. + + This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the + classifier to predict labels for the identifier tokens. + + Args: + clf (Classifier): The trained classifier model. + data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should + match the feature names used during training. + + Returns: + np.array: An array of predicted labels for the identifier tokens. + """ + # Drop unnecessary columns + data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') + + # Ensure only the features used during training are included + trained_features = clf.feature_names_in_ # Features expected by the classifier + missing_features = set(trained_features) - set(data.columns) + extra_features = set(data.columns) - set(trained_features) + + if missing_features: + raise ValueError(f"The following expected features are missing: {missing_features}") + if extra_features: + print(f"Warning: The following unused features are being ignored: {extra_features}") + data = data[trained_features] + + # Ensure feature order matches the trained model + df_features = data[trained_features] + + # Make predictions + y_pred = clf.predict(df_features) + return y_pred + From 9b7cd2e24c1a7ea9e4aef6c5f0ad8ce648a50323 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 5 May 2025 20:33:40 -0400 Subject: [PATCH 24/51] Account for context when saving identifiers --- requirements.txt | 2 +- src/tag_identifier.py | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index 00f84f5..5c0ca05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ filelock==3.17.0 flair==0.15.0 Flask==3.1.0 fonttools==4.55.6 -fsspec==2024.12.0 +fsspec==2023.5.0 ftfy==6.3.1 gdown==5.2.0 gensim==4.3.3 diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 0af80c1..6a3a889 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -43,6 +43,7 @@ def load(self): CREATE TABLE IF NOT EXISTS names ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, + context TEXT NOT NULL, words TEXT, -- this is a JSON string firstEncounter INTEGER, lastEncounter INTEGER, @@ -53,31 +54,32 @@ def load(self): conn.commit() conn.close() - def add(self, identifier, result): + def add(self, identifier, result, context): #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() #add identifier to table record = { "name": identifier, + "context": context, "words": json.dumps(result["words"]), "firstEncounter": time.time(), "lastEncounter": time.time(), "count": 1 } cursor.execute(''' - INSERT INTO names (name, 
words, firstEncounter, lastEncounter, count) - VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) + INSERT INTO names (name, context, words, firstEncounter, lastEncounter, count) + VALUES (:name, :context, :words, :firstEncounter, :lastEncounter, :count) ''', record) #close the database connection conn.commit() conn.close() - def retrieve(self, identifier): + def retrieve(self, identifier, context): #return a dictionary of the name, or false if not in database conn = sqlite3.connect(self.Path) cursor = conn.cursor() - cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ? AND context = ?", (identifier, context)) row = cursor.fetchone() conn.close() @@ -92,8 +94,8 @@ def retrieve(self, identifier): else: return False - def encounter(self, identifier): - currentCount = self.retrieve(identifier)["count"] + def encounter(self, identifier, context): + currentCount = self.retrieve(identifier, context)["count"] #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() @@ -171,9 +173,8 @@ def start_server(temp_config = {}): if not os.path.isdir("cache"): os.mkdir("cache") print("laoding dictionary") - #TODO: if there's issues with uncateogorized words, it's porbably because this is commented out - #nltk.download("words") app.english_words = set(w.lower() for w in nltk.corpus.words.words()) + #insert english words from words/en.txt if not os.path.exists("words/en.txt"): print("could not find English words, using WordNet only!") @@ -236,9 +237,9 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) if os.path.exists("cache/"+cache_id+".db3"): #check if the identifier name is in this cache and return it if so cache = AppCache("cache/"+cache_id+".db3") - data = cache.retrieve(identifier_name) + data = cache.retrieve(identifier_name, identifier_context) if data != False: - cache.encounter(identifier_name) + cache.encounter(identifier_name, identifier_context) return data else: #create the cache @@ -329,7 +330,7 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) # append result to cache if cache_id != None: - cache.add(identifier_name, result) + cache.add(identifier_name, result, identifier_context) return result From dc1222cd588b680cc2f3da6611b118d9e820c61d Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 5 May 2025 21:02:41 -0400 Subject: [PATCH 25/51] Save time to tag an identifier in database --- src/tag_identifier.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 6a3a889..305e390 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -47,14 +47,15 @@ def load(self): words TEXT, -- this is a JSON string firstEncounter INTEGER, lastEncounter INTEGER, - count INTEGER + count INTEGER, + tagTime INTEGER -- time it took to tag the identifier ) ''') #close the database connection conn.commit() conn.close() - def add(self, identifier, result, context): + def add(self, identifier, result, context, tag_time): #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() @@ -65,11 +66,12 @@ def add(self, identifier, result, context): "words": json.dumps(result["words"]), "firstEncounter": time.time(), "lastEncounter": time.time(), - "count": 1 + "count": 1, + "tagTime": tag_time } cursor.execute(''' - INSERT INTO names (name, context, 
words, firstEncounter, lastEncounter, count) - VALUES (:name, :context, :words, :firstEncounter, :lastEncounter, :count) + INSERT INTO names (name, context, words, firstEncounter, lastEncounter, count, tagTime) + VALUES (:name, :context, :words, :firstEncounter, :lastEncounter, :count, :tagTime) ''', record) #close the database connection conn.commit() @@ -263,6 +265,9 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) """ print(f"INPUT: {identifier_name} {identifier_context}") + # get the start time + start_time = time.perf_counter() + # Split identifier_name into words words = ronin.split(identifier_name) @@ -328,9 +333,12 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) } ) + # get time it took to tag the identifier + tag_time = time.perf_counter() - start_time + # append result to cache if cache_id != None: - cache.add(identifier_name, result, identifier_context) + cache.add(identifier_name, result, identifier_context, tag_time) return result From 0ac367b2e516cbefccadcd54395ab59ac4a0592d Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Fri, 9 May 2025 21:41:46 -0400 Subject: [PATCH 26/51] Removed unused dependencies --- Dockerfile | 5 +++-- requirements.txt | 13 ------------- serve.json | 2 +- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 54f54e7..d0e20b1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,11 @@ FROM python:3.12-slim # Install (and build) requirements COPY requirements.txt /requirements.txt -RUN apt-get update --fix-missing && \ +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && \ + apt-get update --fix-missing && \ apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ - rm -rf /var/lib/apt/lists/* + apt-get clean && rm -rf /var/lib/apt/lists/* COPY . . RUN pip install -e . 
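
For reference, a minimal sketch (not part of the patches themselves) of how the tagTime values
recorded by [PATCH 25/51] could be read back from one of the per-project cache databases under
cache/; the file name cache/example.db3 is a placeholder:

    import sqlite3

    conn = sqlite3.connect("cache/example.db3")  # placeholder database name
    cursor = conn.cursor()
    # tagTime stores the seconds measured with time.perf_counter() when the
    # identifier was first tagged; count tracks how often it has been requested.
    cursor.execute("SELECT name, context, count, tagTime FROM names ORDER BY tagTime DESC")
    for name, context, count, tag_time in cursor.fetchall():
        print(f"{name} ({context}): requested {count}x, tagged in {tag_time:.3f}s")
    conn.close()
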
diff --git a/requirements.txt b/requirements.txt index 5c0ca05..450f86d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,18 +42,6 @@ mpmath==1.3.0 networkx==3.4.2 nltk==3.9.1 numpy==1.26.4 -nvidia-cublas-cu12==12.4.5.8 -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.0.70 -nvidia-cufft-cu12==11.2.1.3 -nvidia-curand-cu12==10.3.5.147 -nvidia-cusolver-cu12==11.6.1.9 -nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.21.5 -nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 packaging==24.2 pandas==2.2.3 pillow==11.1.0 @@ -93,7 +81,6 @@ torch==2.5.1 tqdm==4.67.1 transformer-smaller-training-vocab==0.4.0 transformers==4.48.1 -triton==3.1.0 typing_extensions==4.12.2 tzdata==2025.1 urllib3==2.3.0 diff --git a/serve.json b/serve.json index 3eeb486..6ecacbc 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", - "port": 8080, + "port": 5000, "protocol": "https", "words":"" } From 14dd5c1784b5208ddae19ec0cbabfd392e8e76b3 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 10 May 2025 04:34:25 -0400 Subject: [PATCH 27/51] Optional GPU accelaration in Docker --- Dockerfile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d0e20b1..b747297 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,18 @@ FROM python:3.12-slim +#argument to enable GPU accelaration +ARG GPU=false + # Install (and build) requirements COPY requirements.txt /requirements.txt +COPY requirements_gpu.txt /requirements_gpu.txt RUN apt-get clean && rm -rf /var/lib/apt/lists/* && \ apt-get update --fix-missing && \ apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ + if [ "$GPU" = true ]; then \ + pip install -r requirements_gpu.txt; \ + fi && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY . . 
@@ -72,4 +79,4 @@ CMD date; \ echo "Running..."; \ /main -r --words words/abbreviationList.csv -ENV TZ=US/Michigan \ No newline at end of file +ENV TZ=US/Michigan From b4c62d15d1ba53b32cd3eef785ffa6c27b8db31a Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 10 May 2025 04:38:34 -0400 Subject: [PATCH 28/51] Separate GPU accelaration dependencies --- requirements_gpu.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 requirements_gpu.txt diff --git a/requirements_gpu.txt b/requirements_gpu.txt new file mode 100644 index 0000000..801da7d --- /dev/null +++ b/requirements_gpu.txt @@ -0,0 +1,13 @@ +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +triton==3.1.0 \ No newline at end of file From 9eaa5763e4d76be3bc629cfe9fcceb72c4f4b348 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 10 May 2025 23:02:26 -0400 Subject: [PATCH 29/51] =?UTF-8?q?=E2=80=9CUpdate=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements_gpu.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/requirements_gpu.txt b/requirements_gpu.txt index 801da7d..c9a1ba1 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -2,12 +2,11 @@ nvidia-cublas-cu12==12.4.5.8 nvidia-cuda-cupti-cu12==12.4.127 nvidia-cuda-nvrtc-cu12==12.4.127 nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.0.70 +nvidia-cudnn-cu12==9.1.1.17 nvidia-cufft-cu12==11.2.1.3 nvidia-curand-cu12==10.3.5.147 nvidia-cusolver-cu12==11.6.1.9 nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.21.5 +nvidia-nccl-cu12==2.23.4 nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 -triton==3.1.0 \ No newline at end of file +nvidia-nvtx-cu12==12.4.127 \ No newline at end of file From 47dbb3b13e25a9092ec8b70833dfff68dcd4aaaa Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 18 May 2025 16:47:24 -0400 Subject: [PATCH 30/51] Update serve.json --- serve.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serve.json b/serve.json index 6ecacbc..3eeb486 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", - "port": 5000, + "port": 8080, "protocol": "https", "words":"" } From 73ac7d409674166dc6f5d02fc86759ed02d23145 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Mon, 2 Jun 2025 19:24:39 -0400 Subject: [PATCH 31/51] Add new type of tagger --- input/tagger_data.tsv | 2610 +++++++++++++++++ main | 207 +- requirements.txt | 17 +- src/lm_based_tagger/__init__.py | 0 .../distilbert_preprocessing.py | 187 ++ src/lm_based_tagger/distilbert_tagger.py | 178 ++ src/lm_based_tagger/train_model.py | 127 + src/tag_identifier.py | 74 +- src/tree_based_tagger/__init__.py | 0 .../classifier_multiclass.py | 143 +- src/{ => tree_based_tagger}/create_models.py | 0 .../download_code2vec_vectors.py | 0 .../feature_generator.py | 0 13 files changed, 3353 insertions(+), 190 deletions(-) create mode 100644 input/tagger_data.tsv create mode 100644 src/lm_based_tagger/__init__.py create mode 100644 src/lm_based_tagger/distilbert_preprocessing.py create mode 100644 src/lm_based_tagger/distilbert_tagger.py create mode 100644 src/lm_based_tagger/train_model.py create mode 100644 
src/tree_based_tagger/__init__.py rename src/{ => tree_based_tagger}/classifier_multiclass.py (64%) rename src/{ => tree_based_tagger}/create_models.py (100%) rename src/{ => tree_based_tagger}/download_code2vec_vectors.py (100%) rename src/{ => tree_based_tagger}/feature_generator.py (100%) diff --git a/input/tagger_data.tsv b/input/tagger_data.tsv new file mode 100644 index 0000000..4b6a6df --- /dev/null +++ b/input/tagger_data.tsv @@ -0,0 +1,2610 @@ +TYPE SPLIT CONTEXT GRAMMAR_PATTERN LANGUAGE SYSTEM_NAME +ABCOpt ABC Opt CLASS NM N C++ swift +int64t abs deadline n sec value PARAMETER NM NM NM NM N C++ grpc +AbstractBuildExecution Abstract Build Execution CLASS NM NM N Java jenkins +ofVec3f accelerometer Data PARAMETER NM N C openFrameworks +boolean accepting Tasks PARAMETER NM NPL Java jenkins +AccessorConformanceInfo Accessor Conformance Info CLASS NM NM N C++ swift +Map action To Index Map ATTRIBUTE N P NM N Java antlr4 +ActionTranslator Action Translator CLASS NM N Java antlr4 +KToggleAction action View Show Master Pages ATTRIBUTE NM N V NM NPL C++ calligra +AtomicInteger active Copies ATTRIBUTE NM NPL Java elasticsearch +QString active Tool Id DECLARATION NM NM N C++ calligra +int actual Count ATTRIBUTE NM N Java mockito +int actual Suffix DECLARATION NM N Java junit4 +void Add Bezier Curve FUNCTION V NM N C++ bullet3 +boolean add Bias To Embedding ATTRIBUTE V N P N Java corenlp +void Add Log To Stream FUNCTION V N P N C++ Telegram +void add Object Index FUNCTION V NM N C++ blender +void add Ordered List FUNCTION V NM N Java mockito +void add Unicode Script Codes To Names FUNCTION V NM NM NPL P NPL Java antlr4 +Guid Added Unicode Smp ATTRIBUTE NM NM N C# antlr4 +boolean adding DECLARATION V Java jenkins +grpcresolvedaddress addr PARAMETER N C grpc +AddressLowering Address Lowering CLASS N V C++ swift +int ADDRESS TYPE IPV4 ATTRIBUTE NM NM N Java okhttp +QGradient adjusted Gradient FUNCTION NM N C++ calligra +gsecaeadcrypter aead crypter seal DECLARATION NM NM N C++ grpc +void after Channel FUNCTION P N Java jenkins +List all action Roots DECLARATION DT NM NPL Java antlr4 +vector all Factors PARAMETER DT NPL C++ QuantLib +List all Invocation Matchers DECLARATION DT NM NPL Java mockito +QString all Open Files String FUNCTION DT NM NM N C++ kdevelop +List all Open Indices DECLARATION DT NM NPL Java elasticsearch +Collection all Stubbings ATTRIBUTE DT NPL Java mockito +AltAndContextConfigEqualityComparator Alt And Context Config Equality Comparator CLASS N CJ NM NM NM N Java antlr4 +Map alt Label Ctxs ATTRIBUTE NM NM NPL Java antlr4 +String alt Name PARAMETER NM N Java okhttp +path alternative Cache Name PARAMETER NM NM N C++ irrlicht +gboolean alwayspreview always preview DECLARATION VM V C gimp +ofFloatColor ambient ATTRIBUTE N C openFrameworks +float ambient coefficient PARAMETER NM N C++ bullet3 +QStringList anchor Name DECLARATION NM N C++ calligra +AndroidByteBuddyMockMaker Android Byte Buddy Mock Maker CLASS NM NM NM NM N Java mockito +qreal angle In Radian DECLARATION N P N C++ calligra +GtkWidget animation box DECLARATION NM N C gimp +KPrPredefinedAnimationsLoader animations Data PARAMETER NM NPL C++ calligra +String App ID PARAMETER NM N Java opencv +void append Python Style Escaped Code Point FUNCTION V NM N NM NM N Java antlr4 +void append Sequence FUNCTION V N C++ kdevelop +void apply Margin Insets FUNCTION V NM NPL Java Telegram +ApplyRewriter Apply Rewriter CLASS NM N C++ swift +ArabicSegmenter Arabic Segmenter CLASS NM N Java corenlp +AreaInfoDialog_t Area Info Dialog t 
CLASS NM NM NM N C gimp.idents +AresDnsResolver Ares Dns Resolver CLASS NM NM N C++ grpc +int ares expand name for response FUNCTION PRE V N P N C grpc +Collection arg Mismatch Stubbings PARAMETER NM NM NPL Java mockito +void ARGB Subtract Row C FUNCTION N V NM N C++ Telegram +Class array Class PARAMETER NM N Java junit4 +ArrayCountPropagation Array Count Propagation CLASS NM NM N C++ swift +SILValue Array Struct Value PARAMETER NM NM N C++ swift +void assign Lexer Token Types FUNCTION V NM NM NPL Java antlr4 +Set assigns PARAMETER NPL Java junit4 +int atn State DECLARATION NM N C# antlr4 +int ATOM ATTRIBUTE N Java antlr4 +Attribute Attribute CLASS N C++ openFrameworks +OrderedHashSet attribute Decls ATTRIBUTE NM NPL Java antlr4 +QXmlStreamAttributes attrs DECLARATION NPL C++ calligra +AUDSound AUD Sound envelope FUNCTION PRE NM N C++ blender +Authentication auth Result PARAMETER NM N Java jenkins +uint average In Bucket Used Slot Count ATTRIBUTE N P NM NM NM N C kdevelop +uint8x8t avg 1 DECLARATION N D C opencv +Map avg Map DECLARATION NM N Java corenlp +BusinessDayConvention b d c PARAMETER NM NM N C++ QuantLib +bool b Flip Horizontally PARAMETER PRE V VM C++ openFrameworks +boolean b Is Playing ATTRIBUTE PRE V V Java openFrameworks +bool b Loop ATTRIBUTE PRE N C++ openFrameworks +bool b Multi Fullscreen PARAMETER PRE NM N C++ openFrameworks +void b3 Push Profile Timing FUNCTION PRE V NM N C++ bullet3 +BackgroundParserPrivate Background Parser Private CLASS NM N NM C++ kdevelop +boolean backprop Training PARAMETER NM N Java corenlp +String base Category DECLARATION NM N Java corenlp +BaseLoaderCallback Base Loader Callback CLASS NM NM N Java opencv +btTransform base transform DECLARATION NM N C++ bullet3 +BasicReplicationRequest Basic Replication Request CLASS NM NM N Java elasticsearch +BasketPayoff Basket Payoff CLASS NM N C++ QuantLib +Map beanConfigs bean Configs ATTRIBUTE NM NPL Java jenkins +List befores ATTRIBUTE NPL Java junit4 +guchar best b DECLARATION NM N C gimp +IndexedWord best Node DECLARATION NM N Java corenlp +XIMStyle best Style DECLARATION NM N C++ irrlicht +QString bibliography Type PARAMETER NM N C++ calligra +Constructor biggest Constructor FUNCTION NM N Java mockito +BinaryRule Binary Rule CLASS NM N Java corenlp +String binary Search Bytes FUNCTION NM N NPL Java okhttp +MMatrix bind Matrix DECLARATION NM N C++ ogre +s16 binormal type DECLARATION NM N C++ irrlicht +u32 Bitmap Data Size ATTRIBUTE NM NM N C irrlicht +int BKE lattice index flip FUNCTION PRE NM N V C blender +void BLI str r strip FUNCTION PRE N VM V C blender +unsigned block Nr PARAMETER NM N C++ calligra +BMLog bm log ATTRIBUTE NM N C blender +vector bone Positions PARAMETER NM NPL C++ ogre +int bound Port PARAMETER NM N C# grpc +float Bounding Radius ATTRIBUTE NM N C irrlicht +LongConsumer breaker Consumer PARAMETER NM N Java elasticsearch +BreakpointDataPtr breakpoint ATTRIBUTE N C++ kdevelop +SILValue Bridged Value Fun Arg PARAMETER NM NM NM N C++ swift +quint32 brush Hatch PARAMETER NM N C++ calligra +btCollisionShape bt Collision Shape CLASS PRE NM N C++ bullet3 +class btDeformableContactConstraint bt Deformable Contact Constraint CLASS PRE NM NM N C++ bullet3 +btMaterial bt Material CLASS PRE N C++ bullet3 +BucketCollector Bucket Collector CLASS NM N Java elasticsearch +ParseField BUCKETS PATH FIELD ATTRIBUTE NM NM N Java elasticsearch +u32 buf Num PARAMETER NM N C++ irrlicht +vector buffers ATTRIBUTE NPL C++ Telegram +BuildContext build Context PARAMETER NM N Java antlr4 +void build Group Query 
FUNCTION V NM N Java elasticsearch +void build Lexer Rule Actions FUNCTION V NM NM NPL Java antlr4 +gint build time minor PARAMETER NM NM N C gimp +long build Timestamp ATTRIBUTE NM N Java jenkins +BuildWrapper Build Wrapper CLASS NM N Java jenkins +Path built Url PARAMETER NM N C++ kdevelop +ByteArrayView byte Array View PARAMETER NM NM N C++ kdevelop +uint32t byte Count DECLARATION NM N C Telegram +int8t byte Src DECLARATION NM N C++ Telegram +long bytes Received ATTRIBUTE NPL NM Java okhttp +CAnimatedMeshSceneNode C Animated Mesh Scene Node CLASS PRE NM NM NM N C++ irrlicht +CArchiveLoaderTAR C Archive Loader TAR CLASS PRE NM N NM C++ irrlicht +CAtBackgroundColor C At Background Color CLASS PRE NM NM N C++ openFrameworks +CAtFont C At Font CLASS PRE NM N C++ openFrameworks +CAtGrayScale C At Grayscale CLASS PRE NM N C++ openFrameworks +CBurningShader_Raster_Reference C Burning Shader Raster Reference CLASS PRE NM NM NM N C++ irrlicht +CD3D9RenderTarget C D3D9 Render Target CLASS PRE NM NM N C++ irrlicht +CFileReadCallBack C File Read Callback CLASS PRE NM NM N C++ irrlicht +Chttp2Connector C http2 Connector CLASS PRE NM N C++ grpc +CImage C Image CLASS PRE N C++ irrlicht +CImageWriterJPG C Image Writer JPG CLASS PRE NM N NM C++ irrlicht +ceresproblemt c problem PARAMETER NM N C++ blender +CSkinnedMesh C Skinned Mesh CLASS PRE NM N C++ irrlicht +Sink cache Body Unbuffered DECLARATION NM N NM Java okhttp +IndexShardCacheEntity cache Entity DECLARATION NM N Java elasticsearch +QString cache file PARAMETER NM N C++ kdevelop +CacheStrategy Cache Strategy CLASS NM N Java okhttp +CachedRegionTracker Cached Region Tracker CLASS NM NM N Java Telegram +BytecodeGenerator caching Mock Bytecode Generator ATTRIBUTE NM NM NM N Java mockito +u32 calc LUT FUNCTION V N C++ opencv +bool calibration Phase ATTRIBUTE NM N C++ QuantLib +CallErrorExtensions Call Error Extensions CLASS NM NM NPL C# grpc +long call Id PARAMETER NM N Java okhttp +float cam Rot Z DECLARATION NM NM N C++ bullet3 +CameraData Camera Data CLASS NM N C++ irrlicht +GimpCanvasItem canvas item ATTRIBUTE NM N C gimp +vector cap Times DECLARATION NM NPL C++ QuantLib +CapletVarianceCurve Caplet Variance Curve CLASS NM NM N C++ QuantLib +sharedptr cat Risk ATTRIBUTE NM N C++ QuantLib +CategoryFilterFactory Category Filter Factory CLASS NM NM N Java junit4 +Matcher cause Matcher PARAMETER NM N Java junit4 +grpcsslcertificateconfigreloadstatus cb result DECLARATION NM N C++ grpc +CborXContent Cbor X Content CLASS NM NM N Java elasticsearch +FrameIterator cell Cursor DECLARATION NM N C++ calligra +CertificateChainCleaner certificate Chain Cleaner ATTRIBUTE NM NM N Java okhttp +char Ch DECLARATION N C++ grpc +ChangeScroll Change Scroll CLASS V N C++ calligra +ChapterTocFrame Chapter Toc Frame CLASS NM NM N Java Telegram +stbttbuf char strings ATTRIBUTE NM NPL C bullet3 +bool check GL Support FUNCTION V NM N C++ openFrameworks +QSet check Next DECLARATION V N C++ kdevelop +void check Not Interface FUNCTION V VM N Java mockito +void check Not Local FUNCTION V VM N Java mockito +void check Sign FUNCTION V N C++ QuantLib +void check T FUNCTION V N C++ QuantLib +void Check Writeable FUNCTION V NM C# grpc +Set child Categories DECLARATION NM NPL Java junit4 +ChildLocation Child Location CLASS NM N C gimp.idents +Runnable child Statement PARAMETER NM N Java junit4 +ChineseTreebankParserParams Chinese Treebank Parser Params CLASS NM NM NM NPL Java corenlp +void choose Document FUNCTION V N C++ kdevelop +QList chosen Overrides ATTRIBUTE NM NPL C++ 
kdevelop +CipherSuite Cipher Suite CLASS NM N Java okhttp +auto clang Can Ty DECLARATION PRE NM N C++ swift +Annotation[] class Annotations FUNCTION NM NPL Java junit4 +ClassLoaders Class Loaders CLASS NM NPL Java mockito +byte[] classfile Buffer PARAMETER NM N Java mockito +ClauseMatrix Clause Matrix CLASS NM N C++ swift +void Clear Active ID FUNCTION V NM N C++ bullet3 +OkHttpClient client ATTRIBUTE N Java okhttp +HandshakeCertificates client Certificates DECLARATION NM NPL Java okhttp +int client Number ATTRIBUTE NM N Java corenlp +bool clip Dist Bug DECLARATION NM NM N C++ ogre +Clock Clock CLASS N C++ openFrameworks +ClusterUpdateSettingsResponse Cluster Update Settings Response CLASS NM NM NM N Java elasticsearch +List cmd Lines DECLARATION NM NPL Java jenkins +CmdOutputOperationType Cmd Output Operation Type CLASS NM NM NM N C++ ogre +String cmd Str PARAMETER NM N Java corenlp +sharedptr cms Pricer DECLARATION NM N C++ QuantLib +String code Exception Message DECLARATION NM NM N Java okhttp +CodeGeneratorPrivate Code Generator Private CLASS NM N NM C++ kdevelop +int code Point To PARAMETER NM N P Java antlr4 +int[] CODES ATTRIBUTE NPL Java okhttp +long col Length Left DECLARATION NM NM N C++ bullet3 +KoColorConversionSystem color Conversion System ATTRIBUTE NM NM N C++ calligra +ColorMap Color Map CLASS NM N C++ calligra +QMap color Map PARAMETER NM N C++ opencv +uint colored Count ATTRIBUTE NM N C kdevelop +CombinePaintMaskToCanvasBufferToPaintBufAlpha Combine Paint Mask To Canvas Buffer To Paint Buf Alpha CLASS NM NM N P NM N P NM NM N C++ gimp.idents +Command_t Command t CLASS NM N C gimp.idents +CompMask Comp Mask CLASS NM N C++ gimp.idents +Field compare Fields By Name DECLARATION V NPL P N Java mockito +int compare Inverse And Forward Dynamics FUNCTION V NM CJ NM NPL C++ bullet3 +TValue comparison Value PARAMETER NM N C# antlr4 +CompilerItem Compiler Item CLASS NM N C++ kdevelop +bool compiling Contexts DECLARATION V NPL C kdevelop +BatchCompletionDelegate Completion Handler I Unary Response Client Callback ATTRIBUTE NM N NM NM NM NM N C# grpc +PixelComponentType component Type ATTRIBUTE NM N C ogre +List concrete Follower Indices PARAMETER NM NM NPL Java elasticsearch +mat4 cone translation DECLARATION NM N C++ openFrameworks +ConfigureTimeouts Configure Timeouts CLASS NM NPL Java okhttp +serverconnectionstate connection state PARAMETER NM N C++ grpc +ConstParameterFloat Const Parameter Float CLASS NM NM N C++ ogre +ContainerTabBar Container Tab Bar CLASS NM NM N C++ kdevelop +ContentPath Content Path CLASS NM N Java elasticsearch +String content Type String PARAMETER NM NM N Java okhttp +int context Length ATTRIBUTE NM N Java junit4 +ContextTokenListIndexedGetterDecl Context Token List Indexed Getter Decl CLASS NM NM NM NM NM N Java antlr4 +QList context Url List FUNCTION NM NM N C++ kdevelop +ContextualizeClosures Contextualize Closures CLASS V NPL C++ swift +QVector control Points DECLARATION NM NPL C++ calligra +ControlledObject Controlled Object CLASS NM N C++ blender +ControllerValue Controller Value CLASS NM N C++ ogre +string Convert To Php Namespace FUNCTION V P NM N C++ grpc +b3ConvexPolyhedronData convex Shapes PARAMETER NM NPL C bullet3 +CookieJar cookie Jar ATTRIBUTE NM N Java okhttp +ActionBarMenuItem copy Item DECLARATION NM N Java Telegram +GsrProcessCore core ATTRIBUTE N C++ QuantLib +opusint64[] corr QC DECLARATION NM N C Telegram +sharedptr coterminal Model DECLARATION NM N C++ QuantLib +char[] cp 1254 DECLARATION N D C++ calligra +CppGeneratorServices Cpp 
Generator Services CLASS NM NM NPL C# grpc +CqEventQueue Cq Event Queue CLASS NM NM N C++ grpc +int create Duplicate Change Id FUNCTION V NM NM N C++ calligra +ParameterPtr create In Indices FUNCTION V NM NPL C++ ogre +bool create metadata array FUNCTION V NM N C++ grpc +Buffer create Sub Buffer FUNCTION V NM N C++ opencv +grpc_server_credentials* creds_ creds ATTRIBUTE NPL C grpc +CRFNonLinearLogConditionalObjectiveFunction CRF Non Linear Log Conditional Objective Function CLASS NM NM NM NM NM NM N Java corenlp +GeglNode crop node ATTRIBUTE NM N C gimp +Exception curr Thread Exception DECLARATION NM NM N Java junit4 +Mat current Charuco Corners DECLARATION NM NM NPL C++ opencv +String current Filename ATTRIBUTE NM N Java corenlp +InvocationOnMock current Invocation PARAMETER NM N Java mockito +int16_t current_median current median PARAMETER NM N C Telegram +int current Open Shards DECLARATION NM NM NPL Java elasticsearch +String current Prefix PARAMETER NM N Java jenkins +Matrix current Root PARAMETER NM N C++ QuantLib +ISceneNode Current Scene Node ATTRIBUTE NM NM N C++ irrlicht +vector current Sequence DECLARATION NM N C++ QuantLib +int current Slide FUNCTION NM N C++ calligra +gint curvatures height PARAMETER NM N C gimp +vector curves FUNCTION NPL C++ QuantLib +DBusProxy D Bus Proxy CLASS NM NM N C++ kdevelop +float d inf DECLARATION NM N Java Telegram +Mat D mat PARAMETER NM N C++ opencv +DobjPoints D obj Points CLASS NM NM NPL C gimp.idents +D3D11UnsupportedGpuProgram D3D11 Unsupported Gpu Program CLASS NM NM NM N C++ ogre +DataAccessRepositoryPrivate Data Access Repository Private CLASS NM NM N NM C++ kdevelop +FrameworkMethod data Point Method PARAMETER NM NM N Java junit4 +GimpDebugPolicy debug policy ATTRIBUTE NM N C gimp +DeclarationContextPrivate Declaration Context Private CLASS NM N NM C++ kdevelop +DeclarationItem Declaration Item CLASS NM N C++ kdevelop +void declare Component FUNCTION V N C++ kdevelop +Object deep Stub FUNCTION NM N Java mockito +guchar default PARAMETER N C gimp +Answer defaultAnswer default Answer ATTRIBUTE NM N Java mockito +AutoConstantEntry default Auto Entry PARAMETER NM NM N C++ ogre +BlockedItem DEFAULT BLOCKED ITEM COMPARATOR DECLARATION NM NM NM N Java jenkins +ParametersRunnerFactory DEFAULT FACTORY ATTRIBUTE NM N Java junit4 +int DEFAULT FLAGS ATTRIBUTE NM NPL Java elasticsearch +DefaultInjectionEngine Default Injection Engine CLASS NM NM N Java mockito +void default instance void ATTRIBUTE NM NM N C opencv +ColorManagedLook default look DECLARATION NM N C blender +Matx default mat x PARAMETER NM NM N C++ opencv +DefaultMockingDetails Default Mocking Details CLASS NM NM NPL Java mockito +bool default open PARAMETER NM N C bullet3 +String default Role PARAMETER NM N Java jenkins +DefaultSslRootsOverride Default Ssl Roots Override CLASS NM NM NPL NM C# grpc +Define Define CLASS N C++ calligra +int delay agnostic enabled ATTRIBUTE NM N V C Telegram +DeleteResponse delete FUNCTION V Java elasticsearch +void Delete Input FUNCTION V N C grpc +AbstractCoreLabel dep H PARAMETER NM N Java corenlp +ST dependencies ST DECLARATION NM N Java antlr4 +QString DEPOT MESSAGE START DECLARATION NM NM N C++ kdevelop +freenectdepthcb depth cb ATTRIBUTE NM N C openFrameworks +bool depth Stencil As Texture ATTRIBUTE NM N P N C openFrameworks +DeserializationContext Deserialization Context CLASS NM N C# grpc +dimension2d desktop Size PARAMETER NM N C++ irrlicht +GimpVector2 dest points PARAMETER NM NPL C gimp +gboolean destroy with parent PARAMETER V P N C gimp +bool 
Detect Leaks ATTRIBUTE V NPL C grpc +Strictness determine Strictness FUNCTION V N Java mockito +InstallState DEVELOPMENT ATTRIBUTE N Java jenkins +DeviceHandler Device Handler CLASS NM N C++ opencv +DialogElements Dialog Elements CLASS NM NPL C gimp.idents +void dialog info update FUNCTION NM N V C gimp +Real discount Factor PARAMETER NM N C++ QuantLib +DiskLruCache Disk Lru Cache CLASS NM NM N Java okhttp +DispatchMaskBufferIterator Dispatch Mask Buffer Iterator CLASS NM NM NM N C++ gimp.idents +DispatchPaintMask Dispatch Paint Mask CLASS NM NM N C++ gimp.idents +DisposeType dispose PARAMETER N C gimp +DiscountFactor dividend Discount Mother FUNCTION NM NM N C++ QuantLib +bool do Caps PARAMETER V NPL C++ QuantLib +HttpResponse do Forward FUNCTION V V Java jenkins +HttpResponse do Install Status FUNCTION V NM N Java jenkins +void do Login Entry FUNCTION V NM N Java jenkins +bool do Print After FUNCTION V V P C++ swift +DockWidgetArea docking Area PARAMETER NM N C++ kdevelop +DOTGenerator DOT Generator CLASS NM N Java antlr4 +int down sample PARAMETER NM N C Telegram +Real drift term DECLARATION NM N C++ QuantLib +String driver Type Name PARAMETER NM NM N C++ ogre +OutputArray dst map 2 PARAMETER NM N D C++ opencv +float dst Saturation DECLARATION NM N C calligra +int dst Type 1 DECLARATION NM N D C++ opencv +DualConInputReader Dual Con Input Reader CLASS NM NM NM N C++ blender +void dump cemd cmd FUNCTION V NM N C openFrameworks +void dump Core Map To String Builder FUNCTION V NM N P NM N Java corenlp +void dump Frame buffer Formats FUNCTION V NM NM NPL C irrlicht +DvbSubtitleReader Dvb Subtitle Reader CLASS NM NM N Java Telegram +int dynamic Table Byte Count ATTRIBUTE NM NM NM N Java okhttp +int dynamic Table Index FUNCTION NM NM N Java okhttp +DynamicTexturedCubeDemo Dynamic Textured Cube Demo CLASS NM NM NM N C++ bullet3 +bool echo path change PARAMETER V NM N C Telegram +gint edit count ATTRIBUTE NM N C gimp +unsigned Edit Length PARAMETER NM N C++ swift +Style effective Style FUNCTION NM N C++ calligra +GLboolean EGLEW ANDROID frame buffer target ATTRIBUTE PRE PRE NM NM N C blender +PFNCREATEPLATFORMWINDOWSURFACE eglew Create Platform Window Surface ATTRIBUTE PRE V NM NM N C blender +GLboolean EGLEW KHR stream fifo ATTRIBUTE PRE PRE NM N C blender +GParamSpec element spec DECLARATION NM N C gimp +VersionNumber EMBEDDED VERSION ATTRIBUTE NM N Java jenkins +ManagedValue emit Address FUNCTION V N C++ swift +FileObserver[] EMPTY DIRECTORY ATTRIBUTE NM N Java elasticsearch +void enable Background Opacity FUNCTION V NM N C++ calligra +void enable Materials FUNCTION V NPL C++ openFrameworks +freenectdeviceflags enabled subdevices ATTRIBUTE NM NPL C openFrameworks +List encoded Path Segments ATTRIBUTE NM NM NPL Java okhttp +List encoded Values PARAMETER NM N Java okhttp +boolean end Of Input DECLARATION N P N Java antlr4 +ImmutableTextSnapshotRef End Snapshot PARAMETER NM N C++ swift +int enum Count DECLARATION NM N Java junit4 +String env Name PARAMETER NM N Java antlr4 +EnvVarsSlaveInfo_DisplayName Env Vars Slave Info Display Name CLASS NM NM NM NM NM N Java jenkins +Real EPS PARAMETER N C++ QuantLib +EqualsBuilder equals Builder DECLARATION NM N Java mockito +int equals Offset DECLARATION NM N Java okhttp +T err ret PARAMETER NM N C++ Telegram +uint error Mark Type DECLARATION NM NM N C++ kdevelop +ESetTextureActive esa PARAMETER N C irrlicht +Date event 0 DECLARATION N D C++ QuantLib +ArrayList exception Channels ATTRIBUTE NM NPL Java Telegram +int excess Workload PARAMETER NM N Java 
jenkins +Executor Executor CLASS N Java jenkins +List executors DECLARATION NPL Java jenkins +path existing symlink PARAMETER NM N C++ grpc +Set expand Headers From Request FUNCTION V NPL P N Java elasticsearch +int expect rows DECLARATION NM NPL C++ opencv +ExpectedException Expected Exception CLASS NM N Java junit4 +String expected String DECLARATION NM N Java junit4 +ExplicitEulerScheme Explicit Euler Scheme CLASS NM NM N C++ QuantLib +ExpressionFinder Expression Finder CLASS NM N C++ swift +ClusteredBitVector extra Inhabitants Mask DECLARATION NM NM N C++ swift +List extra Interfaces DECLARATION NM NPL Java mockito +TimeSeries extract Component FUNCTION V N C++ QuantLib +boolean extract Events ATTRIBUTE V NPL Java corenlp +Class extract Raw Type Of FUNCTION V NM NM P Java mockito +int f Context Length ATTRIBUTE PRE NM N Java junit4 +Real f Cos DECLARATION NM N C++ ogre +bool f curve Found DECLARATION NM N V C++ ogre +FakeMetaMethod f m m DECLARATION NM NM N C++ kdevelop +Matcher f Matcher ATTRIBUTE PRE N Java junit4 +float f Ptr Out DECLARATION PRE NM N C++ openFrameworks +Real f Tolerance PARAMETER NM N C++ ogre +int face index DECLARATION NM N C blender +Class factory Class DECLARATION NM N Java junit4 +bool fast Load Success DECLARATION VM V N C++ ogre +FdBlackScholesVanillaEngine Fd Black Scholes Vanilla Engine CLASS NM NM NM NM N C++ QuantLib +auto feature Iterator DECLARATION NM N C++ antlr4 +FeedAdapter FEED ADAPTER ATTRIBUTE NM N Java jenkins +void fence FUNCTION N C++ opencv +unsigned field Offset Vector DECLARATION NM NM N C++ swift +SourceFile File ATTRIBUTE N C++ swift +FileChannel file Channel PARAMETER NM N Java okhttp +RepeatedField file Descriptor Proto ATTRIBUTE NM NM N C# grpc +FileItemDelegate File Item Delegate CLASS NM NM N C++ calligra +FileOperator File Operator CLASS NM N Java okhttp +FilePathFilter File Path Filter CLASS NM NM N Java jenkins +FileSystemArchive File System Archive CLASS NM NM N C++ ogre +String filter Spec DECLARATION NM N Java junit4 +List filtered Children ATTRIBUTE NM NPL Java junit4 +Response final Response PARAMETER NM N Java elasticsearch +Metadata find Hashable Base Type FUNCTION V NM NM N C++ swift +String find Source Subdir FUNCTION V NM N Java antlr4 +int fine priority PARAMETER NM N C Telegram +AlertDialog fingerprint Dialog ATTRIBUTE NM N Java Telegram +boolean finished Normally DECLARATION V VM Java elasticsearch +int first Space DECLARATION NM N Java okhttp +gchar first type label PARAMETER NM NM N C gimp +ICameraSceneNode fixed Cam DECLARATION NM N C++ irrlicht +Frequency fixed Leg Frequency DECLARATION NM NM N C++ QuantLib +FixedObject Fixed Object CLASS NM N C++ blender +int flag ATTRIBUTE N C blender +Pair flags Classifier Pair PARAMETER NM NM N Java corenlp +float Float FUNCTION N C++ bullet3 +DayCounter float Day Counter ATTRIBUTE NM NM N C++ QuantLib +vector folder Names PARAMETER NM NPL C++ openFrameworks +MetadataResponse follow Component FUNCTION V N C++ swift +Request followUp DECLARATION N Java okhttp +FootnotesPosition footnotes Position ATTRIBUTE NM N C++ calligra +bool force Direct PARAMETER V N C++ kdevelop +bool forked iter DECLARATION V N C blender +String format Display Name FUNCTION V NM N Java junit4 +boolean format OK DECLARATION N NM Java antlr4 +FormattedText Formatted Text CLASS NM N Java mockito +opusint16[] frame PARAMETER N C Telegram +Frame2 Frame 2 CLASS N D C++ blender +float[] frame pixel coords PARAMETER NM NM NPL C blender +pointer free cell ATTRIBUTE NM N C gimp +int[] free Positions DECLARATION 
NM NPL Java corenlp +BytesReference from Byte Buffers FUNCTION P NM NPL Java elasticsearch +btVector3 from Local Aabb Min DECLARATION P NM NM N C++ bullet3 +boolean from Server PARAMETER P N Java Telegram +gchar full path PARAMETER NM N C gimp +InputArray Func PARAMETER N C++ opencv +IDocument future Active Doc DECLARATION NM NM N C++ kdevelop +Real gauss Lobatto Eps PARAMETER NM NM N C++ QuantLib +String GEN SUBJ ATTRIBUTE NM N Java corenlp +bool generate Rule Bypass Transitions DECLARATION V NM NM NPL C# antlr4 +void Generate Service Descriptor Property FUNCTION V NM NM N C++ grpc +String generated Token On Creation DECLARATION NM N P N Java jenkins +GenericEmissiveClosure Generic Emissive Closure CLASS NM NM N C++ blender +GenericTypeExtractor Generic Type Extractor CLASS NM NM N Java mockito +GeometryInterface Geometry Interface CLASS NM N C++ bullet3 +Object[] get Actual Values FUNCTION V NM NPL Java junit4 +String get Artificial Op Prec Rule FUNCTION V NM NM NM N Java antlr4 +Side get Binary Side FUNCTION V NM N Java corenlp +String get Commit Id FUNCTION V NM N Java jenkins +long get Completed FUNCTION V NM Java elasticsearch +ConstantReference get Constant Reference For Protocol Descriptor FUNCTION V NM N P NM N C++ swift +void get Controller Transform FUNCTION V NM N C++ bullet3 +TextureAtlasAttib get Default Atlasing Attributes FUNCTION V NM NM NPL C++ ogre +InstallState get Default Install State FUNCTION V NM NM N Java jenkins +ColourValue get Diffuse Colour FUNCTION V NM N C++ ogre +String get Display Path FUNCTION V NM N Java jenkins +bool Get DMF Header FUNCTION V NM N C irrlicht +List get Error Listeners FUNCTION V NM NPL Java antlr4 +Long get Failure Timestamp FUNCTION V NM N Java junit4 +GetHomeDirectory Get Home Directory CLASS V NM N Java jenkins +String get Id For Name FUNCTION V N P N Java elasticsearch +int get Initial Window Size FUNCTION V NM NM N Java okhttp +Descriptor get Item Type Descriptor FUNCTION V NM NM N Java jenkins +ematrix6 get Jf FUNCTION V N C++ blender +Tree get Leftmost Descendant FUNCTION V NM N Java corenlp +double Get Longitude FUNCTION V N C# grpc +int get Max Shingle Diff FUNCTION V NM NM N Java elasticsearch +int Get Meta Index FUNCTION V NM N C Telegram +FunctionType get Msg Send Super Ty FUNCTION V NM NM NM N C++ swift +int get Number Of Transitions FUNCTION V N P NPL Java antlr4 +long get Operations Reads FUNCTION V NM NPL Java elasticsearch +String get Phrase 1 FUNCTION V N D Java corenlp +int get PID No Exceptions FUNCTION V N DT NPL Java corenlp +Vector3 get Plane Point FUNCTION V NM N C ogre +Method get Protocol Method PARAMETER NM NM N Java okhttp +Object[] get Raw Arguments FUNCTION V NM NPL Java mockito +float get Red Adjust 2 FUNCTION V NM N D C++ ogre +double Get Related View Data Row Double FUNCTION V NM NM NM N NM C++ grpc +String GET SOURCE NAME ATTRIBUTE NM NM N Java elasticsearch +Real get Top Border Size FUNCTION V NM NM N C++ ogre +GetTotalDiskSpace Get Total Disk Space CLASS V NM NM N Java jenkins +path get Working Directory FUNCTION V NM N C++ irrlicht +int Get XCR 0 FUNCTION V N D C++ Telegram +void gimp canvas rectangle set property FUNCTION PRE NM N V N C gimp +gboolean gimp devices check change FUNCTION PRE NPL V V C gimp +gboolean gimp eraser default FUNCTION PRE N NM C gimp +void gimp filter tool set gyroscope FUNCTION PRE NM N V N C gimp +GimpThumbnail gimp image file get thumbnail FUNCTION PRE NM N V N C gimp +void gimp param drawable id init FUNCTION PRE NM NM N V C gimp +void gimp selection tool class init 
FUNCTION PRE NM NM N V C gimp +void gimp status bar progress canceled FUNCTION PRE NM NM N V C gimp +void gimp value set int32 array FUNCTION PRE N V NM N C gimp +gboolean gimp wire compare FUNCTION PRE N V C gimp +GLConfigAttribs GL Config Attribs CLASS PRE NM NPL C++ ogre +PFNGLINDEXMASKPROC glew Index Mask ATTRIBUTE PRE NM N C blender +PFNGLRENDERMODEPROC glew Render Mode ATTRIBUTE PRE NM N C blender +PFNGLVALIDATEPROGRAMPIPELINEPROC glew Validate Program Pipeline ATTRIBUTE PRE V NM N C ogre +ObjectValue global Scope FUNCTION NM N C++ kdevelop +GlobalsAsMembersTableReaderInfo Globals As Members Table Reader Info CLASS NPL P NPL NM NM N C++ swift +GLXConfigurator GLX Configurator CLASS NM N C++ ogre +int gpencil primitive modal FUNCTION NM NM N C blender +QList group Boxes ATTRIBUTE NM NPL C++ kdevelop +GroupByContext Group By Context CLASS N P N Java elasticsearch +String group Role Attribute PARAMETER NM NM N Java jenkins +string grouping PARAMETER N C++ grpc +void grpc chttp2 mark stream writable FUNCTION PRE PRE V N NM C++ grpc +void grpc json writer value string FUNCTION PRE NM NM NM N C++ grpc +void grpc sock addr make wildcards FUNCTION PRE NM N V NPL C++ grpc +Hpack H pack CLASS NM N Java okhttp +void H Wnd ATTRIBUTE N NM C irrlicht +bool Handle If FUNCTION V CJ C++ ogre +QPointF handle Pos PARAMETER NM N C++ calligra +DenseSet handled Boxes ATTRIBUTE NM NPL C++ swift +HandshakeMode Handshake Mode CLASS NM N Java elasticsearch +bool has Composite Op FUNCTION V NM N C++ calligra +bool has View Relative Texture Coordinate Generation FUNCTION V NM NM NM NM N C++ ogre +bool have Mask DECLARATION V N C++ opencv +HDRListener HDR Listener CLASS NM N C++ ogre +HeatVisionListener Heat Vision Listener CLASS NM NM N C++ ogre +gint height int ATTRIBUTE NM N C gimp +FastVectorHighlighter highlighter PARAMETER N Java elasticsearch +QAction history Action ATTRIBUTE NM N C++ kdevelop +HtmlFile html File PARAMETER NM N C++ calligra +Http1ExchangeCodec Http1 Exchange Codec CLASS NM NM N Java okhttp +int http2 Error Code PARAMETER NM NM N Java okhttp +IllegalArgumentException i ar e PARAMETER NM NM N Java junit4 +IReadFile I Read File CLASS NM NM N C++ irrlicht +SILDeclRef i var Initializer PARAMETER NM NM N C++ swift +gint32 ico load layer FUNCTION PRE V N C gimp +IcyDecoder Icy Decoder CLASS NM N Java Telegram +long id A PARAMETER NM N Java jenkins +int ID LUMNINANCE MIN ATTRIBUTE NM NM N Java Telegram +Map id Mention DECLARATION N NM Java corenlp +GQuark identifier quark DECLARATION NM N C gimp +long idle Delay ATTRIBUTE NM N Java jenkins +IdpConfiguration idp Configuration DECLARATION NM N Java elasticsearch +int idx u DECLARATION NM N C blender +vector im out shape ATTRIBUTE NM NM N C++ opencv +ImageManager Image Manager CLASS NM N C++ blender +void imb stereo3d read interlace FUNCTION PRE NM V N C blender +stbi_uc img buffer original ATTRIBUTE NM N NM C ogre +IplImage[] img stub DECLARATION NM N C++ opencv +FREEIMAGETYPE img Type DECLARATION NM N C++ openFrameworks +String IMPLICIT GROUP KEY DECLARATION NM NM N Java elasticsearch +Grammar import G PARAMETER NM N Java antlr4 +wchart in PARAMETER N C++ bullet3 +float in buffer PARAMETER NM N C openFrameworks +Strategy in Cache DECLARATION P N Java Telegram +VerificationMode inOrderWrappedVerificationMode in Order Wrapped Verification Mode ATTRIBUTE P N NM NM N Java mockito +bool in SCC FUNCTION P N C++ swift +boolean inbound PARAMETER NM Java okhttp +IndexData index Data ATTRIBUTE NM NPL C ogre +String INDEX PREFIX WITH TEMPLATE ATTRIBUTE NM N 
P N Java elasticsearch +int index Tensor PARAMETER NM N C++ opencv +int index To Loc Format ATTRIBUTE N P NM N C bullet3 +Integer[] indexes Of Suspicious Args DECLARATION NPL P NM NPL Java mockito +IntPtr inertial Frame DECLARATION NM N C# bullet3 +InetSocketAddress inet Socket Address ATTRIBUTE NM NM N Java okhttp +Real inflation Leg NPV FUNCTION NM NM N C++ QuantLib +void init For Group FUNCTION V P N Java Telegram +Throwable INITIALIZATION ERROR ATTRIBUTE NM N Java mockito +void Initialize Dual Graph FUNCTION V NM N C++ bullet3 +float inner Alpha ATTRIBUTE NM N Java Telegram +Radian inner Angle PARAMETER NM N C++ ogre +auto inst Results DECLARATION NM NPL C++ swift +InstallUncaughtExceptionHandler Install Uncaught Exception Handler CLASS V NM NM N Java jenkins +QString institution PARAMETER N C++ calligra +string int Hex String PARAMETER NM NM N C++ openFrameworks +Type[] interface Bounds DECLARATION NM NPL Java mockito +Internal Internal CLASS NM Java okhttp +InternalFFMpegRegister Internal FFMpeg Register CLASS NM NM N C++ opencv +Real inv Flight K 2 DECLARATION NM NM N D C++ QuantLib +float inv unit scale DECLARATION NM NM N C blender +Interval INVALID ATTRIBUTE NM Java antlr4 +String INVALID HOST ATTRIBUTE NM N Java okhttp +int INVALID STATE NUMBER ATTRIBUTE NM NM N Java antlr4 +InverseDynamicsExample Inverse Dynamics Example CLASS NM NM N C++ bullet3 +bool Invert Success ATTRIBUTE V N C++ swift +Vector3 inverted Direction DECLARATION NM N C++ ogre +OutputStream ios PARAMETER N Java jenkins +int is a empty DECLARATION V DT N C grpc +boolean is Android FUNCTION V N Java mockito +boolean is Blocked By Shutdown FUNCTION V V P N Java jenkins +BitVector Is Bridged Argument ATTRIBUTE V NM N C++ swift +bool is canon FUNCTION V N C blender +boolean is Conscrypt Preferred FUNCTION V N V Java okhttp +boolean is Dependency Changed FUNCTION V N V Java antlr4 +ScorePhraseMeasures IS FIRST CAPITAL ATTRIBUTE V NM N Java corenlp +boolean is First Frame ATTRIBUTE V NM N Java okhttp +bool is Friend PARAMETER V N C++ kdevelop +bool is Generic Type Disambiguating Token FUNCTION V NM NM NM N C++ swift +bool Is Indirect Result ATTRIBUTE V NM N C swift +bool is Node A Left Child Leaf DECLARATION V N DT NM NM N C++ bullet3 +int is partial ATTRIBUTE V N C opencv +bool Is Return Bridged ATTRIBUTE V N NM C++ swift +bool is Stdlib Module FUNCTION V NM N C++ swift +bool is Sum Supported FUNCTION V N V C++ opencv +bool is Trained DECLARATION V V C++ opencv +IsVariadic is V PARAMETER V NM C++ swift +bool is Vaild PARAMETER V NM C++ ogre +bool is Valid Trailing Closure FUNCTION V NM NM N C++ swift +double items per second DECLARATION NPL P N C++ grpc +JarURLConnection jar URL Connection DECLARATION NM NM N Java jenkins +JFlexDummyLexer JFlex Dummy Lexer CLASS PRE NM N Java corenlp +JntArrayAcc Jnt Array Acc CLASS NM NM N C++ blender +u32 joint Start DECLARATION NM N C++ irrlicht +bool keep Aspect ATTRIBUTE V N C++ calligra +List keep Readability Only On Descendants FUNCTION V NM VM P NPL Java jenkins +QString key PARAMETER N C++ kdevelop +int key Begin DECLARATION NM N Java okhttp +queue key Events Copy DECLARATION NM NM N C++ openFrameworks +KeyFrame key Frame 2 PARAMETER NM N D C++ ogre +Setting KEY PASSWORD PROFILES ATTRIBUTE NM NM NPL Java elasticsearch +KeyStatus Key Status CLASS NM N Java Telegram +Keysym key sym ATTRIBUTE NM N C ogre +map kinects Copy DECLARATION NM N C++ openFrameworks +KoMainWindowPrivate Ko Main Window Private CLASS PRE NM N NM C++ calligra +KoPathPointRemoveCommandPrivate Ko Path Point 
Remove Command Private CLASS PRE NM NM NM N NM C++ calligra +KoRgbU8InvertColorTransformation Ko Rgb U8 Invert Color Transformation CLASS PRE NM NM NM NM N C++ calligra +KoSectionEndPrivate Ko Section End Private CLASS PRE NM N NM C++ calligra +KoShadowStylePrivate Ko Shadow Style Private CLASS PRE NM N NM C++ calligra +ListBase l b layer PARAMETER NM NM N C blender +LabelAction Label Action CLASS NM N C++ calligra +gchar label casefold DECLARATION NM N C gimp +LabelDrawingWidget Label Drawing Widget CLASS NM NM N C++ calligra +int label Op PARAMETER NM N Java antlr4 +LabelElementPair label Pair PARAMETER NM N Java antlr4 +Pattern label Regex ATTRIBUTE NM N Java corenlp +int[] labels PARAMETER NPL C++ opencv +String labels File PARAMETER NM N Java corenlp +HashMap language To Rules Files ATTRIBUTE N P NM NPL Java corenlp +Array last Gradient FUNCTION NM N C++ QuantLib +Size last Saved Step ATTRIBUTE NM NM N C++ QuantLib +bool last Token Was Delete Or Default DECLARATION NM N V N CJ N C++ kdevelop +String LATENCY ARG ATTRIBUTE NM N Java elasticsearch +GimpValueArray layer get composite mode invoker FUNCTION NM V NM NM N C gimp +GeglNode layer mask source node ATTRIBUTE NM NM NM N C gimp +LayerParameter Layer Parameter CLASS NM N C++ opencv +vector layer sizes DECLARATION NM NPL C++ opencv +LayoutData Layout Data CLASS NM N C++ calligra +double[] learned Lop Expert Weights 2 D PARAMETER NM NM NM NPL D N Java corenlp +Lease Lease CLASS N Java jenkins +int led Color PARAMETER NM N Java Telegram +LeftRecursiveRuleFunction Left Recursive Rule Function CLASS NM NM NM N Java antlr4 +int left Sisters Buffer DECLARATION NM NM N Java corenlp +int len r PARAMETER NM N C++ bullet3 +LessDummyGuiHelper Less Dummy Gui Helper CLASS NM NM NM N C++ bullet3 +unsigned lhs Component DECLARATION NM N C++ swift +LightAttenuationValue Light Attenuation Value CLASS NM NM N C++ ogre +vector3df light Dim PARAMETER NM N C++ irrlicht +double line 1 grad DECLARATION NM D N C gimp +int line index mask len PARAMETER NM NM NM N C blender +int line Y DECLARATION NM N C++ calligra +c8[] Link Name ATTRIBUTE NM N C irrlicht +QUrl link URL DECLARATION NM N C++ calligra +ListLevel List Level CLASS NM N C++ calligra +ResultBucket literal Bucket PARAMETER NM N C++ swift +int loaded mentions count DECLARATION NM NM N Java Telegram +LocalRef Local Ref CLASS NM N Java antlr4 +vector locations ATTRIBUTE NPL C++ QuantLib +bool locking PARAMETER V C++ grpc +LogMixedLinearCubicInterpolation Log Mixed Linear Cubic Interpolation CLASS NM NM NM NM N C++ QuantLib +Logger LOGGER ATTRIBUTE N Java jenkins +String[] logger Name Parts DECLARATION NM NM NPL Java jenkins +Real lower Boundary Factor FUNCTION NM NM N C++ QuantLib +int lower tail DECLARATION NM N C++ calligra +String lp Binary Path Name PARAMETER PRE NM NM N Java jenkins +LsmBasisSystem Lsm Basis System CLASS NM NM N C++ QuantLib +LVLCurrency LVL Currency CLASS NM N C++ QuantLib +VcsAnnotation m annotation ATTRIBUTE PRE N C++ kdevelop +b3Scalar m contact Motion 1 ATTRIBUTE PRE NM N D C++ bullet3 +int m count Activities ATTRIBUTE PRE NM NPL Java openFrameworks +bool m execute On Host ATTRIBUTE PRE V P N C++ kdevelop +int m FBO Height ATTRIBUTE PRE NM N Java opencv +uint8 mFirstRenderQueue m First Render Queue ATTRIBUTE PRE NM NM N C++ ogre +b3OpenCLArray m gpu Rays ATTRIBUTE PRE NM NPL C++ bullet3 +bool m is indx present ATTRIBUTE PRE V NM N C++ opencv +AtomicBoolean m is Worker Done ATTRIBUTE PRE V N NM Java openFrameworks +Cursor m last Changed Location ATTRIBUTE PRE NM NM N C++ 
kdevelop +float m Line Dash Offset ATTRIBUTE PRE NM NM N C++ openFrameworks +int m num Visual Shapes Copied ATTRIBUTE PRE NM NM NM N C bullet3 +Mode m paste Mode ATTRIBUTE PRE NM N C++ calligra +char[] m post Fix ATTRIBUTE PRE NM N C bullet3 +boolean m Preview Started ATTRIBUTE PRE N NM Java opencv +MultiBodyTree m reference ATTRIBUTE PRE N C++ bullet3 +UserDataRequestArgs m remove User Data Response Args ATTRIBUTE PRE NM NM NM NM NPL C bullet3 +Resources m Resources ATTRIBUTE PRE NPL Java opencv +streambuf m sbuf ATTRIBUTE PRE N C++ ogre +Pass m Shadow Receiver Pass ATTRIBUTE PRE NM NM N C ogre +Quaternion m Sky Box Orientation ATTRIBUTE PRE NM NM N C ogre +uint8 m Sky Plane Render Queue ATTRIBUTE PRE NM NM NM N C ogre +QStringList m text Types ATTRIBUTE PRE NM NPL C++ kdevelop +b3TransformChangeNotificationArgs m transform Change Args ATTRIBUTE PRE NM NM NPL C bullet3 +ParameterPtr m VS Out Light Position ATTRIBUTE PRE NM NM NM N C ogre +Queue m weaver ATTRIBUTE PRE N C++ kdevelop +int m window Width ATTRIBUTE PRE NM N C++ bullet3 +CodeCompletionWorker m worker ATTRIBUTE PRE N C++ kdevelop +ZoomController m zoom Controller ATTRIBUTE PRE NM N C++ kdevelop +MainClass Main Class CLASS NM N C# grpc +String make HTML Table FUNCTION V NM N Java corenlp +String MANIFEST FILE PREFIX ATTRIBUTE NM NM N Java elasticsearch +guchar mapped color PARAMETER NM N C gimp +vector mapped labels DECLARATION NM NPL C++ opencv +Maps Maps CLASS NPL Java corenlp +void mark As Fetching FUNCTION V P V Java elasticsearch +MaskComponents Mask Components CLASS NM NPL C++ gimp.idents +boolean match By IP FUNCTION V P N Java elasticsearch +ExpectedExceptionMatcherBuilder matcher Builder ATTRIBUTE NM N Java junit4 +boolean matches Any Parent Categories FUNCTION V DT NM NPL Java junit4 +long max Age Seconds Long DECLARATION NM NM NPL NM Java okhttp +sharedptr max basket DECLARATION NM N C++ QuantLib +int max Buffer Size ATTRIBUTE NM NM N C++ opencv +MaxCore Max Core CLASS NM N Java junit4 +int max Draw Buffers FUNCTION NM NM NPL C++ openFrameworks +quint64 max File Open ATTRIBUTE NM N NM C++ kdevelop +sizet max input size DECLARATION NM NM N C++ grpc +int max Intermediate Cas PARAMETER NM NM NPL Java okhttp +int max Requests Per Host PARAMETER NM NPL P N Java okhttp +double max scale f PARAMETER NM NM N C++ opencv +int max width ATTRIBUTE NM N C++ openFrameworks +sizet max Work Group Size ATTRIBUTE NM NM NM N C++ opencv +void maximize All FUNCTION V DT C++ openFrameworks +int media Chunk Index PARAMETER NM NM N Java Telegram +MediaChunkIterator Media Chunk Iterator CLASS NM NM N Java Telegram +uint melanin ofs DECLARATION NM N C blender +MentionDetectionEvaluator Mention Detection Evaluator CLASS NM NM N Java corenlp +String merged Type DECLARATION NM N Java elasticsearch +MetadataSnapshot metadata Snapshot FUNCTION NM N Java elasticsearch +Class method Handles DECLARATION NM NPL Java mockito +auto method index DECLARATION NM N C++ grpc +String[] method Name Prefixes PARAMETER NM NM NPL Java junit4 +double Min Error PARAMETER NM N C++ opencv +int min Font Size ATTRIBUTE NM NM N Java corenlp +int min Fresh PARAMETER NM N Java okhttp +int min Fresh Seconds PARAMETER NM NM NPL Java okhttp +TransducerGraph minimized Random FA DECLARATION NM NM N Java corenlp +MINIMUM_SUPPORTED_VERSION MINIMUM SUPPORTED VERSION CLASS NM NM N Java jenkins +Set missing Classes DECLARATION NM NPL Java elasticsearch +vector mkt Factors PARAMETER NM NPL C++ QuantLib +MockCreationSettings mock Creation Settings PARAMETER NM NM NPL Java mockito 
+MockCreationValidator Mock Creation Validator CLASS NM NM N Java mockito +Method mock Method DECLARATION NM N Java mockito +MockReference mock Ref ATTRIBUTE NM N Java mockito +MockitoAssertionError Mockito Assertion Error CLASS PRE NM N Java mockito +ModificationInterface mod Iface DECLARATION NM N C++ kdevelop +Sezpoz module Finder ATTRIBUTE NM N Java jenkins +ModuleHandler Module Handler CLASS NM N Java mockito +MonoPInvokeCallbackAttribute Mono P Invoke Callback Attribute CLASS NM NM NM NM N C# grpc +int MORE ATTRIBUTE DT Java antlr4 +TsurgeonPattern move RBT surgeon ATTRIBUTE NM NM N Java corenlp +MediaHandler movie Media Handler DECLARATION NM NM N C++ openFrameworks +Mp4Extractor Mp4 Extractor CLASS NM N Java Telegram +MP4Input MP4 Input CLASS NM N Java Telegram +MpegAudioReader Mpeg Audio Reader CLASS NM NM N Java Telegram +mutex mtx ATTRIBUTE N C++ opencv +MultiCubicSpline MultiCubic Spline CLASS NM N C++ QuantLib +MultipleFailureException Multiple Failure Exception CLASS NM NM N Java junit4 +bool multiple Occurences ATTRIBUTE NM NPL C++ calligra +MultiPolygon MultiPolygon CLASS N Java elasticsearch +ThreadMXBean mx Bean DECLARATION NM N Java junit4 +NormalDistribution n d ATTRIBUTE NM N C++ QuantLib +gint32 n layers PARAMETER NM NPL C gimp +int nrepeats n repeats DECLARATION NM NPL C++ opencv +float n Shininess PARAMETER NM N C++ openFrameworks +vector3d n vector PARAMETER NM N C irrlicht +LinearLayout name Container ATTRIBUTE NM N Java Telegram +List named Writeables PARAMETER NM NPL Java elasticsearch +bool Nav Visible ATTRIBUTE N NM C bullet3 +int nbr gaps DECLARATION NM NPL C blender +int nd Formula DECLARATION NM N C++ calligra +int nearest point DECLARATION NM N C blender +bool need fallback DECLARATION V N C blender +guint neighbor pos DECLARATION NM N C gimp +NetStateRuleDefaultTypeInternal Net State Rule Default Type Internal CLASS NM NM NM NM N NM C++ opencv +Object network Security Policy DECLARATION NM NM N Java okhttp +NewAggregateBuilderMap New Aggregate Builder Map CLASS NM NM NM N C++ swift +Exchange new Exchange FUNCTION NM N Java okhttp +MappedFieldType new Field Type PARAMETER NM NM N Java elasticsearch +List new Files PARAMETER NM NPL Java corenlp +gint new image height DECLARATION NM NM N C gimp +int new order DECLARATION NM N C blender +String new Rule Text DECLARATION NM NM N Java antlr4 +SILType new Sil Type DECLARATION NM NM N C++ swift +QList new Strokes ATTRIBUTE NM NPL C++ calligra +VerificationMode new Verification Mode PARAMETER NM NM N Java mockito +long new Warning Header Size DECLARATION NM NM NM N Java elasticsearch +float new Y ATTRIBUTE NM N C++ calligra +QRegularExpression next Fragment Expression DECLARATION NM NM N C++ kdevelop +char next func PARAMETER NM N C blender +int next Giphy Search Offset ATTRIBUTE NM NM NM N Java Telegram +string next Line PARAMETER NM N C++ kdevelop +byte[] next Search DECLARATION NM N Java elasticsearch +ParameterSignature next Unassigned FUNCTION NM NM Java junit4 +bool nla invert combine value FUNCTION NM V NM N C blender +ShaderOutput node find output by name FUNCTION N V N P N C++ blender +NodeShape Node Shape CLASS NM N C++ blender +CSGNoiseSource noise 1 DECLARATION N D C++ ogre +JsonArrayBuilder non Greedy States Builder DECLARATION NM NM NM N Java antlr4 +sizet nonce length ATTRIBUTE NM N C++ grpc +void notify Touch Down FUNCTION V NM N C++ openFrameworks +MockitoException null Passed To Verify No More Interactions FUNCTION N V P V VM DT NPL Java mockito +NullProgram Null Program CLASS NM N C++ ogre 
+Class nullable Class PARAMETER NM N Java junit4 +boolean nulls Ok FUNCTION NPL NM Java junit4 +Num Num CLASS N C# grpc +int num Active Contexts PARAMETER NM NM NPL C++ bullet3 +u16 Num Active Tris ATTRIBUTE NM NM NPL C++ irrlicht +gint num axis events ATTRIBUTE NM NM NPL C gimp +u32 num body parts ATTRIBUTE NM NM NPL C irrlicht +int num cols ATTRIBUTE NM NPL C blender +Size num Factors PARAMETER NM NPL C++ QuantLib +int num Files PARAMETER NM NPL C++ openFrameworks +u32 num groups ATTRIBUTE NM NPL C irrlicht +int num Keys PARAMETER NM NPL Java corenlp +gint num light ATTRIBUTE NM N C gimp +sizet num metadata DECLARATION NM N C++ grpc +int num Outs ATTRIBUTE NM NPL Java openFrameworks +sizet num primes DECLARATION NM NPL C Telegram +int num States DECLARATION NM NPL Java corenlp +int num Tess Face Data ATTRIBUTE NM NM NM N C blender +int16t num Vec Per Segment DECLARATION NM N P N C Telegram +int num Verts In A DECLARATION NM NPL P N C bullet3 +Size number Elementary Vegas ATTRIBUTE NM NM NPL C++ QuantLib +u32 number Of Joysticks DECLARATION N P NPL C++ irrlicht +u16 Number Start ATTRIBUTE NM N C irrlicht +String number Str PARAMETER NM N Java jenkins +QString numbering Path DECLARATION NM N C++ calligra +OAuthSession OAuth Session CLASS NM N Java okhttp +OAuthSessionFactory OAuth Session Factory CLASS NM NM N Java okhttp +ObjectProjection obj Proj DECLARATION NM N C++ swift +Object object With To String FUNCTION N P P N Java junit4 +OdfSymbolType odf Symbol Type ATTRIBUTE NM NM N C++ calligra +OFAndroidLifeCycleHelper OF Android Life Cycle Helper CLASS PRE NM NM NM N Java openFrameworks +OFAndroidObject OF Android Object CLASS PRE NM N Java openFrameworks +OFAndroidSoundPlayer OF Android Sound Player CLASS PRE NM NM N Java openFrameworks +OFAndroidWindow OF Android Window CLASS PRE NM N Java openFrameworks +OFOrientationListener OF Orientation Listener CLASS PRE NM N Java openFrameworks +string of To Binary FUNCTION PRE P N C++ openFrameworks +grpcclosure on connect PARAMETER P N C grpc +void on Group Call Key Sent FUNCTION P NM NM N NM Java Telegram +Void on Implies FUNCTION P V Java jenkins +OnItemLongClickListener on Item Long Click Listener ATTRIBUTE P NM NM NM N Java Telegram +boolean one Document DECLARATION NM N Java corenlp +Notification ongoing Call Notification ATTRIBUTE NM NM N Java Telegram +int OP CODE CONTINUATION ATTRIBUTE NM NM N Java okhttp +OpPool Op Pool CLASS NM N C++ opencv +String[] open Class Tags DECLARATION NM NM NPL Java corenlp +DeclTable operator Method Decls PARAMETER NM NM NPL C++ swift +bool optimize Identity Cast Composition FUNCTION V NM NM N C++ swift +OrderWith order With DECLARATION V P Java junit4 +Request order With FUNCTION V P Java junit4 +List ordered Invocations PARAMETER NM NPL Java mockito +float ori W DECLARATION NM N C++ bullet3 +MockingDetails original Mocking Details ATTRIBUTE NM NM NPL Java mockito +Set original Set PARAMETER NM N Java elasticsearch +long other Data Len Bits ATTRIBUTE NM NM NM NPL Java Telegram +gdouble other side x ATTRIBUTE NM NM N C gimp +char out buf PARAMETER NM N C irrlicht +double[] out d G DECLARATION N NM NM C Telegram +char[] out table PARAMETER NM N C bullet3 +TestRule outer Rule PARAMETER NM N Java junit4 +T output array PARAMETER NM N C++ grpc +OutputDelegatePrivate Output Delegate Private CLASS NM N NM C++ kdevelop +int overlap PARAMETER N C Telegram +auto overriden Function It DECLARATION NM NM N C++ kdevelop +short own flags PARAMETER NM NPL C blender +unzglobalinfo p global info 32 PARAMETER PRE NM N D C 
bullet3 +long packet Sample Count PARAMETER NM NM N Java Telegram +Optional packet sent ATTRIBUTE NM N C grpc +auto PAI Arg DECLARATION NM N C++ swift +GtkWidget paint radio DECLARATION NM N C gimp +bool palette poll FUNCTION N V C blender +sizet palette size PARAMETER NM N C++ bullet3 +ParallelComputer Parallel Computer CLASS NM N Java junit4 +ParameterDef param Def DECLARATION NM N C ogre +Assignments parameter Assignment PARAMETER NM N Java junit4 +ParameterSignature Parameter Signature CLASS NM N Java junit4 +String PARENT ATTRIBUTE N Java jenkins +vector parent field PARAMETER NM N C++ opencv +boolean parent Had Big Change PARAMETER N V NM N Java elasticsearch +List parent Pairs FUNCTION NM NPL Java corenlp +Attribute parse Attribute Def FUNCTION V NM N Java antlr4 +long parse Expires FUNCTION V NPL Java okhttp +ParseJobPrivate Parse Job Private CLASS NM N NM C++ kdevelop +long parse Max Age FUNCTION V NM N Java okhttp +List parserErrors parser Errors ATTRIBUTE NM NPL Java junit4 +bool parsing PARAMETER V C++ calligra +PartDocumentPrivate Part Document Private CLASS NM N NM C++ kdevelop +int parts Size DECLARATION NM N Java elasticsearch +MachineInstContainer Pass Machine Instructions PARAMETER NM NM NPL C ogre +string passphrase DECLARATION N C++ openFrameworks +PatchCoordBuffer Patch Coord Buffer CLASS NM NM N C++ blender +QString path With Native Separators FUNCTION N P NM NPL C kdevelop +PatternsAnnotations Patterns Annotations CLASS NM NPL Java corenlp +Real pd Sum DECLARATION NM N C++ QuantLib +char pem key PARAMETER NM N C++ grpc +bool pen Loaded PARAMETER N V C++ calligra +PhysicsClientExample Physics Client Example CLASS NM NM N C++ bullet3 +int pi Hash PARAMETER PRE N C Telegram +c8 pickup ATTRIBUTE N C irrlicht +PiecewiseConstantAbcdVariance Piecewise Constant Abcd Variance CLASS NM NM NM N C++ QuantLib +s32 pixel Width ATTRIBUTE NM N C irrlicht +string[] platform String DECLARATION NM N C++ openFrameworks +void png do strip channel FUNCTION NM V V N C irrlicht +PNGAPI png get row bytes FUNCTION NM V NM NPL C irrlicht +PNGAPI png get rows FUNCTION NM V NPL C irrlicht +PNGAPI png get unknown chunks FUNCTION NM V NM NPL C irrlicht +void png init palette transformations FUNCTION NM V NM NPL C irrlicht +void png read IDAT data FUNCTION NM V NM N C irrlicht +void pnm load raw pfm FUNCTION NM V NM N C gimp +Point3_ Point 3 CLASS N D C++ opencv +int point index PARAMETER NM N C blender +PointerMap Pointer Map CLASS NM N C++ grpc +String polling Log PARAMETER NM N Java jenkins +PostFile Post File CLASS NM N Java okhttp +PostScriptDocument Post Script Document CLASS NM NM N Java antlr4 +Map pre Map PARAMETER P N Java corenlp +PredicateWrapper Predicate Wrapper CLASS NM N C++ blender +String PREF FONT ATTRIBUTE NM N Java corenlp +aiVector3D present Scaling DECLARATION NM N C++ openFrameworks +void presentation Start From First FUNCTION N V P NM C++ calligra +int prev num hooks DECLARATION NM NM NPL C++ grpc +IntervalSet prev Property ATTRIBUTE NM N Java antlr4 +int prev Signal Bar Count DECLARATION NM NM NM N C++ Telegram +String previous Caption ATTRIBUTE NM N Java Telegram +QModelIndex previous Index PARAMETER NM N C++ calligra +Real previous Initial Value PARAMETER NM NM N C++ QuantLib +bool previous Is Valid PARAMETER N V NM C++ calligra +int previous Stream Id DECLARATION NM NM N Java okhttp +D3D11PRIMITIVETOPOLOGY prim Type DECLARATION NM N C++ ogre +PrintEvents Print Events CLASS NM NPL Java okhttp +int print Features Up to ATTRIBUTE V NPL VM P Java corenlp +PrintLabelFlag 
print label flag PARAMETER NM NM N C++ opencv +boolean print t PARAMETER V N C gimp +ProblemReporterFactory Problem Reporter Factory CLASS NM NM N C++ kdevelop +Process Process CLASS N C++ gimp.idents +sharedptr process Helper FUNCTION NM N C++ QuantLib +Features processing Level DECLARATION NM N C++ kdevelop +Bool progress out DECLARATION V N C irrlicht +bool progressive PARAMETER N C++ blender +ProjectControllerPrivate Project Controller Private CLASS NM N NM C++ kdevelop +QString project file PARAMETER NM N C++ kdevelop +Path projects Dir PARAMETER NM N C++ kdevelop +List promises PARAMETER NPL Java okhttp +String pronoun PARAMETER N Java corenlp +GimpColorProfile proof profile ATTRIBUTE NM N C gimp +IntervalSet property Interval Set PARAMETER NM NM N Java antlr4 +int provider Code DECLARATION NM N Java Telegram +PublishResponse Publish Response CLASS NM N Java elasticsearch +void push Reset Later FUNCTION V N VM Java okhttp +PutWatchRequest put Watch Request PARAMETER NM NM N Java elasticsearch +PyObject pybullet compute View Matrix FUNCTION PRE V NM N C bullet3 +sendrequest q tail ATTRIBUTE NM N C grpc +QRDetect QR Detect CLASS NM N C++ opencv +Quad Quad CLASS N C++ blender +long query Timeout In Ms PARAMETER NM N P NPL Java elasticsearch +GrammarAST question AST PARAMETER NM N Java antlr4 +Quote Quote CLASS N C++ QuantLib +RangeInRevision r PARAMETER N C++ kdevelop +camhdr r hdr DECLARATION NM N C openFrameworks +float radius PARAMETER N C gimp +sharedptr random Walk PARAMETER NM N C++ QuantLib +void rate Pointer PARAMETER NM N C++ bullet3 +Object raw Arguments ATTRIBUTE NM NPL Java mockito +List raw Extra Interfaces DECLARATION NM NM NPL Java mockito +GPUVertBufRaw raw nor DECLARATION NM N C blender +SILValue RC Identity ATTRIBUTE NM N C swift +float rcp len 2 DECLARATION NM N D C bullet3 +ReadBitstream Read Bitstream CLASS V N C++ opencv +void read Element Text Span FUNCTION V NM NM N C++ calligra +ReadBufferOperation read Operation PARAMETER NM N C++ blender +void read Pass FUNCTION V N C++ irrlicht +List read Response FUNCTION V N Java okhttp +FLACbool read subframe FUNCTION V N C Telegram +char Read Text File FUNCTION V NM N C++ ogre +void read White Space FUNCTION V NM N Java corenlp +int reader Flags PARAMETER NM NPL Java mockito +void reapply Filter FUNCTION V N C++ calligra +String received Token Signature DECLARATION NM NM N Java jenkins +ReconstructUpdateCallback Reconstruct Update Callback CLASS NM NM N C++ blender +RecordHeader Record Header CLASS NM N C++ calligra +int recorded Matchers Size DECLARATION NM NM N Java mockito +u32 rectangle Index DECLARATION NM N C++ irrlicht +JSONObject reduced Json DECLARATION NM N Java jenkins +int reduction Indices ATTRIBUTE NM NPL C++ opencv +QPointF ref Point Offset Percent ATTRIBUTE NM NM NM N C++ calligra +Map referee Set Map PARAMETER NM NM N Java jenkins +List reference Index Meta Datas ATTRIBUTE NM NM NM NPL Java elasticsearch +SourceRange Reference Range PARAMETER NM N C++ swift +String REFRESH INTERVAL IN MILLIS ATTRIBUTE NM N P NPL Java elasticsearch +MeanMetric refresh Metric PARAMETER NM N Java elasticsearch +RefutablePatternInitialization Refutable Pattern Initialization CLASS NM NM N C++ swift +void register With Volatility Spread FUNCTION V P NM N C++ QuantLib +f32 relative contrast PARAMETER NM N C++ irrlicht +Int32 rem F DECLARATION NM N C irrlicht +void remap Nearest Neighbor FUNCTION V NM N C++ opencv +Set remote Cluster Names PARAMETER NM NM NPL Java elasticsearch +RemotingDiagnostics Remoting Diagnostics CLASS NM 
NPL Java jenkins +void remove Imported Parent Contexts FUNCTION V NM NM NPL C++ kdevelop +void render result exr file end FUNCTION V NM NM NM N C blender +void Render Text FUNCTION V N C++ bullet3 +auto REPL Module DECLARATION NM N C++ swift +StringSet Replace Text Context ATTRIBUTE NM NM N C++ swift +sizet Replacement Length ATTRIBUTE NM N C swift +void report No Setter Found FUNCTION V DT N V Java mockito +void repress Ref At Loc FUNCTION V N P N C++ swift +uint8t request bytes DECLARATION NM NPL C++ grpc +Map requested Plugins DECLARATION NM NPL Java jenkins +void require Client Auth FUNCTION V NM N Java okhttp +int res Width PARAMETER NM N C++ openFrameworks +vector resamplers ATTRIBUTE NPL C++ Telegram +SILFunction Reserve Fn PARAMETER NM N C++ swift +void reset Meta Class Cache FUNCTION V NM NM N Java jenkins +void Reset Token Stats FUNCTION V NM NPL C Telegram +void resize Linear Open CV FUNCTION V NM NM N C++ opencv +void resize Nearest Neighbor FUNCTION V NM N C++ opencv +ResolvedFailedException Resolved Failed Exception CLASS NM NM N Java jenkins +ResponseHandlers Response Handlers CLASS NM NPL Java elasticsearch +RestClient Rest Client CLASS NM N C++ openFrameworks +boolean resume PARAMETER V Java Telegram +Predicate retain Function PARAMETER NM N Java corenlp +restrict rets PARAMETER NPL C blender +bool return Path PARAMETER NM N C++ irrlicht +Object returned Value ATTRIBUTE NM N Java mockito +ReturnsEmptyValues Returns Empty Values CLASS V NM NPL Java mockito +Answer RETURNS SELF ATTRIBUTE V N Java mockito +void rgb 2 rgb565 FUNCTION N P N C++ opencv +guchar[] rgb real DECLARATION NM N C gimp +void rgbx 2 bgrx FUNCTION N P N C++ opencv +Real risky Annuity ATTRIBUTE NM N C++ QuantLib +RiskyBond Risky Bond CLASS NM N C++ QuantLib +RollingFrictionDemo Rolling Friction Demo CLASS NM NM N C++ bullet3 +PointerRNA root ptr DECLARATION NM N C blender +Vector rot axis PARAMETER NM N C++ blender +gchar rotate desc ATTRIBUTE NM N C gimp +vector3df rotation Per Second PARAMETER N P N C++ irrlicht +ExtendedBounds rounded Bounds DECLARATION NM NPL Java elasticsearch +int row limit ATTRIBUTE NM N C++ grpc +RSComputeOperation RS Compute Operation CLASS NM NM N C++ ogre +RSStencilOperation RS Stencil Operation CLASS NM NM N C++ ogre +ofRtAudioSoundStream rt Stream Ptr DECLARATION NM NM N C++ openFrameworks +RtmSession rtm Session DECLARATION NM N Java okhttp +RuleMemberValidator Rule Member Validator CLASS NM NM N Java junit4 +Set rule Options ATTRIBUTE NM NPL Java antlr4 +RulePropertyRef_ctx Rule Property Ref ctx CLASS NM NM NM N Java antlr4 +RulePropertyRef_start Rule Property Ref start CLASS NM NM NM N Java antlr4 +RuleVersionAttribute Rule Version Attribute CLASS NM NM N C# antlr4 +List rules Of New Chain DECLARATION N P NM N Java junit4 +RunAfterParams Run After Params CLASS V NM NPL Java junit4 +RunBeforeParams Run Before Params CLASS V NM NPL Java junit4 +String RUN DIST CMD PROP ATTRIBUTE NM NM NM N Java corenlp +void run Methods FUNCTION V NPL Java junit4 +ParametersRunnerFactory runner Factory PARAMETER NM N Java junit4 +Runner runner Override ATTRIBUTE NM N Java junit4 +RunnerScheduler Runner Scheduler CLASS NM N Java junit4 +Real running Log Average DECLARATION NM NM N C++ QuantLib +Object runtime Mx Bean ATTRIBUTE NM NM N Java junit4 +string s Tracking System Name DECLARATION PRE NM NM N C++ bullet3 +TerrainLayerSamplerList samplers ATTRIBUTE NPL C ogre +QString sanitize Path FUNCTION V N C++ kdevelop +Dst saturated cast FUNCTION NM N C Telegram +void save As Quadratic Png 
FUNCTION V P NM N C++ calligra +bool save Dual Cells PARAMETER V NM NPL C ogre +char scene Node Type Name PARAMETER NM NM NM N C++ irrlicht +double[] score Pos Prev DECLARATION NM N NM Java corenlp +QHash script Event Action Factories ATTRIBUTE NM NM NM NPL C++ calligra +ScrollIdForNode Scroll Id For Node CLASS NM N P N Java elasticsearch +Map search Profile Results PARAMETER NM NM NPL Java elasticsearch +GimpHueRange secondary range PARAMETER NM N C gimp +SegmenterCoreAnnotations Segmenter Core Annotations CLASS NM NM NPL Java corenlp +SeiReader Sei Reader CLASS NM N Java Telegram +int selected Account ATTRIBUTE NM N Java Telegram +GeglRectangle selection bounds DECLARATION NM NPL C gimp +GtkWidget selection width label ATTRIBUTE NM NM N C gimp +void send Serial Config FUNCTION V NM N C++ openFrameworks +void send String FUNCTION V N Java openFrameworks +List sentence List DECLARATION NM N Java corenlp +QColor separator Color PARAMETER NM N C++ calligra +SerializedForm Serialized Form CLASS NM N Java junit4 +char server list PARAMETER NM N C++ grpc +ServerSafeHandle Server Safe Handle CLASS NM NM N C# grpc +KConfigGroup session Config FUNCTION NM N C++ kdevelop +OAuthSessionFactory session Factory ATTRIBUTE NM N Java okhttp +char session ticket key ATTRIBUTE NM NM N C grpc +void Set Add Faces Points FUNCTION V V NM NPL C bullet3 +Builder set Canonical Mention Begin FUNCTION V NM NM N Java corenlp +void set Custom Uniform 1 f FUNCTION V NM N D NM C++ openFrameworks +clint set Destructor Callback FUNCTION V NM N C++ opencv +void set Display Index FUNCTION V NM N C++ ogre +CreationSettings set Extra Interfaces FUNCTION V NM NPL Java mockito +void set Frame Pen FUNCTION V NM N C++ calligra +void set Invert FUNCTION V N C++ antlr4 +void set Layer Texture Name FUNCTION V NM NM N C++ ogre +Action set Prev Ctx Action DECLARATION NM NM NM N Java antlr4 +void set Project Naming Strategy FUNCTION V NM NM N Java jenkins +Method set Protocol Method PARAMETER NM NM N Java okhttp +void set Tiling FUNCTION V N C++ ogre +SettingManager Setting Manager CLASS NM N C++ irrlicht +ofSoundStreamSettings settings PARAMETER NPL C++ openFrameworks +void setup Bounding Box Vertices FUNCTION V NM NM NPL C++ ogre +sha2void sha256 hash FUNCTION NM N C++ irrlicht +void shader data to shader globals FUNCTION NM NPL P NM NPL C++ blender +ShiftReduceTrainOptions Shift Reduce Train Options CLASS NM NM NM NPL Java corenlp +vector shift Values ATTRIBUTE NM NPL C++ QuantLib +bool show tags ATTRIBUTE V NPL C++ blender +SiblingAlignInfo Sibling Info PARAMETER NM N C++ swift +void silk bw expander 32 FUNCTION NM NM N D C Telegram +auto simple Fn Ty DECLARATION NM NM N C++ swift +SimplePressure Simple Pressure CLASS NM N C++ gimp.idents +void simulate GC FUNCTION V N Java corenlp +float sin rot w DECLARATION NM NM N C++ ogre +SinglePeriodTimeline Single Period Timeline CLASS NM NM N Java Telegram +SINH SINH CLASS N Java elasticsearch +ImVec2 size contents PARAMETER NM NPL C++ bullet3 +long size Guess DECLARATION NM N Java jenkins +ImVec2 size on first use PARAMETER N P NM N C bullet3 +int size Per Span DECLARATION N P N Java Telegram +SKEditorConsumer SK Editor Consumer CLASS NM NM N C++ swift +boolean skip Vetoes ATTRIBUTE NM NPL Java jenkins +boolean skip Whitespace And Commas FUNCTION V N CJ NPL Java okhttp +SlackClient Slack Client CLASS NM N Java okhttp +DataType sort Field Data Type ATTRIBUTE NM NM NM N Java elasticsearch +int sorted Indices Buf PARAMETER NM NM N C++ opencv +uint32t source index DECLARATION NM N C 
openFrameworks +auto source Ty DECLARATION NM N C++ swift +Optional speaker FUNCTION N Java corenlp +int16t speech in PARAMETER NM N C Telegram +gint spline max len ATTRIBUTE NM NM N C gimp +int split Argument List FUNCTION V NM N Java antlr4 +array Sprites ATTRIBUTE NPL C++ irrlicht +int sqlite3 session config FUNCTION PRE N V C Telegram +int sqlite3 Walk Expr List FUNCTION PRE V NM N C Telegram +QFileInfo src File Info DECLARATION NM NM N C++ kdevelop +QGradient src Gradient PARAMETER NM N C++ calligra +int src start idx DECLARATION NM NM N C++ opencv +grpcchannelcredentials ssl creds DECLARATION NM NPL C++ grpc +STGroup st lib ATTRIBUTE NM N Java antlr4 +stack stack ATTRIBUTE N C++ grpc +QList start Dirs PARAMETER NM NPL C++ kdevelop +void start Matched Count Dec FUNCTION V NM N V Java corenlp +int Start Slot PARAMETER NM N C++ ogre +Real start Up Fix Cost PARAMETER NM NM NM N C++ QuantLib +Map state To Grammar Region Map ATTRIBUTE N P NM NM N Java antlr4 +Statement statement ATTRIBUTE N Java junit4 +rect static Rect DECLARATION NM N C++ irrlicht +File status File DECLARATION NM N Java antlr4 +File status File PARAMETER NM N Java antlr4 +QIcon status Icon FUNCTION NM N C++ kdevelop +int Step No ATTRIBUTE NM N C bullet3 +Real step Size DECLARATION NM N C++ QuantLib +int step x DECLARATION NM N C++ opencv +int stmt Close FUNCTION N V C Telegram +StochasticProcess Stochastic Process CLASS NM N C++ QuantLib +Store Store CLASS N Java elasticsearch +QStringList str args DECLARATION NM NPL C++ kdevelop +sizet str array len FUNCTION NM NM N C++ bullet3 +Headers stream Headers DECLARATION NM NPL Java okhttp +float strength PARAMETER N C blender +StrictnessSelector Strictness Selector CLASS NM N Java mockito +GrammarAST strip Left Recursion FUNCTION V NM N Java antlr4 +PyObject Stroke Attribute alpha get FUNCTION NM NM N V C++ blender +StructType Struct Ty ATTRIBUTE NM N C++ swift +StubbingComparator Stubbing Comparator CLASS NM N Java mockito +List stubbingLookupListeners stubbing Lookup Listeners ATTRIBUTE NM NM NPL Java mockito +SUTime SU Time CLASS NM N Java corenlp +vector Sub Module Name Visibility Pairs ATTRIBUTE NM NM NM NM NPL C++ swift +constiterator sub start DECLARATION NM N C++ openFrameworks +String subroutine Slot Name PARAMETER NM NM N C++ ogre +SuiteMethod Suite Method CLASS NM N Java junit4 +SuiteMethodBuilder Suite Method Builder CLASS NM NM N Java junit4 +RealMethod super Method PARAMETER NM N Java mockito +Class supplier Class PARAMETER NM N Java junit4 +XIMStyle supported Style DECLARATION NM N C++ irrlicht +bool suppress File PARAMETER V N C++ ogre +int sz Joint Ranges DECLARATION NM NM NPL C bullet3 +QStyleOptionTab tab Overlap DECLARATION NM N C++ kdevelop +gint table 2 id PARAMETER N D N C gimp +QSet tagged Resources DECLARATION NM NPL C++ calligra +ATNState target PARAMETER N C++ antlr4 +char target chars DECLARATION NM NPL C grpc +void target Started FUNCTION N V Java jenkins +TaskImpl Task Impl CLASS NM N Java jenkins +uint32t tbl index PARAMETER NM N C++ grpc +Object tcp Slave Agent Listener Lock ATTRIBUTE NM NM NM NM N Java jenkins +int TEGRA MORPH INIT FUNCTION PRE N V C++ opencv +DeclAttributes temp Attrs DECLARATION NM NPL C++ swift +TempCompMask Temp Comp Mask CLASS NM NM N C++ gimp.idents +TemperatureCauchy1D Temperature Cauchy 1 D CLASS NM N D NM C++ QuantLib +TemplatePreviewIconData Template Preview Icon Data CLASS NM NM NM NPL C++ kdevelop +TemporaryFolder Temporary Folder CLASS NM N Java junit4 +void tessellate To Mesh FUNCTION V P N C++ openFrameworks +int 
tex ID PARAMETER NM N Java openFrameworks +stringt text Chopped 2 DECLARATION N NM D C swift +TextPaintView Text Paint View CLASS NM NM N Java Telegram +InvocationOnMock the Invocation PARAMETER DT N Java mockito +AssignExpr Then ATTRIBUTE N C++ swift +int thread Array Size DECLARATION NM NM N Java junit4 +void throw Provision Exception If Errors Exist FUNCTION V NM N CJ NPL V Java elasticsearch +double tick Freq PARAMETER NM N C++ opencv +uint32 TIFF Current Tile FUNCTION NM NM N C opencv +TIFFSizeProc TIFF Get Size Proc FUNCTION NM V NM N C opencv +TileParameterDefaultTypeInternal Tile Parameter Default Type Internal CLASS NM NM NM N NM C++ opencv +float time range PARAMETER NM N C blender +TimeSignalCommand Time Signal Command CLASS NM NM N Java Telegram +Timespec time spec ATTRIBUTE NM N C# grpc +Timelapser Timelapser CLASS N C++ opencv +Cancellable timeout Task PARAMETER NM N Java elasticsearch +int times To Append Last Matcher PARAMETER NPL P V NM N Java mockito +vector tlv Symbols ATTRIBUTE NM NPL C++ swift +float[] tmp vec DECLARATION NM N C blender +ToStringWalker To String Walker CLASS P N N C++ grpc +double[] to XYZ DECLARATION P N C bullet3 +JSONObject token Data DECLARATION NM N Java jenkins +TokenPropertyRef Token Property Ref CLASS NM NM N Java antlr4 +TokenPropertyRef_channel Token Property Ref channel CLASS NM NM NM N Java antlr4 +Map token Store Typed Data DECLARATION NM NM NM N Java jenkins +TokenTypeDecl Token Type Decl CLASS NM NM N Java antlr4 +String token Type S DECLARATION NM NM N Java antlr4 +Token token Within Action PARAMETER N P N Java antlr4 +Position tool View Position FUNCTION NM NM N C++ kdevelop +bool Top Dir PARAMETER NM N C++ grpc +TopNGramRecord Top NGram Record CLASS NM NM N Java corenlp +int totchannel tot channel ATTRIBUTE NM N C blender +sizet tot elem ATTRIBUTE NM N C blender +CounterMetric total Merge Throttled Time ATTRIBUTE NM NM NM N Java elasticsearch +CommodityUnitCost trade Price ATTRIBUTE NM N C++ QuantLib +Builder training Examples FUNCTION NM NPL Java corenlp +Real tranched Loss After DECLARATION NM N P C++ QuantLib +PathInfo transform Path FUNCTION V N C++ QuantLib +Affine3 transform Unique Id DECLARATION NM NM N C++ ogre +TransportShardRefreshAction Transport Shard Refresh Action CLASS NM NM NM N Java elasticsearch +TreeElement Tree Element CLASS NM N C++ blender +TreePostScriptGenerator Tree Post Script Generator CLASS NM NM NM N Java antlr4 +bool trim Parse Trees DECLARATION V NM NPL C# antlr4 +TsurgeonParseException Tsurgeon Parse Exception CLASS NM NM N Java corenlp +Treebank tune Treebank DECLARATION NM N Java corenlp +TupleLValueEmitter Tuple LValue Emitter CLASS NM NM N C++ swift +void two Factor Response FUNCTION NM NM N C++ kdevelop +char txt alias DECLARATION NM N C grpc +int TYPE TEST RULE ATTRIBUTE NM NM N Java junit4 +byte TYPE WINDOW UPDATE ATTRIBUTE N NM NM Java okhttp +T[] typed Array PARAMETER NM N Java mockito +int ui Index DECLARATION NM N C++ ogre +BytesReference uncompress If Needed FUNCTION V CJ V Java elasticsearch +Handle underlying Fx Correlation PARAMETER NM NM N C++ QuantLib +Money undiscounted Amount PARAMETER NM N C++ QuantLib +String UNKNOWN USERNAME ATTRIBUTE NM N Java jenkins +void unload Textures FUNCTION V NPL C++ openFrameworks +void unpack texture Blend Func FUNCTION V NM NM N C irrlicht +requestmatcher unregistered request matcher ATTRIBUTE NM NM N C++ grpc +SoloFilePathFilter UNRESTRICTED ATTRIBUTE NM Java jenkins +GeglRectangle update area DECLARATION NM N C gimp +void update Mouse Pos FUNCTION V 
NM N C++ calligra +HttpUrl url From Json FUNCTION N P N Java okhttp +int usage PARAMETER N C++ openFrameworks +bool use Atm Spread ATTRIBUTE V NM N C++ QuantLib +bool use mat dirs DECLARATION V NM NPL C++ irrlicht +bool use Shadows 1 PARAMETER V NPL D C++ bullet3 +bool use Tabs PARAMETER V NPL C++ kdevelop +User2InternalIndex User 2 Internal Index CLASS N P NM N C++ bullet3 +void user hook 3 ATTRIBUTE NM N D C blender +long utc Timestamp Ms DECLARATION NM NM NPL Java Telegram +sizet utf8 size PARAMETER NM N C++ grpc +Guid uuid ATTRIBUTE N C# antlr4 +float v proj axis DECLARATION NM NM N C blender +StringTokenizer v Tok DECLARATION NM N Java elasticsearch +void V URL Encode FUNCTION PRE N V C++ bullet3 +uint val 32 ATTRIBUTE N D C++ opencv +sizet Val Size DECLARATION NM N C++ grpc +void validate Class Rules FUNCTION V NM NPL Java junit4 +u32 validate On PARAMETER V P C++ irrlicht +void validate Public Static Void Methods FUNCTION V NM NM NM NPL Java junit4 +List validator Strategies ATTRIBUTE NM NPL Java junit4 +Value Value CLASS N C++ Telegram +String value Count String DECLARATION NM NM N Java okhttp +QHash value Hash ATTRIBUTE NM N C++ calligra +ValueLabel Value Label CLASS NM N Java corenlp +Real value X DECLARATION NM N C++ QuantLib +VanillaForwardPayoff Vanilla Forward Payoff CLASS NM NM N C++ QuantLib +Object[] var Args DECLARATION NM NPL Java mockito +vector variables ATTRIBUTE NPL C++ QuantLib +Set vary Fields DECLARATION NM NPL Java okhttp +VerificationModeFactory Verification Mode Factory CLASS NM NM N Java mockito +VerificationOverTimeImpl verification Over Time DECLARATION N P N Java mockito +void verification Started FUNCTION N V Java mockito +VerificationStrategy verification Strategy ATTRIBUTE NM N Java mockito +emailkeymapping verifier get mapping FUNCTION N V N C++ grpc +VertexPosition Vertex Position CLASS NM N C++ bullet3 +VideoCapture_DShow Video Capture DShow CLASS NM NM N C++ opencv +freenectchunkcb video chunk cb ATTRIBUTE NM NM N C openFrameworks +void visit SIL Argument FUNCTION V NM N C++ swift +VoronoiFractureDemo Voronoi Fracture Demo CLASS NM NM N C++ bullet3 +int w Width ATTRIBUTE NM N C++ ogre +long wait Until DECLARATION V P Java jenkins +Set waiting List PARAMETER NM N Java jenkins +int want x PARAMETER V N C openFrameworks +WarningFailureException Warning Failure Exception CLASS NM NM N C++ openFrameworks +s32 wat id DECLARATION NM N C irrlicht +AtomicInteger weak Ref Lost ATTRIBUTE NM NM N Java jenkins +WebSocketListener Web Socket Listener CLASS NM NM N Java okhttp +vector weights Multipliers ATTRIBUTE NM NPL C++ opencv +StringPiece whole regexp ATTRIBUTE NM N C++ grpc +void widget Destroyed FUNCTION N V C++ kdevelop +boolean will Return Last Parameter PARAMETER V V NM N Java mockito +int win Error DECLARATION NM N C++ ogre +Challenge with Charset FUNCTION P N Java okhttp +MakeCms with Cms Leg Rule FUNCTION P NM NM N C++ QuantLib +ModelSettings with Market Rate Accuracy FUNCTION P NM NM N C++ QuantLib +Settings with Rate Bound FUNCTION P NM N C++ QuantLib +MockResponse with Web Socket Upgrade FUNCTION P NM NM N Java okhttp +gboolean within vertically DECLARATION P VM C gimp +WordLemmaTag word Lemma Tag PARAMETER NM NM N Java corenlp +void worker Thread Wait FUNCTION NM N V C++ bullet3 +WorkspaceFileMask Workspace File Mask CLASS NM NM N Java jenkins +vec3 world To Screen FUNCTION N P N C++ openFrameworks +int worst score DECLARATION NM N C opencv +RunListener wrap If Not Thread Safe FUNCTION V CJ VM NM N Java junit4 +WrapperType wrapped Verification 
PARAMETER NM N Java mockito +Class wrapper Class DECLARATION NM N Java junit4 +WriteContext Write Context CLASS NM N C++ grpc +void write Node Materials FUNCTION V NM NPL C++ irrlicht +int write Root PARAMETER V N C++ irrlicht +void write tcp data FUNCTION V NM N C grpc +WSDLSSolver WSDLS Solver CLASS NM N C++ blender +btScalar x 2 DECLARATION N D C++ bullet3 +float x Distance DECLARATION NM N Java openFrameworks +Atom X dnd Type List ATTRIBUTE NM NM NM N C blender +short x origin ATTRIBUTE NM N C bullet3 +f32 X scale ATTRIBUTE NM N C irrlicht +int x Tilt FUNCTION NM N C++ calligra +Real y In PARAMETER NM N C++ ogre +S32 ya Bottom ATTRIBUTE NM N C calligra +SmallVectorImpl yield MVs PARAMETER NM NPL C++ swift +freenectzeroplaneinfo z p i PARAMETER NM NM N C openFrameworks +double[] z probs PARAMETER NM NPL Java corenlp +Real z weight ATTRIBUTE NM N C++ QuantLib +int zero plane res ATTRIBUTE NM NM N C openFrameworks +int[] zoom x y PARAMETER NM N N C blender +T a PARAMETER N C++ drill +char a 0 PARAMETER N D C rigraph +int a 3 DECLARATION N D C toggldesktop +int a cap PARAMETER NM N C rigraph +int a Change PARAMETER PRE N C ccv +asn1_ctx_t a ctx PARAMETER NM N C wireshark +int a len PARAMETER NM N C naemon-core +u8 a light PARAMETER DT N C++ freeminer +int a low PARAMETER NM N C rigraph +u8 a Old Record 1 PARAMETER PRE NM N D C ccv +sqlite3_value** a Replace PARAMETER PRE V C ccv +Throwable a Throwable PARAMETER DT NM Java Spark +long a time PARAMETER NM N Java drill +ovsdb_type a type PARAMETER NM N C ovs +Void a Void PARAMETER DT N Java immutables +bool above Base PARAMETER P N C++ proxygen +class Abstract SV 2 Copier CLASS NM N D N Java drill +void add before forward FUNCTION V P N C++ caffe +customvariablesmember* add custom variable to service FUNCTION V NM N P N C naemon-core +void add Menu For List Nodes FUNCTION V N P NM NPL Java Spark +bool add no exist PARAMETER V DT V C++ s3fs-fuse +bool add no truncate cache PARAMETER V DT NM N C++ s3fs-fuse +int add parent to host FUNCTION V N P N C naemon-core +int add temp to args DECLARATION V N P NPL C weechat +void adjust to camera FUNCTION V P N C++ panda3d +uLong adler 1 DECLARATION N D C mgba +long after DECLARATION P Java drill +auto after 1 DECLARATION P D C++ meta +auto after 2 DECLARATION P D C++ meta +boolean after Equals DECLARATION P N Java Openfire +void after Filters Closed FUNCTION P NPL V Java Smack +boolean after First Batch ATTRIBUTE P NM N Java drill +request after handle DECLARATION P N C crow +void after Join Send History FUNCTION P V V N Java Openfire +void after Last FUNCTION P DT Java drill +boolean after Last Row ATTRIBUTE P NM N Java drill +ClassToInstanceMap after Processing ATTRIBUTE P V Java immutables +gboolean after release ATTRIBUTE P N C wireshark +ClassToInstanceMap after Round ATTRIBUTE P N Java immutables +ebb_after_write_cb after write cb ATTRIBUTE P V N C ccv +AfterXStanzas after X Stanzas ATTRIBUTE P D NPL Java Smack +off_t alias off ATTRIBUTE NM N C ccv +Set all Annotated Elements FUNCTION DT NM NPL Java immutables +EnumSet all Casts DECLARATION DT NPL Java drill +boolean all Cols Indexed PARAMETER DT NPL V Java drill +Map all Drill bits DECLARATION DT NM NPL Java drill +vector all Errors DECLARATION DT NPL C++ toggldesktop +List all Exprs DECLARATION DT NPL Java drill +Set all Fields ATTRIBUTE DT NPL Java drill +boolean all Final PARAMETER DT N Java immutables +List all Labels ATTRIBUTE DT NPL Java deeplearning4j +List all Methods PARAMETER DT NPL Java cglib +Set all Metrics DECLARATION DT NPL Java 
drill +ImmutableList all Mirrors FUNCTION DT NPL Java immutables +String all Names PARAMETER DT NPL Java cglib +Set all New Schema Paths FUNCTION DT NM NM NPL Java drill +List all Open Workspaces FUNCTION DT NM NPL Java deeplearning4j +Iterable all Options DECLARATION DT NPL Java drill +List all Pools PARAMETER DT NPL Java drill +boolean all Procedures Are Callable FUNCTION DT NPL V NM Java drill +String all Projects ATTRIBUTE DT NPL Java deeplearning4j +List all Room Names DECLARATION DT NM NPL Java Openfire +Map all Service Response PARAMETER DT NM N Java drill +gboolean all set DECLARATION DT N C wireshark +bool all space after DECLARATION DT N P C crow +bool all space before DECLARATION DT N P C crow +KeyStore all Store ATTRIBUTE DT N Java Spark +void all Streams Finished FUNCTION DT NPL V Java drill +int* all synced FUNCTION DT V C ovs +bool all Users PARAMETER DT NPL C++ facebook-repo-ds2 +ELoginRegister allow login or register ATTRIBUTE V V CJ V C freeminer +int among PARAMETER P C++ rigraph +i64 an Size PARAMETER NM N C ccv +checkout_conflictdata ancestor out PARAMETER NM N C git2r +Object and DECLARATION CJ Java immutables +DruidFilter and Filter At Index FUNCTION NM N P N Java drill +AndNode and Node PARAMETER NM N Java drill +byte another ID PARAMETER DT N Java Openfire +bool any diffuse ATTRIBUTE DT N C panda3d +primitive any hidden PARAMETER DT NM C++ panda3d +String APPLICATION INFO 2 ATTRIBUTE NM N D Java Spark +class are Unities in Shape CLASS V NPL P N Java deeplearning4j +Control arg 1 PARAMETER N D Java Openfire +DoublePointer arg 18 PARAMETER N D Java deeplearning4j +long arg 26 PARAMETER N D Java deeplearning4j +long arg 29 PARAMETER N D Java deeplearning4j +long arg 31 PARAMETER N D Java deeplearning4j +boolean as Array ATTRIBUTE P N Java drill +DCArrayParameter* as array parameter FUNCTION P NM N C++ panda3d +int as binary ATTRIBUTE P N C rigraph +BoundingBox* as bounding box FUNCTION P NM N C++ panda3d +char* as C String FUNCTION P NM N C++ freeminer +AnnotationMirror as Caching FUNCTION P V Java immutables +Object as Diamond DECLARATION P N Java immutables +double[] as Double FUNCTION P N Java deeplearning4j +DCField* as field FUNCTION P N C++ panda3d +git_diff_file as file PARAMETER P N C git2r +Function as Function ATTRIBUTE P N Java guava +CPPFunctionType* as function type FUNCTION P NM N C++ panda3d +ASIdentifiers as id PARAMETER NM N C toggldesktop +int as in ATTRIBUTE P P C rigraph +Int64 as Int64 FUNCTION P N C++ toggldesktop +long as Long DECLARATION P N Java guava +MapWriter as Map FUNCTION P N Java drill +DCMolecularField* as molecular field FUNCTION P NM N C++ panda3d +double as of PARAMETER P P C panda3d +void* as pointer FUNCTION P N C++ panda3d +ByteBuf as Read Only FUNCTION P NM VM Java drill +DCSimpleParameter* as simple parameter FUNCTION P NM N C++ panda3d +List as Sorted Entry List FUNCTION P NM NM N Java Singularity +String as String DECLARATION P N Java immutables +List as Stripes DECLARATION P NPL Java guava +Expression as Transform Generator Transform FUNCTION P NM NM N Java immutables +V as V DECLARATION P N Java guava +Var as Var FUNCTION P N C++ toggldesktop +int as warning PARAMETER P N C libxo +int as within ATTRIBUTE P P C rigraph +Writer as Writer FUNCTION P N Java guava +boolean at Least One Write ATTRIBUTE P DT D V Java drill +R at Most FUNCTION P DT Java immutables +int atalk len FUNCTION N NM C wireshark +int b 1 Index PARAMETER N D N Java drill +int B 1110 ATTRIBUTE N D Java deeplearning4j +byte b 3 PARAMETER N D Java guava 
+Indexer b float 16 Indexer DECLARATION PRE N D N Java deeplearning4j +double b float 16 To Double FUNCTION PRE N D P N Java deeplearning4j +bool b Force 16 bpp PARAMETER PRE V D N C++ panda3d +int b next DECLARATION N DT C git2r +svec b only PARAMETER N VM C ovs +int b Stat 1 ATTRIBUTE PRE N D C ccv +uint64_t bad only DECLARATION NM VM C toxcore +class Base Level 1 CLASS NM N D Java deeplearning4j +class Base Level 3 CLASS NM N D Java deeplearning4j +uint8_t be32 ofs PARAMETER NM N C ovs +long before DECLARATION P Java drill +void before Execute FUNCTION P V Java cglib +boolean before First ATTRIBUTE P DT Java drill +int before major ATTRIBUTE P N C++ panda3d +int before minor ATTRIBUTE P N C++ panda3d +off_t behind rem start DECLARATION P NM N C++ s3fs-fuse +off_t behind size PARAMETER P N C++ s3fs-fuse +off_t behind start PARAMETER P N C++ s3fs-fuse +LVecBase4 bi 0 DECLARATION N D C++ panda3d +class Bind 2 Module CLASS V D N Java Smack +int bit Field 0 ATTRIBUTE NM N D Java drill +ssize_t bit off DECLARATION NM N C++ facebook-repo-ds2 +int bits per pixel ATTRIBUTE NPL P N C panda3d +map blob name to last top idx DECLARATION NM N P DT NM N C++ caffe +String BLOCK CONTACT 16 x 16 ATTRIBUTE NM N D P D Java Spark +u32 block count all DECLARATION NM N DT C++ freeminer +void body 0 PARAMETER N D C panda3d +boolean both Empty Selection DECLARATION DT NM N Java drill +boolean both NonEmpty Selection DECLARATION DT NM N Java drill +string both Or All FUNCTION DT CJ DT C++ freeminer +UInt32 Bt3Zip Match Finder Get Matches FUNCTION NM NM N V NPL C mgba +bool btn down for dig ATTRIBUTE NM VM P N C++ freeminer +JButton btn save DECLARATION NM N Java Spark +char buf 3 DECLARATION N D C naemon-core +char buf out DECLARATION NM N C naemon-core +X buff Ptr 2 ATTRIBUTE NM N D C++ deeplearning4j +char buffer as string ATTRIBUTE N P N C weechat +char buffer out PARAMETER NM N C git2r +void build Schema For 2Dimensional Dataset FUNCTION V N P NM N Java drill +String by ATTRIBUTE P Java Smack +class Bypass Comparison 8192 x 8192 CLASS NM N D P D Java deeplearning4j +byte byte I Plus 1 DECLARATION NM N P D Java drill +long bytes in PARAMETER NPL NM Java drill +long bytes out PARAMETER NPL NM Java drill +int64 bytes to read DECLARATION NPL P V C++ deeplearning4j +class C Matrix 33 CLASS PRE N D C++ freeminer +SColor c outside DECLARATION PRE P C++ freeminer +igraph_vector_t c partition 2 DECLARATION NM N D C rigraph +int c receive only ATTRIBUTE PRE V VM C ovs +class C Vector 3 CLASS PRE N D C++ freeminer +char c where DECLARATION PRE VM C weechat +Dtype caffe next after FUNCTION PRE DT P C++ caffe +int caps 2 ATTRIBUTE N D C++ panda3d +gboolean cb service in host group each host FUNCTION NM N P NM N DT N C naemon-core +void* ccv atan 2 FUNCTION PRE N D C ccv +void* ccv cnnp batch norm add to output FUNCTION PRE PRE PRE PRE V P N C ccv +ccv_numeric_data_t* ccv get sparse matrix cell from vector FUNCTION PRE V NM NM N P N C ccv +void ccv nnc insert if prior to any FUNCTION PRE PRE V CJ NM P DT C ccv +ccv_nnc_tensor_t* ccv nnc tensor for while count FUNCTION PRE PRE N P NM N C ccv +ovs_list change set for tables ATTRIBUTE NM N P NPL C ovs +char chars 1 PARAMETER NPL D C weechat +guint chars per unit ATTRIBUTE NPL P N C wireshark +String CHAT COBROWSE IMAGE 24 x 24 ATTRIBUTE NM NM N D P D Java Spark +int check against known hosts FUNCTION V P NM NPL C git2r +internal_function* check arrival add next nodes FUNCTION V N V DT NPL C git2r +bool check content only PARAMETER V N VM C++ s3fs-fuse +void check for host 
flapping FUNCTION V P N V C naemon-core +bool check last arg FUNCTION V DT N C++ panda3d +int* checkout action wd only FUNCTION V NM N VM C git2r +int* checkout create the new FUNCTION PRE V DT NM C git2r +void clear all markers FUNCTION V DT NPL C++ rigraph +Builder clear Part 1 FUNCTION V N D Java drill +bool close fd when done PARAMETER V N VM V C git2r +char cmd 1 PARAMETER N D C weechat +int col 1 PARAMETER N D C++ deeplearning4j +LongType col Stride 1 DECLARATION NM N D C++ deeplearning4j +List column Statistics V 1s DECLARATION NM NPL NM D Java drill +ColumnTypeMetadata_v4 column Type Metadata v 4 DECLARATION NM NM N NM D Java drill +char command 2 DECLARATION N D C weechat +int commit on success PARAMETER V P N C git2r +int conditional match on branch FUNCTION NM N P N C git2r +List conjuncts 1 DECLARATION NPL D Java drill +conn conn in PARAMETER NM N C ovs +DrillConnectionImpl connection 1 DECLARATION N D Java drill +Contents contents 2 PARAMETER NPL D C++ panda3d +ccv_cnnp_model_t conv 0 DECLARATION N D C ccv +INDArray conv 2D FUNCTION N NM Java deeplearning4j +int conv in channels ATTRIBUTE NM NM NPL C++ caffe +int conv out channels ATTRIBUTE NM NM NPL C++ caffe +int conv out spatial dim ATTRIBUTE NM NM NM N C++ caffe +void conv rgba4444 FUNCTION V N C++ panda3d +int convert to 8 bit PARAMETER V P D N C mgba +int CONVERT TO UINT4 LENGTH ATTRIBUTE V P NM N Java drill +bool* copy primitives from FUNCTION V NPL P C++ panda3d +bool copy this file FUNCTION V DT N C++ panda3d +DrillCostBase cost 1 DECLARATION N D Java drill +int count 1 PARAMETER N D C git2r +uint32_t count 32 DECLARATION N D C weechat +class Cout Stream CLASS NM N C++ freeminer +float cp 0 DECLARATION N D C ccv +ContentParamType2 cpt 2 DECLARATION N D C++ freeminer +class Cropping 1D CLASS N NM Java deeplearning4j +ConvertSupport cs 2 PARAMETER N D C++ drill +LdapContext ctx 2 DECLARATION N D Java Openfire +class CuDNN Deconvolution Layer CLASS PRE NM N C++ caffe +curandGenerator_t curand generator FUNCTION NM N C++ caffe +WhichMemory data or diff PARAMETER N CJ N C++ caffe +SelectionVector4 data Sv 4 PARAMETER NM N D Java drill +Decimal38DenseWriter decimal 38 Dense FUNCTION N D NM Java drill +int DECIMAL 38 DENSE VALUE ATTRIBUTE NM D NM N Java drill +Class declared Type 1 PARAMETER NM N D Java guava +DrillBuf decompress Page V 1 FUNCTION V N NM D Java drill +class Deconvolution 2D CLASS N NM Java deeplearning4j +class Deconvolution 3D CLASS N NM Java deeplearning4j +class Deconvolution 3D Param Initializer CLASS NM NM NM N Java deeplearning4j +class Deconvolution Layer CLASS NM N C++ caffe +class Deconvolution Param Initializer CLASS NM NM N Java deeplearning4j +String default S3 Bucket ATTRIBUTE NM NM N Java Singularity +class Depthwise Convolution 2D CLASS NM N NM Java deeplearning4j +int description 2 ATTRIBUTE N D C weechat +int DESTINATION OPTIONS V6 ATTRIBUTE NM NPL NM Java drill +bool Destroy Usr 1 Handler FUNCTION V N D N C++ s3fs-fuse +int* dissect acdr ip or other FUNCTION V NM N CJ N C wireshark +PN_stdfloat dist 2 ATTRIBUTE N D C++ panda3d +igraph_integer_t distance 12 PARAMETER N D C rigraph +double dl now PARAMETER V VM C++ s3fs-fuse +class DL4J Invalid Input Exception CLASS PRE NM NM N Java deeplearning4j +float32x4_t dn 1 x 2 DECLARATION N D P D C ccv +bool do adjust this size FUNCTION V V DT N C++ panda3d +void do all sorted fn FUNCTION V DT V N C toggldesktop +void dof reg handoff dpp 0 FUNCTION PRE V NM N D C wireshark +uint32_t dot3 ad Agg Port Attached Agg ID ATTRIBUTE PRE PRE PRE PRE NM NM N C 
ovs +Icon down Icon ATTRIBUTE NM N Java Spark +int down time ATTRIBUTE P N C freeminer +String downsample ATTRIBUTE V Java drill +int dp if create and open FUNCTION PRE CJ V CJ V C ovs +int dp if index ATTRIBUTE PRE NM N C ovs +int* dps for each FUNCTION N P DT C ovs +OpenFlags ds 2 Flags PARAMETER N D NPL C++ facebook-repo-ds2 +int ds last FUNCTION PRE DT C ovs +PandaNode* dupe for flatten FUNCTION V P V C++ panda3d +GUID DX7 Device GUID ATTRIBUTE NM NM N C panda3d +E e 3 PARAMETER N D Java guava +E e 8 PARAMETER N D Java guava +void each seen event FUNCTION DT NM N C++ meta +igraph_vector_int_t edge color 2 PARAMETER NM N D C rigraph +igraph_vector_t edge map 2 PARAMETER NM N D C rigraph +igraph_inclist_t edges per node PARAMETER NPL P N C rigraph +ASTNodeInfo else Info DECLARATION NM N C++ cling +Stmt else Replacement DECLARATION NM N C++ cling +MinorType else Type DECLARATION NM N Java drill +boolean enable Push down ATTRIBUTE V V P Java drill +int ENCAPSULATING SECURITY V6 ATTRIBUTE NM N NM Java drill +unsigned encode only PARAMETER V VM C libxo +int end for DECLARATION V N C freeminer +char* end of record FUNCTION N P N C git2r +DrillbitEndpoint endpoint 1 PARAMETER N D Java drill +DrillbitEndpoint endpoint 2 PARAMETER N D Java drill +boolean enough Memory FUNCTION DT N Java drill +Object entry 1 PARAMETER N D Java guava +Client_data entry 2 DECLARATION N D C toxcore +git_tree_entry entry out PARAMETER NM N C git2r +char* Err no FUNCTION NM N C++ facebook-repo-ds2 +void* error 2 FUNCTION N D C rigraph +boolean error On 400 ATTRIBUTE N P D Java drill +auto even DECLARATION NM C++ meta +int even dist PARAMETER NM N C ccv +bool even split PARAMETER NM N C++ meta +JsonParseException ex 1 PARAMETER N D Java drill +Except except PARAMETER N Java drill +__int64 exit 64 DECLARATION N D C++ panda3d +long expand in ATTRIBUTE V N C toggldesktop +CODE* expression 7 FUNCTION N D C rigraph +CODE* expression 8 FUNCTION N D C rigraph +bool extend by hexahedron FUNCTION V P N C++ panda3d +PackOutFunc f out ATTRIBUTE N NM C++ freeminer +double f1 score FUNCTION NM N C++ meta +int fan in DECLARATION N P C++ caffe +uint64_t features per class DECLARATION NPL P N C meta +_NXMapTable field 37 ATTRIBUTE N D C toggldesktop +int field 4 ATTRIBUTE N D C toggldesktop +int field 63 ATTRIBUTE N D C toggldesktop +class File 2 Page App CLASS N P NM N C++ toggldesktop +int file no PARAMETER NM N C++ facebook-repo-ds2 +NodePathCollection find all matches FUNCTION V DT NPL C++ panda3d +TextureCollection* find all textures FUNCTION V DT NPL C++ panda3d +int flag true if should convert PARAMETER N NM CJ V V C panda3d +unsigned flag within ATTRIBUTE N P C git2r +int FLOAT4 VALUE ATTRIBUTE NM N Java drill +Reporter for Annotation FUNCTION NM N Java immutables +boolean for Attribute ATTRIBUTE NM N Java immutables +linear_model for avg DECLARATION P N C++ meta +boolean for Backprop PARAMETER P N Java deeplearning4j +TypeDescriptor for Class FUNCTION P N Java drill +DrillConfig for Client FUNCTION P N Java drill +Visibility for Implementation FUNCTION P N Java immutables +void* for num FUNCTION NM N C freeminer +MinorType for Number FUNCTION P N Java drill +DeclaringPackage for Package FUNCTION NM N Java immutables +Set for Resource FUNCTION P N Java drill +ProxyInfo for Socks4 Proxy FUNCTION P NM N Java Smack +boolean for Unknown Schema PARAMETER P NM N Java drill +bool force fog off ATTRIBUTE V N VM C++ freeminer +bool force nd im 2 col ATTRIBUTE V NM N P N C++ caffe +vector forward time per layer DECLARATION NM N P N C++ 
caffe +double fp irand 224 FUNCTION NM N D C rigraph +bool fp on ATTRIBUTE N P C++ rigraph +void fprint all protocols for layer types FUNCTION V DT NPL P NM NPL C wireshark +int* friend in close FUNCTION N P N C toxcore +state_id from PARAMETER P C s3fs-fuse +char from PARAMETER P C++ meta +T from Bytes FUNCTION P NPL Java Singularity +ThreadContext from context PARAMETER P N C panda3d +boolean from Docker Config ATTRIBUTE P NM N Java Singularity +String from Email PARAMETER P N Java Openfire +FromHeader from Header ATTRIBUTE P N Java Spark +uint8_t from id PARAMETER P N C toxcore +boolean from Inclusive PARAMETER P N Java guava +class from Iterator CLASS P N Java guava +JID from JID PARAMETER P N Java Openfire +K from Key PARAMETER P N Java guava +StanzaFilter from Room Filter ATTRIBUTE P NM N Java Smack +boolean from Server PARAMETER Java Openfire +SelectionVector4 from SV 4 PARAMETER P N D Java drill +int from Y PARAMETER P N Java Spark +int* fts5 MultiIter Do Compare FUNCTION PRE PRE V V C toggldesktop +void* fts5 Seg Iter Clear FUNCTION PRE NM N V C ccv +fts5yyParser fts5 yyp Parser PARAMETER PRE NM N C ccv +char* function and data DECLARATION N CJ N C weechat +bool g curand availability logged DECLARATION PRE NM N V C++ caffe +objectlist g next DECLARATION PRE DT C naemon-core +int* generate key or iv FUNCTION V N CJ N C wireshark +void Generate Prolog 1 FUNCTION V N D C++ facebook-repo-ds2 +string get a 1 FUNCTION V N D C++ panda3d +String get B64 Data FUNCTION V NM N Java Smack +LVecBase2d get data 2d FUNCTION V N NM C++ panda3d +LVecBase4d get data 4d FUNCTION V N NM C++ panda3d +LVecBase4i get data 4i FUNCTION V N NM C++ panda3d +int get Decimal 9 From Big Decimal FUNCTION V N D P NM N Java drill +time_t get last modified FUNCTION V DT N C++ s3fs-fuse +JLabel get Look and feel Label FUNCTION V NM CJ NM N Java Spark +long get next comment id FUNCTION V DT NM N C naemon-core +xmlChar* get next marker FUNCTION V DT N C++ s3fs-fuse +long get Part 2 FUNCTION V N D Java drill +LPoint3 get position world on a FUNCTION V NM N P N C++ panda3d +size_t get start r 1 FUNCTION V NM N D C++ meta +bool get U 16 No Ex FUNCTION V N D DT N C++ freeminer +int* git delta read header from stream FUNCTION PRE PRE V N P N C git2r +int git diff commit as email FUNCTION PRE PRE N P N C git2r +int git fs path to dir FUNCTION PRE NM N P N C git2r +int git index update all FUNCTION PRE PRE V DT C git2r +size_t git off map size FUNCTION PRE NM NM N C git2r +int* git repository head detached for work tree FUNCTION PRE PRE N V P NM N C git2r +void gl M 3 Inv FUNCTION PRE N D N C++ panda3d +void group with FUNCTION V P C++ panda3d +GsonBuilder gson Builder DECLARATION NM N Java immutables +void* gui buffer local var remove all FUNCTION PRE PRE NM N V DT C weechat +void gui buffer set time for each line FUNCTION PRE PRE V N P DT N C weechat +void* gui input search next FUNCTION PRE PRE V DT C weechat +int* gui line has tag no filter FUNCTION PRE PRE V N DT N C weechat +void* gui line mixed free all FUNCTION PRE PRE NM V DT C weechat +char* gui mouse event code 2 key FUNCTION PRE PRE NM N P N C weechat +void gui nick hash sum 64 FUNCTION PRE PRE NM N D C weechat +in6_addr gw 6 PARAMETER N D C ovs +auto h 2 server DECLARATION N D N C++ proxygen +int h if index ATTRIBUTE PRE NM N C ovs +int* handle send 2 FUNCTION V N D C toxcore +bool has after destruct ATTRIBUTE V NM N C freeminer +bool has each DECLARATION V DT C++ panda3d +bool has in band PARAMETER V NM N C ovs +bool has on activate ATTRIBUTE V P V C freeminer 
+bool has run at least once ATTRIBUTE V V VM VM VM C++ caffe +bool has Upgrade Token in Connection DECLARATION V NM N P N C++ proxygen +class Hash 32 Functions CLASS N D NPL Java drill +class Hash 64 Functions With Seed CLASS N D NPL P N Java drill +int hash and save FUNCTION V CJ V C git2r +float hbp 1 DECLARATION N D C ccv +gboolean header only PARAMETER N VM C wireshark +void hide all switches FUNCTION V DT NPL C++ panda3d +ObjectMapper hocon Mapper ATTRIBUTE NM N Java drill +void host 1 PARAMETER N D C naemon-core +int hosts down DECLARATION NPL VM C naemon-core +gint how PARAMETER VM C wireshark +float hp 1 DECLARATION N D C ccv +int httperf 2 FUNCTION N D C++ proxygen +int i 1 PARAMETER N D Java Openfire +igraph_error_t* i graph all st min cuts FUNCTION PRE PRE DT NM NM NPL C rigraph +class Iax2 Analysis Tree Widget Item CLASS PRE NM NM NM N C++ wireshark +IDAT idat ATTRIBUTE N C mgba +IDAT idat var PARAMETER NM N C mgba +char idb 1 if description DECLARATION N D NM N C wireshark +LogicalExpression if Condition DECLARATION NM N Java drill +IfElseWidthExpr if Else Width Expr PARAMETER NM NM NM N Java drill +SqlNode if Exists ATTRIBUTE NM N Java drill +IfExpression if Expr PARAMETER NM N Java drill +JBlock if Found DECLARATION CJ V Java drill +int if index ATTRIBUTE NM N C ovs +ifinfomsg if info DECLARATION NM N C ovs +char if name ATTRIBUTE NM N C ovs +JBlock if No Val DECLARATION CJ DT N Java drill +void if notifier wait FUNCTION NM N V C ovs +boolean if Present PARAMETER CJ NM Java immutables +OutputWidthExpression if Reduced Expr DECLARATION CJ NM N Java drill +PMIB_IF_TABLE2 if Table PARAMETER NM N C ovs +bool if Unique PARAMETER CJ NM C++ cling +int if Width DECLARATION NM N Java drill +int igraph 2 w heap init FUNCTION PRE D VM N V C rigraph +void igraph err no PARAMETER NM NM N C rigraph +int igraph i get subisomorphisms vf 2 inner FUNCTION PRE PRE V NPL N D NM C rigraph +bool im 2 col PARAMETER N P N C++ caffe +INDArray im 2 col 2d DECLARATION N P N NM Java deeplearning4j +class Im 2 col Layer CLASS N P N N C++ caffe +Image image 1 PARAMETER N D Java Spark +String in Action Code PARAMETER PRE NM N Java Smack +InetAddress in addr PARAMETER NM N Java Openfire +bool in best path ATTRIBUTE P NM N C++ rigraph +Channel in channel PARAMETER NM N C++ drill +verify_context in ctx PARAMETER NM N C++ drill +boolean in Eclipse Compiler ATTRIBUTE P NM N Java immutables +Map in Edges PARAMETER NM NPL Java guava +Set in Eq More Than Once DECLARATION P N DT CJ VM Java drill +int in expected RPC Type PARAMETER PRE NM NM N C++ drill +TypedFieldId in Field Id DECLARATION NM NM N Java drill +List in Fields ATTRIBUTE NM NPL Java drill +double in Flow New M DECLARATION NM NM NM N C++ rigraph +Context in for FUNCTION P N Java immutables +HANDLE in hand DECLARATION NM N C toggldesktop +void in how ATTRIBUTE V VM C mgba +Integer in Index PARAMETER NM N Java drill +ovs_be64 in key ATTRIBUTE NM N C ovs +igraph_adjlist_t in list DECLARATION NM N C rigraph +boolean in Literal DECLARATION P N Java Openfire +int in Mem Count PARAMETER P NM N Java drill +string in name PARAMETER NM N C++ meta +stbi_uc in near PARAMETER N P C panda3d +bool in neighbour heap ATTRIBUTE P NM N C++ rigraph +boolean in Outer List ATTRIBUTE P NM N Java drill +Pipe in Pipe PARAMETER NM N C++ toggldesktop +string in prop Name PARAMETER NM NM N C++ drill +SizeT in Size DECLARATION NM N C mgba +RelTrait in Trait ATTRIBUTE NM N Java drill +BOOLEAN in Transaction DECLARATION P N C ovs +int in Vector DECLARATION NM N Java drill +bool 
include all fetch heads DECLARATION V DT NM NPL C git2r +boolean incoming Has Sv 2 DECLARATION V V N D Java drill +int index as child ATTRIBUTE V P N C++ deeplearning4j +int index as parent ATTRIBUTE V P N C++ deeplearning4j +xo_info_t info p PARAMETER NM N C libxo +void init 2 FUNCTION N D C++ toggldesktop +BIO inkey bio DECLARATION NM N C toggldesktop +Map inlinables ATTRIBUTE NPL Java immutables +int32_t inp 0 DECLARATION N D C mgba +int insert at PARAMETER V P C++ panda3d +void* insert V 4 Headers FUNCTION V N D NPL C++ s3fs-fuse +uint32_t insn 0 PARAMETER N D C++ facebook-repo-ds2 +uint32_t insn 1 PARAMETER N D C++ facebook-repo-ds2 +void instr max ATTRIBUTE NM N C++ toggldesktop +String interval SubString 1 DECLARATION NM N D Java drill +int INVERSE COMPUTE FOR WORD OF ALL 1S ATTRIBUTE V N P N P DT NPL Java guava +u_int8_t ip6 h nxt ATTRIBUTE PRE N NM C ovs +guint8 ip6r0 slmap ATTRIBUTE PRE N C wireshark +void irc batch free all FUNCTION PRE PRE V DT C weechat +int* irc color convert rgb 2 irc FUNCTION PRE PRE V N P N C weechat +int* irc message split 005 FUNCTION PRE N V D C weechat +void* irc notify new for all servers FUNCTION PRE PRE NM P DT NPL C weechat +bool is 1 x 1 ATTRIBUTE V D P D C++ caffe +bool is 32 ATTRIBUTE V D C facebook-repo-ds2 +bool* is a ge zero and a lt b FUNCTION V N NM D CJ N NM N C++ caffe +bool is all PARAMETER V DT C++ s3fs-fuse +boolean is Base32 FUNCTION V N Java Openfire +bool* is convertible to FUNCTION V NM P C++ panda3d +char is dir 2 PARAMETER V N D C git2r +bool is even PARAMETER V NM C++ toggldesktop +bool is HTTP11 FUNCTION V N C++ proxygen +bool is step up ATTRIBUTE V N P C++ freeminer +boolean is Supports Limit Push down FUNCTION V V N V P Java drill +bool is this FUNCTION V N C++ panda3d +int* is valid escalation for service notification FUNCTION V NM N P NM N C naemon-core +bool is valid position 2 DECLARATION V NM N D C++ freeminer +int iterations per sample ATTRIBUTE NPL P N C++ freeminer +class Java11 Web Socket CLASS PRE NM N Java Smack +class Java11 Web Socket Factory CLASS PRE NM NM N Java Smack +Map join Mj Id 2 Scan Mj Id DECLARATION V NM N P V NM N Java drill +K k 6 PARAMETER N D Java guava +K k 7 PARAMETER N D Java guava +K k 8 PARAMETER N D Java guava +int k Diy Significand Size ATTRIBUTE NM NM NM N C++ panda3d +uint64_t k Dp Significand Mask ATTRIBUTE NM NM NM N C++ panda3d +int k Dp Significand Size ATTRIBUTE NM NM NM N C++ panda3d +bool k Im 2 Col DECLARATION NM N P N C++ caffe +uint32_t keep when false PARAMETER V VM NM C rigraph +uint32_t keep when true PARAMETER V VM NM C rigraph +class Keras 2D Embedding CLASS PRE NM N Java deeplearning4j +class Keras Convolution 2D CLASS PRE N NM Java deeplearning4j +class Keras Deconvolution 2D CLASS PRE N NM Java deeplearning4j +class Keras Depthwise Convolution 2D CLASS PRE NM N NM Java deeplearning4j +class Keras Upsampling 1D CLASS PRE N NM Java deeplearning4j +class Keras Zero Padding 1D CLASS PRE NM N NM Java deeplearning4j +void* kill nonused tcp FUNCTION V NM N C toxcore +uint16_t l 4 ofs ATTRIBUTE N D N C ovs +int l get node or nil FUNCTION PRE V N CJ NM C++ freeminer +int* l place schematic on vmanip FUNCTION PRE V N P N C++ freeminer +int l set last run mod FUNCTION PRE V DT V N C++ freeminer +integer l wk 1 DECLARATION PRE N D C rigraph +double l1 regularizer ATTRIBUTE NM N C meta +void l2 norm transform FUNCTION NM NM V C meta +int last Access Time PARAMETER DT NM N Java drill +guint32 last ack seq ATTRIBUTE DT NM N C wireshark +long last Active ATTRIBUTE DT N Java Openfire 
+Instant last Activity DECLARATION DT N Java Openfire +Date last Activity Date Range Max ATTRIBUTE DT NM NM NM N Java Openfire +long last Answered Request ID ATTRIBUTE DT NM NM N Java Openfire +String last Argument DECLARATION DT N Java immutables +boolean last Batch Read ATTRIBUTE DT NM N Java drill +int last bucket DECLARATION DT N C naemon-core +unique_ptr last builder ATTRIBUTE DT N C++ meta +size_t last bytes ATTRIBUTE DT NPL C git2r +AtomicInteger last Coordination Id ATTRIBUTE DT NM N Java drill +time_t last data purge ATTRIBUTE DT N V C weechat +Document last Document ATTRIBUTE DT N Java drill +BatchHolder last Entry Batch PARAMETER DT NM N Java drill +DWORD last err PARAMETER DT N C ovs +SQLException last Exception DECLARATION DT N Java Openfire +QualType last Expr Ty DECLARATION DT NM N C++ cling +AtomicLong last Heartbeat Time PARAMETER DT NM N Java Singularity +uint64_t last id DECLARATION DT N C++ meta +int last Idx ATTRIBUTE DT N Java deeplearning4j +size_t last in target DECLARATION DT P N C git2r +int last Index Of Dot DECLARATION DT N P N Java immutables +K last Key FUNCTION DT N Java guava +int last layer index ATTRIBUTE DT NM N C++ caffe +label_id last lbl DECLARATION DT N C++ meta +INDArray last Mem Cell ATTRIBUTE DT NM N Java deeplearning4j +HashMap last Message ATTRIBUTE DT N Java Spark +Date last Message Date PARAMETER DT NM N Java Smack +Mode last Mode ATTRIBUTE DT N C++ freeminer +guint last n ATTRIBUTE DT N C wireshark +bool last part PARAMETER DT N C++ s3fs-fuse +long last Pending Task Cache ATTRIBUTE DT NM NM N Java Singularity +u16 last percent ATTRIBUTE DT N C++ freeminer +AtomicLong last Persister Success PARAMETER DT NM N Java Singularity +int last pos PARAMETER DT N C toggldesktop +ScanRange last Range DECLARATION DT N Java drill +RelNode last Rel Node DECLARATION DT NM N Java drill +long last Request Utilization Cache ATTRIBUTE DT NM NM N Java Singularity +IterOutcome last Right Status DECLARATION DT NM N Java drill +int last Row ATTRIBUTE DT N Java drill +t_plugin_script last script ATTRIBUTE DT N C weechat +int last Segment Index DECLARATION DT NM N Java drill +int last Set DECLARATION DT N Java drill +int last Slash Index DECLARATION DT NM N Java drill +StreamID last Stream PARAMETER DT N C++ proxygen +auto last Stream Id Size DECLARATION DT NM NM N C++ proxygen +Optional last Task Status ATTRIBUTE DT NM N Java Singularity +uint64_t last term ATTRIBUTE DT N C ovs +u64 last time ATTRIBUTE DT N C++ freeminer +time_t last time critical ATTRIBUTE DT N NM C naemon-core +u64 last time ms ATTRIBUTE DT N NM C++ freeminer +char last Transition ATTRIBUTE DT N Java drill +long last Update Time ATTRIBUTE DT NM N Java deeplearning4j +SingularityTaskUsage last Usage DECLARATION DT N Java Singularity +long last used ATTRIBUTE DT V C ovs +int last used Ypos ATTRIBUTE DT V N Java Spark +boolean last Value PARAMETER DT N Java immutables +rusage last wakeup PARAMETER DT V C ovs +wint_t last wc PARAMETER DT N C git2r +int last Write PARAMETER DT N Java drill +int last Write Index FUNCTION DT NM N Java drill +JVar last Writer Idx ATTRIBUTE DT NM N Java drill +int last Y ATTRIBUTE DT N C mgba +String LAYER FIELD POOL 1D SIZE ATTRIBUTE NM NM N NM NM Java deeplearning4j +char least addr ATTRIBUTE DT N C++ panda3d +Intratype left Intra type DECLARATION NM NM N Java immutables +uint32_t left to parse DECLARATION V P V C++ proxygen +size_t len a PARAMETER NM N C freeminer +String less Terminal Path ATTRIBUTE NM NM N Java Singularity +LikeFilter like Filter PARAMETER NM N C++ 
drill +RunQuery limit 0 Query DECLARATION V D N Java drill +int lines after ATTRIBUTE NPL P C weechat +size_t lines in hunk DECLARATION NPL P N C git2r +int lines per option DECLARATION NPL P N C weechat +long lk 1 DECLARATION N D Java drill +int load from lib dir PARAMETER V P NM N C weechat +git_str local path out PARAMETER NM NM N C git2r +string log on wait PARAMETER V P V C++ caffe +JComboBox look and feel ATTRIBUTE NM CJ NM Java Spark +JLabel look and feel Label ATTRIBUTE NM CJ NM N Java Spark +LPoint3 look at ATTRIBUTE V P C++ panda3d +double m 11 ATTRIBUTE N D C++ freeminer +double m 13 ATTRIBUTE N D C++ freeminer +double m 21 ATTRIBUTE N D C++ freeminer +double m 22 ATTRIBUTE N D C++ freeminer +double m 23 ATTRIBUTE N D C++ freeminer +double m 31 ATTRIBUTE N D C++ freeminer +double m 33 ATTRIBUTE N D C++ freeminer +bool m all Tables Selectable ATTRIBUTE PRE DT NPL NM C++ drill +QString m endpoint a ATTRIBUTE PRE NM N C++ wireshark +ForWhat m for What ATTRIBUTE PRE P DT C++ freeminer +double m in Nanoseconds ATTRIBUTE PRE P NPL C++ freeminer +AssertionInfo m last Assertion Info ATTRIBUTE PRE DT NM N C++ freeminer +bool m last Assertion Passed ATTRIBUTE PRE DT N V C++ freeminer +size_t m last Connection ATTRIBUTE PRE DT N C++ drill +string m last Query ATTRIBUTE PRE DT N C++ drill +Option m last Result ATTRIBUTE PRE DT N C++ freeminer +u16 m last used id ATTRIBUTE PRE DT NM N C++ freeminer +bool m like Escape Clause Supported ATTRIBUTE PRE NM NM N V C++ drill +map m name to id ATTRIBUTE PRE N P N C++ freeminer +CachedVertexShaderSetting m perspective bias 1 vertex ATTRIBUTE PRE NM NM D N C++ freeminer +void m PSP 2 Load ROM FUNCTION PRE N D V N C mgba +string m redirected Cout ATTRIBUTE PRE V N C++ freeminer +int m sbox 4 ATTRIBUTE PRE N D Java Openfire +T m to ATTRIBUTE PRE P C++ toggldesktop +Vector m to copy ATTRIBUTE PRE P N Java freeminer +bool m used up ATTRIBUTE PRE V VM C++ freeminer +string m what ATTRIBUTE PRE DT C++ toggldesktop +String MAIL FORWARD 16 x 16 ATTRIBUTE V N D P D Java Spark +int* make no n indexed FUNCTION V DT N NM C++ panda3d +int max v 4 frag list size ATTRIBUTE NM N D NM NM N C ovs +int md5 out len DECLARATION NM NM N C++ s3fs-fuse +void merge chunks by bucket size FUNCTION V NPL P NM N C++ meta +int* merge driver name for path FUNCTION V NM N P N C git2r +auto merged 1 ATTRIBUTE N D C++ deeplearning4j +auto merged 3 ATTRIBUTE N D C++ deeplearning4j +char message after mod ATTRIBUTE N P N C weechat +char message before mod ATTRIBUTE V P N C weechat +char message no color DECLARATION N DT N C weechat +float mid 1 DECLARATION N D C++ freeminer +LogicalMinus minus PARAMETER P Java drill +bool monitor everything by default PARAMETER V DT P N C ovs +ccv_cnnp_dataframe_data_item_t more data DECLARATION DT N C ccv +ClosingFuture more Futures PARAMETER DT NPL Java guava +Gs more Generators PARAMETER DT NPL C++ freeminer +u8 more To Follow PARAMETER DT P V C ccv +int morecore properties ATTRIBUTE PRE NPL C++ panda3d +long murmur 3 64 FUNCTION N D D Java drill +class Murmur Hash 3 CLASS NM N D Java drill +class Murmur3 128 Hash Function CLASS PRE D NM N Java guava +MapNode n 3 DECLARATION N D C++ freeminer +Int n col 2 PARAMETER NM N D C rigraph +MapNode n dirt with grass DECLARATION PRE N P N C++ freeminer +uint64_t n frag too small ATTRIBUTE NM N VM NM C ovs +MapNode n from PARAMETER N P C++ freeminer +MapNode n water or ice DECLARATION N N CJ N C++ freeminer +string name 1 PARAMETER N D C++ meta +char name what ATTRIBUTE N DT C freeminer +void Net after forward 
FUNCTION PRE P N C++ caffe +void Net before backward FUNCTION PRE P N C++ caffe +void Net before forward FUNCTION PRE P N C++ caffe +int* netdev dummy queue dump next FUNCTION PRE PRE N V DT C ovs +int* network pass socks5 proxy FUNCTION NM NM NM N C weechat +int new Param 1 ATTRIBUTE NM N D C++ freeminer +int new Param 2 ATTRIBUTE NM N D C++ freeminer +int new Shape 2 DECLARATION NM N D Java deeplearning4j +BatchInfo next ATTRIBUTE DT C++ toggldesktop +t_gui_bar next bar ATTRIBUTE DT N C weechat +float next Cast Cost DECLARATION DT NM N Java drill +char next Char DECLARATION DT N Java drill +t_config_file next config ATTRIBUTE DT N C weechat +xodtemplate_daterange next date range DECLARATION DT NM N C naemon-core +double next Double FUNCTION DT N Java immutables +void next Egress FUNCTION DT N C++ proxygen +Group next Element ATTRIBUTE DT N Java Openfire +int32_t next event ATTRIBUTE DT N C mgba +String next Field ATTRIBUTE DT N Java drill +String next Field Name FUNCTION DT NM N Java immutables +NextFilter Next Filter PARAMETER DT N Java Openfire +ClassNames next Generated DECLARATION DT V Java drill +t_gui_nick_group next group ATTRIBUTE DT N C weechat +xodtemplate_hostescalation next he DECLARATION DT N C naemon-core +int next head FUNCTION DT N C git2r +int next Id ATTRIBUTE DT N Java drill +t_irc_ignore next ignore ATTRIBUTE DT V C weechat +ImmutableEntry next In Bucket FUNCTION DT P N Java guava +TaskStatus next In Memory FUNCTION DT P N Java Singularity +Object next Instance FUNCTION DT N Java cglib +boolean next Integer If Not EOF FUNCTION DT NM CJ DT N Java drill +char next Key DECLARATION DT N C++ cling +Object next Label DECLARATION DT N Java guava +int next Local ATTRIBUTE DT N Java cglib +string next marker DECLARATION DT N C++ s3fs-fuse +int next max ATTRIBUTE DT N C ovs +vector next metadata FUNCTION DT N C++ meta +List next Names DECLARATION DT NPL Java drill +Cell next nonsingleton ATTRIBUTE DT N C++ rigraph +objectlist next object list DECLARATION DT NM N C naemon-core +gint next offset DECLARATION DT N C wireshark +ReadStatus next Page From Queue FUNCTION DT N P N Java drill +Object next Page Value DECLARATION DT NM N Java drill +int next Partition To Return ATTRIBUTE DT N P N Java drill +uint32_t next PC DECLARATION DT N C++ facebook-repo-ds2 +ClassNames next Precompiled DECLARATION DT NM Java drill +Token* next Preprocessed FUNCTION DT V C++ toggldesktop +string next Protocol PARAMETER DT N C++ proxygen +string next Protos PARAMETER DT NPL C++ proxygen +query_handler next qh ATTRIBUTE DT N C naemon-core +u8 next real face DECLARATION DT NM N C++ freeminer +Pair next Row Key Batch FUNCTION DT NM NM N Java drill +xodtemplate_serviceextinfo next se DECLARATION DT N C naemon-core +servicesmember next services member DECLARATION DT NM N C naemon-core +ClassSet next Set DECLARATION DT N Java drill +Node* next Sibling FUNCTION DT N C++ toggldesktop +FileSplit next Split FUNCTION DT N Java drill +long next tag DECLARATION DT N C++ s3fs-fuse +Runnable next Task ATTRIBUTE DT N Java guava +int next tex ATTRIBUTE DT N C++ panda3d +char next Tok Ptr PARAMETER DT NM N C toggldesktop +ASN1_GENERALIZEDTIME next upd PARAMETER DT N C toggldesktop +int32_t next update ATTRIBUTE DT N C mgba +long next Update Time ATTRIBUTE DT NM N Java drill +long next Value ATTRIBUTE DT N Java metrics +ImmutableList no Attributes ATTRIBUTE DT NPL Java immutables +bool no cache PARAMETER DT N C freeminer +int no callback DECLARATION DT N C git2r +fdpage_list_t no data pages DECLARATION DT NM NPL C++ 
s3fs-fuse +no_delay no Delay DECLARATION DT N C++ drill +float no dig delay timer ATTRIBUTE DT N NM N C++ freeminer +bool no emerge PARAMETER VM V C++ freeminer +bool no Error FUNCTION DT N C++ proxygen +int no in DECLARATION NM N C rigraph +boolean no Interfaces ATTRIBUTE DT NPL Java deeplearning4j +int no lock Lock FUNCTION DT NM N C ccv +char no log DECLARATION DT N C weechat +bool no logo PARAMETER DT N C cling +igraph_integer_t no of edges 2 DECLARATION N P NPL D C rigraph +int no of groups PARAMETER N P NPL C rigraph +int no of nodes DECLARATION N P NPL C rigraph +igraph_integer_t no out types PARAMETER NM NM NPL C rigraph +bool no output ATTRIBUTE DT N C++ freeminer +bool no random PARAMETER DT N C++ freeminer +uint64_t no replay DECLARATION DT N C toxcore +boolean no Reply PARAMETER DT N Java Smack +bool no Runtime PARAMETER DT N C++ cling +SourceLocation no Src Loc DECLARATION DT NM N C++ cling +int no text PARAMETER DT N C toggldesktop +bool no truncate PARAMETER DT V C++ s3fs-fuse +Writable no Val ATTRIBUTE DT N Java deeplearning4j +PandaNode node 2 PARAMETER N D C++ panda3d +float noise 2 PARAMETER N D C++ freeminer +auto none DECLARATION DT C++ meta +void normalize by rebuilding FUNCTION V P V C++ panda3d +int* not a local branch FUNCTION VM DT NM N C git2r +guint16 noti flags number PARAMETER NM NM N C wireshark +int notified on ATTRIBUTE V P C naemon-core +int notify contact of host FUNCTION V N P N C naemon-core +Instant now DECLARATION VM Java Openfire +string now cache DECLARATION VM V C++ s3fs-fuse +long now Micros PARAMETER VM NPL Java guava +string now path DECLARATION VM N C++ s3fs-fuse +time_t now time DECLARATION VM N C++ s3fs-fuse +TextureCollection* ns find all textures FUNCTION PRE V DT NPL C++ panda3d +bool null or empty FUNCTION NM CJ NM C++ deeplearning4j +int num ascnt ATTRIBUTE NM N C rigraph +Blob num by chans ATTRIBUTE N P NPL C++ caffe +int num faces to draw DECLARATION NM NPL P V C++ freeminer +int num kernels col 2 im ATTRIBUTE NM NPL N P N C++ caffe +int num kernels im 2 col ATTRIBUTE NM NPL N P N C++ caffe +int num of after release pdus ATTRIBUTE N P P V NPL C wireshark +int num of gops ATTRIBUTE N P NPL C wireshark +size_t num to check DECLARATION N P V C mgba +int num vertices per primitive DECLARATION NM NPL P N C++ panda3d +long number of nodes DECLARATION N P NPL C++ rigraph +NumericType numeric type 3 PARAMETER NM N D C++ panda3d +int nxt in ATTRIBUTE P P C rigraph +MultiUserChatService o 2 PARAMETER N D Java Openfire +LongType o Stride 0 DECLARATION NM N D C++ deeplearning4j +btCollisionObject obj 0 DECLARATION N D C++ panda3d +int obsess over host PARAMETER V P N C naemon-core +odp_key_fitness* odp nsh key from attr FUNCTION PRE PRE N P N C ovs +float odx 1 DECLARATION N D C rigraph +class OF 1515 CLASS N D Java Openfire +ofbundle of bundle PARAMETER PRE N C ovs +ofservice of service DECLARATION PRE N C ovs +off_t off PARAMETER N C meta +auto off Arg DECLARATION NM N C++ facebook-repo-ds2 +long off Bits PARAMETER NM NPL C++ panda3d +ImageIcon off Icon ATTRIBUTE NM N Java Openfire +int offset p PARAMETER NM N C wireshark +igraph_real_t offset to left contour ATTRIBUTE V P NM N C rigraph +igraph_real_t offset to right contour ATTRIBUTE V P NM N C rigraph +ofpbuf* ofp buf clone with headroom FUNCTION PRE PRE N P N C ovs +git_oid oid a PARAMETER NM N C git2r +int old Param 2 ATTRIBUTE NM N D C++ freeminer +ObjectName on DECLARATION N Java metrics +void on Connection Close FUNCTION P N V Java Openfire +int on disk ATTRIBUTE P N C git2r +void on 
Egress Buffered FUNCTION P N V C++ proxygen +ErrorCode on Execute Program FUNCTION P V N C++ facebook-repo-ds2 +ObjectNameFactory on Factory PARAMETER NM N Java metrics +bp::object on gradients ready ATTRIBUTE P NPL NM C++ caffe +size_t on Header Bytes Generated DECLARATION P NM NPL V C proxygen +ImageIcon on Icon ATTRIBUTE NM N Java Openfire +void on Ingress Error FUNCTION P NM N C++ proxygen +void on Metric Removed FUNCTION P N V Java metrics +ebb_header_cb on multipart header field ATTRIBUTE P NM NM N C ccv +void on Ping Reply Latency FUNCTION P NM NM N C++ proxygen +void on Post Execute FUNCTION P N V Java freeminer +ErrorCode on Query Current Thread FUNCTION P NM NM N C++ facebook-repo-ds2 +ErrorCode on Query Hardware Watchpoint Count FUNCTION P NM NM NM N C++ facebook-repo-ds2 +ebb_element_cb on query string ATTRIBUTE P NM N C ccv +http_data_cb on reason ATTRIBUTE P N C proxygen +void* on request response FUNCTION P NM N C ccv +void on Response Content FUNCTION P NM N Java metrics +void on Server Egress Paused FUNCTION P NM N V C++ proxygen +bool on Server Side PARAMETER P NM N C++ toggldesktop +void on Shutdown Request FUNCTION P NM N Java drill +void on start FUNCTION P N C++ caffe +object on start ATTRIBUTE P N C++ caffe +http_data_cb on status ATTRIBUTE P N C crow +ErrorCode* on Thread Is Alive FUNCTION P N V NM C++ facebook-repo-ds2 +void on Tick FUNCTION P N Java drill +bool on Transport Ready Common FUNCTION P NM NM N C++ proxygen +ovsthread_once once DECLARATION VM C ovs +bool only amz PARAMETER VM N C++ s3fs-fuse +boolean only Done ATTRIBUTE VM NM Java Smack +Entry only Entry DECLARATION VM N Java guava +bool only if existing PARAMETER VM CJ V C git2r +boolean only Impersonation Enabled ATTRIBUTE VM N V Java drill +bool only in ground ATTRIBUTE VM P N C freeminer +igraph_bool_t only indices PARAMETER VM NPL C rigraph +boolean only Local PARAMETER VM N Java Openfire +bool only pool PARAMETER VM N C++ s3fs-fuse +int only user ATTRIBUTE VM N C toggldesktop +char open if empty DECLARATION V CJ NM C libxo +int opposite Major Fragment Id ATTRIBUTE NM NM NM N Java drill +u8 or conf PARAMETER NM N C ccv +DruidFilter or Filter At Index FUNCTION NM N P N Java drill +RexNode or Pred PARAMETER NM N Java drill +double or Sel DECLARATION NM N Java drill +igraph_vector_int_t order out PARAMETER N NM C rigraph +int out audio samples DECLARATION NM NM NPL C toxcore +T out Buff DECLARATION NM N C++ deeplearning4j +int out Buff Posn DECLARATION NM NM N Java Openfire +ErrorCode out Code PARAMETER NM N C++ proxygen +char out dev DECLARATION NM N C ovs +ExAttributes out Ex Attributes PARAMETER NM NM NPL C++ proxygen +RelFieldCollation out Field Collation DECLARATION NM NM N Java drill +ofstream out file DECLARATION NM N C++ meta +unique_ptr out Header Data DECLARATION NM NM N C++ proxygen +void out how PARAMETER VM VM C mgba +TypedFieldId out Key Field Ids PARAMETER NM NM NM NPL Java drill +uint32_t out list PARAMETER NM N C toxcore +igraph_inclist_t out list DECLARATION NM N C rigraph +bool out max val ATTRIBUTE V NM N C++ caffe +AtomicLong out Messages ATTRIBUTE NM NPL Java Openfire +int out Name Index DECLARATION NM NM N Java drill +void* out of domain FUNCTION P P N C rigraph +boolean out Of Memory PARAMETER P P N Java drill +uint64_t out Opaque Data PARAMETER NM NM N C++ proxygen +Prel out Prel DECLARATION NM N Java drill +PushId out Push Id PARAMETER NM NM N C++ proxygen +igraph_vector_t out seq PARAMETER NM N C rigraph +String out Stat Name PARAMETER NM NM N Java drill +lm_state out state 
PARAMETER NM N C++ meta +int out Types Offset DECLARATION NM NM N Java deeplearning4j +int out vlan ATTRIBUTE NM N C ovs +size_t over size DECLARATION P N C++ s3fs-fuse +OVS_WARN_UNUSED_RESULT* ovs db transient datum from json FUNCTION PRE PRE NM N P N C ovs +json* ovsdb atom string create no copy FUNCTION PRE PRE N V DT V C ovs +int ovsdb datum compare 3way FUNCTION PRE N V VM C ovs +int p 2 DECLARATION N D Java Openfire +v2s16 p 2 d PARAMETER N D N C++ freeminer +Fts5Table p Fts5 ATTRIBUTE PRE PRE C ccv +Mem p Mem 1 PARAMETER PRE N D C ccv +Token p Name 2 PARAMETER PRE N D C mgba +double p save DECLARATION PRE V C rigraph +u8 p5 Err msg PARAMETER NM NM N C toggldesktop +long pad 0 ATTRIBUTE N D C++ panda3d +uint8_t pad 2 ATTRIBUTE N D C ovs +int parallel for FUNCTION NM N C++ deeplearning4j +vector param propagate down ATTRIBUTE N V P C++ caffe +NetParameter param upgraded pad PARAMETER N V N C++ caffe +long part 3 DECLARATION N D Java guava +boolean partition Filter Push down ATTRIBUTE NM N V P Java drill +boolean partitionby DECLARATION N Java drill +char pass in DECLARATION NM N C toggldesktop +bool pass through DECLARATION V P C++ freeminer +List past Threshold DECLARATION P N Java Singularity +String path Name for Logs PARAMETER NM N P NPL Java Singularity +ArrayList paths 2 DECLARATION NPL D Java deeplearning4j +long per Ex Train DECLARATION P N V Java deeplearning4j +String period no Period ATTRIBUTE N DT N Java Spark +float* perlin Map 3D FUNCTION NM N NM C++ freeminer +int pfcp up function features o9 flags DECLARATION PRE NM N V NM NPL C wireshark +void place all FUNCTION V DT C++ panda3d +bool playing Ch 2 ATTRIBUTE V N D C mgba +int plfit errno PARAMETER NM N C rigraph +char* plugin api info color rgb 2 term cb FUNCTION PRE PRE PRE NM N P NM N C weechat +void* plugin if mainwindow update toolbars FUNCTION PRE PRE PRE V NPL C++ wireshark +char plugin name for upgrade ATTRIBUTE NM N P N C weechat +void* plugin script str 2 ptr FUNCTION PRE PRE N P D C weechat +SchemaPlus plus Of This PARAMETER P P DT Java drill +void pointer 1 PARAMETER N D C weechat +int prefix x 1 ATTRIBUTE NM N D C weechat +ShaderContext* prepare now FUNCTION V VM C++ panda3d +Cell prev nonsingleton ATTRIBUTE DT N C++ rigraph +UInt32 price 2 DECLARATION N D C mgba +void print and free json FUNCTION V CJ V N C ovs +Multimap probe Side Scan 2 hj ATTRIBUTE NM NM N P N Java drill +int process IpV6 Packet FUNCTION V NM N Java drill +gboolean* propagate when not up FUNCTION V CJ VM P C naemon-core +CLzmaEncProps props 2 PARAMETER NPL D C toggldesktop +ImmutableList proto classes DECLARATION NM NPL Java immutables +set provided DECLARATION V C++ deeplearning4j +ColumnMetadata provided Column DECLARATION V N Java drill +String provided Password PARAMETER V N Java Openfire +TupleMetadata provided Schema ATTRIBUTE V N Java drill +void ptr 1 PARAMETER N D C++ proxygen +void ptr 2 PARAMETER N D C++ proxygen +t_irc_channel ptr channel 2 DECLARATION NM N D C weechat +char ptr in buf DECLARATION NM NM N C weechat +char ptr next DECLARATION N DT C weechat +t_config_option ptr option 1 DECLARATION NM N D C weechat +mxArray* ptr to handle FUNCTION N P N C++ caffe +node_info q 0 h DECLARATION N D N C++ meta +QueryId q 1 PARAMETER N D C++ drill +QueryId q 2 PARAMETER N D C++ drill +uint64_t r 14 ATTRIBUTE N D C facebook-repo-ds2 +uint32_t r 8 ATTRIBUTE N D C facebook-repo-ds2 +SEXP* R igraph 0 or vector bool to SEXP FUNCTION PRE PRE D CJ NM N P N C rigraph +SEXP R igraph add myid to env FUNCTION PRE PRE V N P N C rigraph +SEXP 
R igraph matrix to SEXP FUNCTION PRE PRE N P N C rigraph +shared_ptr rank 0 PARAMETER N D C++ caffe +bool read only ATTRIBUTE V VM C ovs +bool Read S3fs Passwd File FUNCTION V NM NM N C++ s3fs-fuse +bool refine equal to first ATTRIBUTE V N P DT C++ rigraph +class Regression 2D Adapter CLASS NM NM N Java deeplearning4j +v3f rel cam up DECLARATION NM N P C++ freeminer +void* relay auth parse pbkdf2 FUNCTION PRE PRE V N C weechat +void* remove from menu bar FUNCTION V P NM N C++ panda3d +int* rename object no copy FUNCTION V N DT V C++ s3fs-fuse +class Replace ES419 Language Filter CLASS V NM NM N Java Singularity +bool replace if exists FUNCTION V CJ V C++ meta +void restore degs only FUNCTION V NPL VM C++ rigraph +ProjResult result 2 DECLARATION N D Java drill +int* revwalk next toposort FUNCTION PRE DT N C git2r +char RFC3526 PRIME 4096 DECLARATION NM N D C toggldesktop +int rgb 2 DECLARATION N D Java Smack +double rmu 0 ATTRIBUTE N D C rigraph +int root 1 ATTRIBUTE N D C panda3d +Point3 row 2 DECLARATION N D C++ panda3d +RelDataType row Type 1 PARAMETER NM N D Java drill +uint8_t rtc Free Page 1 ATTRIBUTE PRE V N D C mgba +MediaType RTF UTF8 ATTRIBUTE N N Java guava +bool run once DECLARATION V VM C++ s3fs-fuse +ovs_be64 rx1024 to 1522 packets ATTRIBUTE N P D NPL C ovs +ovs_be64 rx128 to 255 packets ATTRIBUTE N P D NPL C ovs +auto s 1 DECLARATION N D C++ meta +ImmutableBitSet s Gby DECLARATION NM N Java drill +sockaddr_in6 s in 6 DECLARATION NM N D C++ freeminer +ObjectFile s o File PARAMETER NM NM N C++ cling +int s o Timeout DECLARATION NM NM N Java Openfire +bool s set find and delete FUNCTION NM N V CJ V C ovs +S3Artifact s3 Artifact ATTRIBUTE NM N Java Singularity +Set s3 Buckets DECLARATION PRE NPL Java Singularity +List s3 Services ATTRIBUTE PRE NPL Java Singularity +String s3 Uploader Key Pattern DECLARATION PRE NM NM N Java Singularity +boolean s3 Use V2 Signing ATTRIBUTE NM V NM N Java Singularity +S3fsCurl s3fs curl DECLARATION PRE N C++ s3fs-fuse +void* s3fs exit fuse loop FUNCTION PRE NM NM N C++ s3fs-fuse +void* s3fs init FUNCTION PRE V C++ s3fs-fuse +fuse_operations s3fs oper DECLARATION PRE N C++ s3fs-fuse +int* s3fs read link FUNCTION PRE NM N C++ s3fs-fuse +bool s3fs str to offt FUNCTION PRE N P N C++ s3fs-fuse +int* s3fs truncate FUNCTION PRE V C++ s3fs-fuse +int* s3fs utimens FUNCTION PRE N C++ s3fs-fuse +int samples per buffer ATTRIBUTE NPL P N C++ panda3d +void save ATTRIBUTE V C meta +String SAVE AS 16 x 16 ATTRIBUTE V P D P D Java Spark +bool save before unloading DECLARATION V P V C++ freeminer +png_size_t save buffer max ATTRIBUTE V NM N C toggldesktop +size_t save data length ATTRIBUTE NM NM N C toxcore +int save err no DECLARATION V NM N C ovs +Long save Every Ms ATTRIBUTE V DT N Java deeplearning4j +boolean save Every Since Last ATTRIBUTE V DT CJ DT Java deeplearning4j +SingularityCreateResult save Expiring Object FUNCTION V NM N Java Singularity +void save Mail Record FUNCTION V NM N Java Singularity +boolean save Output ATTRIBUTE V N Java deeplearning4j +String save Password DECLARATION V N Java Spark +boolean save Samples FUNCTION V NPL Java drill +SingularityCreateResult save Task History Update FUNCTION V NM NM N Java Singularity +int sbox 4 DECLARATION N D Java Openfire +void* scan for sliders FUNCTION V P NPL C++ panda3d +uint64_t select in word FUNCTION V P N C meta +void send everything FUNCTION V DT C++ panda3d +int send to buffer PARAMETER V P N C weechat +class Separable Convolution 2D CLASS NM N NM Java deeplearning4j +guint32 Server 1 ATTRIBUTE N 
D C wireshark +string server and port DECLARATION N CJ N C++ panda3d +ccv_cnnp_cmd_exec_io_set_by_t set by DECLARATION V P C ccv +void set dampen on bodies FUNCTION V V P NPL C++ panda3d +void set data 4d FUNCTION V N NM C++ panda3d +String SET GROUP NAME 1 ATTRIBUTE V NM N D Java Openfire +void set has blob included in max row size FUNCTION V V N V P NM NM N C++ drill +bool Set IAM v2 API Token FUNCTION V NM NM NM N C++ s3fs-fuse +Builder set Part 1 FUNCTION V N D Java drill +void set server and port FUNCTION V N CJ N C++ panda3d +String SHA256 ALGORITHM ATTRIBUTE NM N Java metrics +int shift by PARAMETER V P C git2r +int short too PARAMETER NM VM C mgba +String show String for Add DECLARATION NM N P N Java Spark +uint64_t significand ATTRIBUTE N C++ toggldesktop +Date since ATTRIBUTE VM Java Smack +class Singularity S3 Service CLASS PRE PRE N Java Singularity +class Singularity S3 Services CLASS PRE PRE NPL Java Singularity +class Singularity S3 Uploader CLASS PRE PRE N Java Singularity +class Singularity S3 Uploader Configuration CLASS PRE PRE NM N Java Singularity +class Singularity S3 Uploader Content Headers CLASS PRE PRE NM NM NPL Java Singularity +class Singularity S3 Uploader File CLASS PRE PRE NM N Java Singularity +uint32_t size from ipv4 DECLARATION N P N C ovs +int size in datum DECLARATION N P N C++ caffe +bool skip im 2 col PARAMETER V N P N C++ caffe +class SLF4J Debugger Factory CLASS PRE NM N Java Smack +void SM83 Instruction IRQ Stall FUNCTION PRE PRE NM N C mgba +bool SM83 Tick Internal FUNCTION PRE N NM C mgba +sockaddr_in sock 4 DECLARATION N D C toxcore +prpack_result* solve via gs FUNCTION V P N C++ rigraph +bool some air DECLARATION DT N C++ freeminer +boolean some Columns Indexed FUNCTION DT NPL V Java drill +int some data not displayed DECLARATION DT NM VM V C weechat +Class some Reference ATTRIBUTE DT N Java drill +class Sort Collector Labels 1 CLASS V NM NPL D C++ panda3d +class Sort Collector Labels 2 CLASS V NM NPL D C++ panda3d +bool sorts less FUNCTION V DT C++ panda3d +int spaces every PARAMETER NPL DT C++ panda3d +void* spell speller add dicts to hash FUNCTION PRE PRE V NPL P N C weechat +void* spx nt prod 1 FUNCTION PRE NM N D C rigraph +void sqlite3 Expr If True FUNCTION PRE N CJ NM C mgba +void sqlite3 Fts3 Hash Find DECLARATION PRE PRE N V C toggldesktop +void* sqlite3 Fts3 Hash Insert FUNCTION PRE PRE N V C ccv +void sqlite3 Fts3 Seg Reader Free DECLARATION PRE PRE NM N V C ccv +int* sqlite3 Fts5 Parser Fallback FUNCTION PRE PRE N V C ccv +void sqlite3 Put 4 byte FUNCTION PRE V D N C ccv +int* sqlite3 rtree geometry callback FUNCTION PRE PRE NM N C ccv +int sqlite3 Where Trace DECLARATION PRE NM N C mgba +With* sqlite3 With Dup FUNCTION PRE P V C ccv +flow_wildcards src 1 PARAMETER N D C ovs +re_node_set src 2 PARAMETER N D C git2r +ovs_be16 src as ATTRIBUTE NM N C ovs +ostringstream ss all DECLARATION N DT C++ s3fs-fuse +stnode_t st arg 2 PARAMETER NM N D C wireshark +size_t start R 1 DECLARATION NM N D C++ meta +int stash update index from diff FUNCTION PRE V N P N C git2r +lm_state state next DECLARATION N DT C++ meta +boolean still Waiting DECLARATION VM V Java Smack +void store Last Err no FUNCTION V DT NM N C ccv +int str 2 int FUNCTION N P N C rigraph +const_iterator str 2 it DECLARATION N D N C++ s3fs-fuse +string str all DECLARATION N DT C++ s3fs-fuse +string str iam v2 token DECLARATION NM NM NM N C++ s3fs-fuse +string str now DECLARATION N VM C++ s3fs-fuse +optional stream for FUNCTION V P C++ meta +git_odb_stream stream out PARAMETER V N 
C git2r +int* string base16 decode FUNCTION N NM V C weechat +int* string base16 encode FUNCTION N NM V C weechat +int string base32 decode FUNCTION N NM V C weechat +void* string conv base64 6 x 4 to 8 x 3 FUNCTION N V N D P D P D P D C weechat +void* string conv base64 8 x 3 to 6 x 4 FUNCTION N V N D P D P D P D C weechat +char string p PARAMETER NM N C naemon-core +int subexp to ATTRIBUTE N P C git2r +int* submodule load each FUNCTION PRE V DT C git2r +List subtypes 1 DECLARATION NPL D Java drill +int sum 2 DECLARATION N D C rigraph +boolean supports Join Push down ATTRIBUTE V V V P Java drill +boolean supports Sort Push down ATTRIBUTE V V V P Java drill +void T 1 Prepare Mun map Code FUNCTION N D V NM NM N C++ facebook-repo-ds2 +PN_stdfloat t 1 y DECLARATION N D N C++ panda3d +uint32_t T 2 MOV 16 Set Immediate FUNCTION N D N D V NM C++ facebook-repo-ds2 +T3 t 3 DECLARATION N D Java immutables +PushTransactionRAII T for Deser DECLARATION N P N C++ cling +timespec t s now DECLARATION NM N VM C++ s3fs-fuse +timeval t v now DECLARATION NM N VM C weechat +ParquetTableMetadata_v4 table Metadata V 4 DECLARATION NM N NM D Java drill +void* tag all FUNCTION V DT C++ panda3d +int target 1 ATTRIBUTE N D C mgba +double task Reconciliation Response P 999 PARAMETER NM NM NM N D Java Singularity +void temp Data 1 ATTRIBUTE NM N D C++ caffe +void temp Data 2 ATTRIBUTE NM N D C++ caffe +fsblkcnt_t ten 00 DECLARATION D D C++ s3fs-fuse +ccv_nnc_tensor_symbol_info_t tensor a PARAMETER NM N C ccv +double term and class FUNCTION N CJ N C++ meta +char text only PARAMETER N VM C libxo +int text search where ATTRIBUTE N V VM C weechat +Occupant that DECLARATION DT Java Openfire +DruidStoragePluginConfig that Config DECLARATION DT N Java drill +LogicalExpression that Expr PARAMETER DT N Java drill +Class the Class DECLARATION DT N Java Openfire +LittleEndianBytes the Getter ATTRIBUTE DT N Java guava +keyValuePair the max DECLARATION DT N C++ rigraph +Filter the New Filter DECLARATION DT NM N Java drill +expected_counts this DECLARATION DT C meta +int this bucket DECLARATION DT N C naemon-core +PyObject this class PARAMETER DT N C++ panda3d +contactgroup this contact group DECLARATION DT NM N C naemon-core +LogicalExpression this Expr PARAMETER DT N Java drill +xodtemplate_host this host DECLARATION DT N C naemon-core +hostescalation this host escalation PARAMETER DT NM N C naemon-core +hostgroup this host group DECLARATION DT NM N C naemon-core +int this index DECLARATION DT N C++ panda3d +void this Instance PARAMETER DT N C toggldesktop +bool this Iteration PARAMETER DT N C++ proxygen +string this Key DECLARATION DT N C++ proxygen +uchar32_t this Letter PARAMETER DT N C++ freeminer +N this Node PARAMETER DT N Java guava +T this Object PARAMETER DT N Java Smack +serviceescalation this service escalation PARAMETER DT NM N C naemon-core +xodtemplate_serviceextinfo this service ext info PARAMETER DT NM NM N C naemon-core +servicesmember this services member DECLARATION DT NM N C naemon-core +Thread thread B to A ATTRIBUTE NM N P N Java Openfire +void throw if error FUNCTION V CJ N C++ meta +NativeLong til or since ATTRIBUTE P CJ P Java Spark +int time for each line ATTRIBUTE N P DT N C weechat +float time from last punch ATTRIBUTE N P DT N C++ freeminer +time_t time now DECLARATION N VM C weechat +float time of day smooth ATTRIBUTE N P N NM C++ freeminer +ifstream titles in DECLARATION NM N C++ meta +uint32_t to Ack DECLARATION P N C++ proxygen +RuntimeFilterWritable to Aggregate DECLARATION P V Java drill 
+ValueVector to Alloc PARAMETER P V Java drill +String to Bare JID FUNCTION P NM N Java Openfire +X509CertSelector to Be Validated DECLARATION P V V Java Spark +size_t to Consume PARAMETER P V C++ proxygen +AttributeBuilderThirdPartyModel to Copy From PARAMETER P V P Java immutables +mp_part_list_t to copy list DECLARATION P NM N C++ s3fs-fuse +DataType to Count PARAMETER P N Java deeplearning4j +CoordinateSystem to cs PARAMETER P N C++ panda3d +Delete to Delete FUNCTION P V Java drill +double to Doubles FUNCTION P NPL Java deeplearning4j +C to Element PARAMETER P N Java guava +List to Field List FUNCTION P NM N Java drill +String to Filename FUNCTION P N Java immutables +Map to Filter Conditions FUNCTION P NM NPL Java drill +ValueVectorReadExpression to Hash Field Exp DECLARATION P NM NM N Java drill +ToHeader to Header DECLARATION P N Java Spark +double to height DECLARATION P N C++ panda3d +int to Index DECLARATION P N Java Openfire +List to Launch DECLARATION P V Java Singularity +vector to merge DECLARATION P V C++ meta +FileMetaData to Merge PARAMETER P V Java drill +int to Read Remaining DECLARATION P V NM Java drill +Prel to Register PARAMETER P V Java drill +char to repo PARAMETER P N C git2r +TupleMetadata to Schema FUNCTION P N Java drill +Long to Snapshot Id ATTRIBUTE P NM N Java drill +Function to Sort Fn DECLARATION P NM N Java immutables +Instant to Time PARAMETER P N Java metrics +MinorType to Types PARAMETER P NPL Java drill +string to upper FUNCTION P NM C++ meta +Deque to Visit ATTRIBUTE P V Java immutables +auto to write DECLARATION P V C++ meta +String to Yaml FUNCTION P N Java deeplearning4j +boolean too Many Shuffling Tasks DECLARATION VM NM V NPL Java Singularity +boolean too Old DECLARATION VM NM Java Singularity +double total oov only DECLARATION NM N VM C++ meta +void* tox self set no spam FUNCTION PRE PRE V DT N C toxcore +igraph_bool_t transpose a PARAMETER V N C rigraph +JPanel tree and Info ATTRIBUTE N CJ N Java Spark +timespec ts 1 PARAMETER N D C++ s3fs-fuse +timespec ts 2 PARAMETER N D C++ s3fs-fuse +ovs_be64 tx1523 to max packets ATTRIBUTE N P NM NPL C ovs +MajorType type 2 PARAMETER N D Java drill +uint32_t u 1 bit count PARAMETER N D NM N C ovs +guint32 u32 Pointer DECLARATION NM N C wireshark +udpif udpif ATTRIBUTE N C ovs +double ul now PARAMETER V VM C++ s3fs-fuse +class Un7zip App CLASS V N C++ toggldesktop +String UNBLOCK CONTACT 16 x 16 ATTRIBUTE V N D P D Java Spark +void unbox or zero FUNCTION V CJ N Java cglib +int under Provisioned Requests PARAMETER P V NPL Java Singularity +uint32_t unknown 20 DECLARATION N D C++ cling +uint32_t unknown 24 DECLARATION N D C++ cling +int until DECLARATION P Java drill +LVector3 up ATTRIBUTE P C++ panda3d +bool Update S3fs Credential FUNCTION V NM N C++ s3fs-fuse +bool Upgrade V 1 Layer Parameter FUNCTION V N D NM N C++ caffe +class Upsampling 2D CLASS N NM Java deeplearning4j +SEXP us PARAMETER N C rigraph +bool use default if empty PARAMETER V N CJ NM C git2r +string username only DECLARATION N VM C++ panda3d +int32_t utf8 next code point FUNCTION N DT NM N C++ meta +int utf8 only ATTRIBUTE N VM C weechat +V1LayerParameter v 0 layer connection PARAMETER N D NM N C++ caffe +float v 00 PARAMETER N D C++ freeminer +int V 1 ATTRIBUTE N D C++ rigraph +int v 1 index PARAMETER N D N Java drill +float v 110 PARAMETER N D C++ freeminer +int V 2 ATTRIBUTE N D C++ rigraph +V v 5 PARAMETER N D Java guava +FvalueFromLiteral val from literal ATTRIBUTE N P N C wireshark +int* validate tree and parents FUNCTION V N CJ NPL C 
git2r +Any value 6 PARAMETER N D C++ toggldesktop +guint8 value a and front reserved DECLARATION NM N CJ N V C wireshark +T value or FUNCTION N CJ C++ meta +NoiseParams value out PARAMETER NM N C++ freeminer +vector vector 1 PARAMETER N D C++ drill +vector vector 2 PARAMETER N D C++ drill +uint32_t* version bitmap from version FUNCTION NM N P N C ovs +bool vertex 0 2 connected ATTRIBUTE N D D V C++ freeminer +igraph_vector_t vertex to the left PARAMETER N P DT NM C rigraph +int vid 1 PARAMETER N D C rigraph +IntType vmax over base DECLARATION N P N C++ deeplearning4j +LongType vol Stride 1 DECLARATION NM N D C++ deeplearning4j +BigIntVector vv 0 DECLARATION N D Java drill +int* w clique 1 FUNCTION NM N D C rigraph +atomic_bool wait for reload ATTRIBUTE V P N C ovs +int wait For S3 Links Seconds ATTRIBUTE V P NM NM NPL Java Singularity +void wait for server start FUNCTION V P NM N C crow +int waiting for dht connection DECLARATION V P NM N C toxcore +Duration warmup time ATTRIBUTE NM N C++ freeminer +ALenum warn if error FUNCTION V CJ N C++ freeminer +uint32_t wc 10 DECLARATION N D C ovs +SCM weechat guile api string input for buffer FUNCTION PRE PRE PRE NM N P N C weechat +void* weechat js unload all FUNCTION PRE PRE V DT C++ weechat +SEXP when DECLARATION VM C git2r +CountDownLatch when Closed ATTRIBUTE VM V Java guava +SqlNode where ATTRIBUTE N Java drill +SqlNode where Clause ATTRIBUTE NM N Java drill +size_t where len PARAMETER NM N C toggldesktop +int which Button DECLARATION DT N C++ panda3d +int64_t while count ATTRIBUTE NM N C ccv +ccv_nnc_symbolic_graph_t while graph PARAMETER NM N C ccv +MediaType with Charset DECLARATION P N Java guava +Builder with Conf FUNCTION P N Java drill +Builder with Entries FUNCTION P NPL Java drill +void with Implicit Columns FUNCTION P NM NPL Java drill +boolean with Index ATTRIBUTE P N Java drill +MaterializedField with Path And Type FUNCTION P N CJ N Java drill +BatchSchemaBuilder with Schema Builder FUNCTION P NM N Java drill +bool with System PARAMETER P N C++ cling +Naming with Unary Operator ATTRIBUTE P NM N Java immutables +FieldReference within ATTRIBUTE P Java drill +char words to add PARAMETER NPL P V C weechat +void work 1 ATTRIBUTE N D C rigraph +CPPType wrapped around PARAMETER V P C++ panda3d +void write Data Page V 2 FUNCTION V NM N NM D Java drill +double x 3 mean ATTRIBUTE N D N C++ deeplearning4j +uint64_t x 4 ATTRIBUTE N D C facebook-repo-ds2 +uint64_t x 8 ATTRIBUTE N D C facebook-repo-ds2 +int x from PARAMETER N P C++ panda3d +png_uint_32 x pixels per unit ATTRIBUTE NM NPL P N C mgba +int x to PARAMETER N P C++ panda3d +int* X509 set1 not Before FUNCTION PRE PRE DT P C toggldesktop +xdfile_t xdf 2 PARAMETER N D C git2r +xdfenv_t xe 2 PARAMETER N D C git2r +objectlist* xodtemplate expand host groups and hosts FUNCTION PRE V NM NPL CJ NPL C naemon-core +class Xpp3 Xml Pull Parser CLASS PRE NM NM N Java Smack +class Xpp3 Xml Pull Parser Factory CLASS PRE NM NM NM N Java Smack +png_uint_32 y pixels per unit ATTRIBUTE D NPL P N C mgba +class YOLO 2 CLASS N D Java deeplearning4j +double z 2 X mag DECLARATION N P N N C++ freeminer +felem z in DECLARATION NM N C toggldesktop +void zero or null FUNCTION N CJ N Java cglib \ No newline at end of file diff --git a/main b/main index 8a68e5d..86ee7bf 100755 --- a/main +++ b/main @@ -1,139 +1,20 @@ #!/usr/bin/env python -import os, sqlite3, random, argparse +import os, argparse from datetime import datetime -from src.classifier_multiclass import perform_classification, TrainingAlgorithm -import 
pandas as pd -import numpy as np +from src.tree_based_tagger.classifier_multiclass import load_config_tree, train_tree +from src.lm_based_tagger.train_model import train_lm from src.tag_identifier import start_server -from src.download_code2vec_vectors import * -from src.feature_generator import custom_to_numeric, universal_to_custom, createFeatures -from src.create_models import createModel, stable_features, mutable_feature_list, columns_to_drop +from src.tree_based_tagger.download_code2vec_vectors import * from version import __version__ +from datasets import Dataset -# Get the directory of the current script SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) def get_version(): """Return the current version of SCANL Tagger.""" return f"SCANL Tagger version {__version__}" -def read_input(sql, features, conn): - """ - Read input data from an SQLite database and preprocess it. - - This function reads data from the specified SQL query and database connection, shuffles the rows, and then applies - a preprocessing function called 'createFeatures' to create additional features. - - Args: - sql (str): The SQL query to fetch data from the database. - conn (sqlite3.Connection): The SQLite database connection. - - Returns: - pandas.DataFrame: A DataFrame containing the preprocessed input data. - """ - input_data = pd.read_sql_query(sql, conn) - print(" -- -- -- -- Read " + str(len(input_data)) + " input rows -- -- -- -- ") - print(input_data.columns) - input_data_copy = input_data.copy() - rows = input_data_copy.values.tolist() - random.shuffle(rows) - shuffled_input_data = pd.DataFrame(rows, columns=input_data.columns) - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - input_data = createFeatures(shuffled_input_data, features, modelGensimEnglish=modelGensimEnglish, modelTokens=modelTokens, modelMethods=modelMethods) - return input_data - -def train(config): - """ - Train a part of speech tagger model using specified features and a training dataset. - This function reads data from an SQLite database, preprocesses it, and performs classification using a specified set - of features. The results are written to an output file, including information about the training process and the - distribution of labels in the training data. - Args: - config (dict): A dictionary containing configuration data. 
- Returns: - None - """ - - # Extract configuration values from the 'config' dictionary - input_file = config['input_file'] - sql_statement = config['sql_statement'] - identifier_column = config['identifier_column'] - dependent_variable = config['dependent_variable'] - pyrandom_seed = config['pyrandom_seed'] - trainingSeed = config['trainingSeed'] - classifierSeed = config['classifierSeed'] - - np.random.seed(config['npseed']) - random.seed(pyrandom_seed) - independent_variables = config['independent_variables'] - - # ############################################################### - print(" -- -- Started: Reading Database -- -- ") - connection = sqlite3.connect(input_file) - df_input = read_input(sql_statement, independent_variables, connection) - print(" -- -- Completed: Reading Input -- -- ") - # ############################################################### - - # Create an explicit copy to avoid SettingWithCopyWarning - #independent_variables.remove("EMB_FEATURES") - df_features = df_input[independent_variables].copy() - df_class = df_input[[dependent_variable]].copy() - - category_variables = [] - categorical_columns = ['NLTK_POS', 'PREV_POS', 'NEXT_POS'] - - # Safely handle categorical variables - for category_column in categorical_columns: - if category_column in df_features.columns: - category_variables.append(category_column) - df_features.loc[:, category_column] = df_features[category_column].astype(str) - - # Ensure output directories exist - output_dir = os.path.join(SCRIPT_DIR, 'output') - os.makedirs(output_dir, exist_ok=True) - - filename = os.path.join(output_dir, 'results.txt') - mode = 'a' if os.path.exists(filename) else 'w' - - with open(filename, mode) as results_text_file: - results_text_file.write(datetime.now().strftime("%H:%M:%S") + "\n") - - # Print config in a readable fashion - results_text_file.write("Configuration:\n") - for key, value in config.items(): - results_text_file.write(f"{key}: {value}\n") - results_text_file.write("\n") - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = df_features[category_column].unique() - category_map = {} - for value in unique_values: - print(value) - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - df_features.loc[:, category_column] = df_features[category_column].map(category_map) - - print(" -- -- Distribution of labels in corpus -- -- ") - print(df_class[dependent_variable].value_counts()) - results_text_file.write(f"SQL: {sql_statement}\n") - results_text_file.write(f"Features: {df_features}\n") - - algorithms = [TrainingAlgorithm.XGBOOST] - #pd.set_option('display.max_rows', None) # Show all rows - pd.set_option('display.max_columns', None) # Show all columns - pd.set_option('display.width', None) # Prevent line wrapping - pd.set_option('display.max_colwidth', None) # Show full content of each cell - - print(df_features) - perform_classification(df_features, df_class, results_text_file, - output_dir, algorithms, trainingSeed, - classifierSeed, columns_to_drop) - if __name__ == "__main__": """ Use argparse to allow the user to choose either running the tagger or training a new tagger @@ -155,47 +36,53 @@ if __name__ == "__main__": Note: If no arguments are provided or if there is an invalid argument, the script will display usage instructions. 
- - Author: Christian Newman + Version: 2.0.0 """ - parser = argparse.ArgumentParser() - + + parser = argparse.ArgumentParser(description="SCALAR identifier tagger") parser.add_argument("-v", "--version", action="store_true", help="print tagger application version") - parser.add_argument("-r", "--run", action="store_true", help="run server for part of speech tagging requests") - parser.add_argument("-t", "--train", action="store_true", help="run training set to retrain the model") - parser.add_argument("-a", "--address", nargs=1, action="store", help="configure server address", ) - parser.add_argument("--port", nargs=1, action="store", help="configure server port") - parser.add_argument("--protocol", nargs=1, action="store", help="configure whether the server uses http or https") - parser.add_argument("--words", nargs=1, action="store", help="provide path to a list of acceptable abbreviations") + # Core run/train model arguments + parser.add_argument("--mode", choices=["train", "run"], required=True, help="Choose to 'train' or 'run' the model") + parser.add_argument("--model_type", choices=["tree_based", "lm_based"], required=True, help="Specify which model type to use") + parser.add_argument("--input_path", type=str, help="Path to TSV file for training") + parser.add_argument("--model_dir", type=str, default="models", help="Directory to load/save model") + parser.add_argument("--config_path", type=str, default="serve.json", help="Path to config JSON (used in run mode)") + + # Run-specific options + parser.add_argument("--port", type=int, help="Port to bind server") + parser.add_argument("--protocol", type=str, help="Protocol (http/https)") + parser.add_argument("--word", type=str, help="Word used in config") + parser.add_argument("--address", type=str, help="Server address") args = parser.parse_args() - + if args.version: print(get_version()) - elif args.run: - download_files() - temp_config = {} - print(args) - if args.address != None: temp_config["address"] = args.address[0] - if args.port != None: temp_config["port"] = args.port[0] - if args.protocol != None: temp_config["protocol"] = args.protocol[0] - if args.words != None: temp_config["words"] = args.words[0] - start_server(temp_config) - elif args.train: - download_files() - # Define a configuration dictionary and pass it to the train function - config = { - 'input_file': os.path.join(SCRIPT_DIR, 'input', 'scanl_tagger_training_db_11_29_2024.db'), - 'sql_statement': 'select * from training_set', - 'identifier_column': "ID", - 'dependent_variable': 'CORRECT_TAG', - 'pyrandom_seed': random.randint(0, 2**32 - 1), - 'trainingSeed': random.randint(0, 2**32 - 1), - 'classifierSeed': random.randint(0, 2**32 - 1), - 'npseed': random.randint(0, 2**32 - 1), - 'independent_variables': stable_features + mutable_feature_list - } - train(config) + elif args.mode == "train": + if args.model_type == "tree_based": + config = load_config_tree(SCRIPT_DIR) + download_files() + train_tree(config) + elif args.model_type == "lm_based": + download_files() + train_lm(SCRIPT_DIR) + + elif args.mode == "run": + + # Inject overrides + config["model_type"] = args.model_type + config["model_dir"] = args.model_dir + + if args.port: + config["port"] = args.port + if args.protocol: + config["protocol"] = args.protocol + if args.word: + config["word"] = args.word + if args.address: + config["address"] = args.address + + start_server(temp_config=config) else: - parser.print_usage() + parser.print_usage() \ No newline at end of file diff --git a/requirements.txt 
b/requirements.txt index 450f86d..51e31b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ filelock==3.17.0 flair==0.15.0 Flask==3.1.0 fonttools==4.55.6 -fsspec==2023.5.0 +fsspec==2024.12.0 ftfy==6.3.1 gdown==5.2.0 gensim==4.3.3 @@ -42,6 +42,18 @@ mpmath==1.3.0 networkx==3.4.2 nltk==3.9.1 numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 packaging==24.2 pandas==2.2.3 pillow==11.1.0 @@ -81,6 +93,7 @@ torch==2.5.1 tqdm==4.67.1 transformer-smaller-training-vocab==0.4.0 transformers==4.48.1 +triton==3.1.0 typing_extensions==4.12.2 tzdata==2025.1 urllib3==2.3.0 @@ -88,4 +101,4 @@ waitress==3.0.2 wcwidth==0.2.13 Werkzeug==3.1.3 Wikipedia-API==0.8.1 -wrapt==1.17.2 +wrapt==1.17.2 \ No newline at end of file diff --git a/src/lm_based_tagger/__init__.py b/src/lm_based_tagger/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py new file mode 100644 index 0000000..874a88a --- /dev/null +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -0,0 +1,187 @@ +# distilbert_preprocessing.py + +import re +from nltk import pos_tag +import nltk +from difflib import SequenceMatcher +import pandas as pd +from datasets import Dataset + +# Download once (we’ll just do it quietly here) +nltk.download('averaged_perceptron_tagger_eng', quiet=True) +nltk.download('universal_tagset', quiet=True) + +# === Constants === +VOWELS = set("aeiou") +LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} + +# Map of context strings ➔ “feature tokens” +CONTEXT_MAP = { + "FUNCTION": "@func", + "PARAMETER": "@param", + "ATTRIBUTE": "@attr", + "DECLARATION": "@decl", + "CLASS": "@class" +} + + +def detect_hungarian_prefix(first_token): + """ + If the first token starts with 1–3 letters followed by an uppercase or underscore, + return "@hung_". Otherwise "@hung_none". + """ + m = re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) + if m: + return f"@hung_{m.group(1).lower()}" + return "@hung_none" + + +def detect_digit_feature(tokens): + """ + If any token has a digit, return "@has_digit", else "@no_digit". + """ + for token in tokens: + if any(char.isdigit() for char in token): + return "@has_digit" + return "@no_digit" + + +def consonant_vowel_ratio_bucket(tokens): + """ + Compute the average consonant/vowel ratio across all alphabetic tokens, + then bucket into low/mid/high. + """ + def ratio(tok): + tok_lower = tok.lower() + num_vowels = sum(1 for c in tok_lower if c in VOWELS) + num_consonants = sum(1 for c in tok_lower if c.isalpha() and c not in VOWELS) + return num_consonants / (num_vowels + 1e-5) + + ratios = [ratio(tok) for tok in tokens if tok.isalpha()] + if not ratios: + return "@cvr_none" + + avg_ratio = sum(ratios) / len(ratios) + if avg_ratio < 1.5: + return "@cvr_low" + elif avg_ratio < 3.0: + return "@cvr_mid" + else: + return "@cvr_high" + + +def system_prefix_similarity(first_token, system_name): + """ + Compute a SequenceMatcher ratio against the system name, then bucket: + >0.9 ➔ "@sim_high", >0.6 ➔ "@sim_mid", >0.3 ➔ "@sim_low", else "@sim_none". 
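+    As a worked illustration (example values, not drawn from the training data):
+    SequenceMatcher(None, "drill", "drill").ratio() is 1.0, so an identifier whose first
+    token matches its system name buckets to "@sim_high", while a first token that shares
+    no characters with the system name has ratio 0.0 and falls through to "@sim_none".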
+ """ + if not first_token or not system_name: + return "@sim_none" + sys_lower = system_name.strip().lower() + tok_lower = first_token.strip().lower() + r = SequenceMatcher(None, tok_lower, sys_lower).ratio() + if r > 0.9: + return "@sim_high" + elif r > 0.6: + return "@sim_mid" + elif r > 0.3: + return "@sim_low" + else: + return "@sim_none" + + +def prepare_dataset(df: pd.DataFrame, label2id: dict): + """ + Takes a DataFrame with columns: + - "tokens" : List[str] (split identifier) + - "tags" : List[str] (gold PoS tags, same length as tokens) + - "CONTEXT" : e.g. "FUNCTION", "PARAMETER", etc. + - "SYSTEM_NAME" : string + + Returns a HuggingFace `datasets.Dataset` with two fields: + - "tokens" : List[List[str]] (the FULL token sequence, including exactly 7 feature tokens + position tokens + identifier tokens) + - "ner_tags" : List[List[int]] (the aligned label IDs, with -100 in front for each feature token) + """ + rows = [] + for _, row in df.iterrows(): + tokens = row["tokens"] + tags = row["tags"] + + # 1. Build 7 feature tokens (context, system, hungarian, cvr, digit, sim, nltk) + context_token = CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown") + system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" + hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" + cvr_token = consonant_vowel_ratio_bucket(tokens) + digit_token = detect_digit_feature(tokens) + sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" + + # 2. NLTK POS tags (universal tagset) + nltk_tags = pos_tag(tokens, tagset="universal") + universal_tags = [tag.lower() for _, tag in nltk_tags] + nltk_feature = f"@nltk_{'-'.join(universal_tags)}" + + # 3. Position tags: interleave with identifier tokens + length = len(tokens) + if length == 1: + pos_tokens = ["@pos_2"] + else: + pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] + tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] + + # 4. Build the “full” token list (7 feature tokens + 2*len(tokens) position‐identifier tokens) + full_tokens = [ + context_token, + system_token, + hungarian_token, + cvr_token, + digit_token, + sim_token, + nltk_feature, + ] + tokens_with_pos + + # 5. Build the aligned labels array: + # - First 7 entries → -100 (because they are feature tokens) + # - Then for each identifier token, [-100, label2id[tag]] + ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] + full_labels = [-100] * 7 + ner_tags_with_pos + + rows.append({ + "tokens": full_tokens, + "ner_tags": full_labels + }) + + return Dataset.from_dict({ + "tokens": [r["tokens"] for r in rows], + "ner_tags": [r["ner_tags"] for r in rows] + }) + + +def tokenize_and_align_labels(example, tokenizer): + """ + example: a dict with + - "tokens" : List[str] (the full token sequence, including exactly 7 feature tokens) + - "ner_tags" : List[int] (same length as above) + + We run `tokenizer(example["tokens"], is_split_into_words=True, truncation=True)`, + then align `word_ids()` with `example["ner_tags"]` exactly as in test.py. 
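+    Illustrative example (assumed values): if the tail of example["tokens"] is
+    ["@pos_2", "counter"] and the tail of example["ner_tags"] is [-100, label2id["N"]],
+    then sub-tokens whose word_id points at a feature or position token keep -100,
+    every sub-token of "counter" receives label2id["N"], and the special [CLS]/[SEP]
+    tokens (word_id None) are likewise labeled -100.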
+ """ + tokenized = tokenizer( + example["tokens"], + truncation=True, + is_split_into_words=True + ) + + labels = [] + word_ids = tokenized.word_ids() + + for word_id in word_ids: + if word_id is None: + labels.append(-100) + elif word_id < len(example["ner_tags"]): + labels.append(example["ner_tags"][word_id]) + else: + # Just in case of truncation + labels.append(-100) + + tokenized["labels"] = labels + return tokenized diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py new file mode 100644 index 0000000..bf3c4b7 --- /dev/null +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -0,0 +1,178 @@ +import re +import torch +from nltk import pos_tag +import nltk +from difflib import SequenceMatcher +from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification + +# Make sure we have the same NLTK tagset +nltk.download('averaged_perceptron_tagger_eng', quiet=True) +nltk.download('universal_tagset', quiet=True) + +VOWELS = set("aeiou") +CONTEXT_MAP = { + "FUNCTION": "@func", + "PARAMETER": "@param", + "ATTRIBUTE": "@attr", + "DECLARATION": "@decl", + "CLASS": "@class" +} + + +def detect_hungarian_prefix(first_token): + m = re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) + if m: + return f"@hung_{m.group(1).lower()}" + return "@hung_none" + + +def detect_digit_feature(tokens): + for token in tokens: + if any(char.isdigit() for char in token): + return "@has_digit" + return "@no_digit" + + +def consonant_vowel_ratio_bucket(tokens): + def ratio(tok): + tok_lower = tok.lower() + num_vowels = sum(1 for c in tok_lower if c in VOWELS) + num_consonants = sum(1 for c in tok_lower if c.isalpha() and c not in VOWELS) + return num_consonants / (num_vowels + 1e-5) + + ratios = [ratio(tok) for tok in tokens if tok.isalpha()] + if not ratios: + return "@cvr_none" + avg_ratio = sum(ratios) / len(ratios) + if avg_ratio < 1.5: + return "@cvr_low" + elif avg_ratio < 3.0: + return "@cvr_mid" + else: + return "@cvr_high" + + +def normalize_type(type_str): + ts = type_str.strip().lower() + ts = ts.replace("*", "_ptr") + ts = ts.replace(" ", "_") + return f"@{ts}" + + +def normalize_language(lang_str): + return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") + + +def system_prefix_similarity(first_token, system_name): + if not first_token or not system_name: + return "@sim_none" + sys_lower = system_name.strip().lower() + tok_lower = first_token.strip().lower() + r = SequenceMatcher(None, tok_lower, sys_lower).ratio() + if r > 0.9: + return "@sim_high" + elif r > 0.6: + return "@sim_mid" + elif r > 0.3: + return "@sim_low" + else: + return "@sim_none" + + +class DistilBertTagger: + def __init__(self, model_path: str): + """ + Expects `model_path` to be a folder where the fine-tuned DistilBertForTokenClassification + (and its tokenizer) have been saved via `trainer.save_model(...)` and `tokenizer.save_pretrained(...)`. 
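+        A minimal usage sketch (the directory name and identifier below are hypothetical):
+
+            tagger = DistilBertTagger("output/best_model")
+            label_ids = tagger.tag_identifier(
+                ["get", "user", "name"],      # pre-split identifier tokens
+                "FUNCTION", "String", "Java", "Openfire"
+            )
+
+        In this version the call returns numeric label IDs, which can be mapped to tag
+        strings through the model's id2label as noted in tag_identifier's docstring.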
+ """ + self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path) + self.model = DistilBertForTokenClassification.from_pretrained(model_path) + self.model.eval() + + def tag_identifier(self, tokens, context, type_str, language, system_name): + """ + 1) Build the “feature tokens + position tokens + identifier tokens” sequence + 2) Tokenize with `is_split_into_words=True` + 3) Run the model, take argmax over token logits + 4) Align via `word_ids()`, skipping: + - Any word_id = None + - Any word_id < 9 (because first 9 tokens were “feature tokens” => labels = -100) + - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier‐word)” pair) + 5) Return a list of numeric labels. (If you want strings, you can map via id2label externally.) + """ + + # 1. Re–compute exactly the same feature tokens as in training: + context_token = CONTEXT_MAP.get(context.strip().upper(), "@unknown") + system_token = f"@system_{system_name.strip().lower().replace(' ', '_')}" + hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" + cvr_token = consonant_vowel_ratio_bucket(tokens) + digit_token = detect_digit_feature(tokens) + sim_token = system_prefix_similarity(tokens[0], system_name) if tokens else "@sim_none" + type_token = normalize_type(type_str) + lang_token = normalize_language(language) + + # Position tags for each identifier token + length = len(tokens) + if length == 1: + pos_tokens = ["@pos_2"] + else: + pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] + + # NLTK POS feature + nltk_tags = pos_tag(tokens, tagset="universal") + universal_tags = [tag.lower() for _, tag in nltk_tags] + nltk_feature = f"@nltk_{'-'.join(universal_tags)}" + + # Interleave pos_tokens + identifier tokens + tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] + + # Build the full input token sequence (exactly what training saw): + input_tokens = [ + context_token, + system_token, + hungarian_token, + cvr_token, + digit_token, + sim_token, + type_token, + lang_token, + nltk_feature + ] + tokens_with_pos + + # 2. Tokenize + encoded = self.tokenizer( + input_tokens, + is_split_into_words=True, + return_tensors="pt", + truncation=True, + padding=True + ) + + # 3. Inference + with torch.no_grad(): + logits = self.model( + input_ids=encoded["input_ids"], + attention_mask=encoded["attention_mask"] + )[0] + + # 4. Take argmax, then align via word_ids() + predictions = torch.argmax(logits, dim=-1).squeeze().tolist() + word_ids = encoded.word_ids() + + pred_labels = [] + previous_word_idx = None + + for idx, word_idx in enumerate(word_ids): + # Skip if special token (None), or if it's part of the first 9 “feature tokens” + if word_idx is None or word_idx < 9: + continue + # Skip if it’s the same word_idx as the previous (to avoid sub-token duplicates) + if word_idx == previous_word_idx: + continue + + pred_labels.append(predictions[idx]) + previous_word_idx = word_idx + + # Now, pred_labels is a list of numeric IDs (length == len(tokens)), + # in the same order as your original “tokens” list. 
+ return pred_labels diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py new file mode 100644 index 0000000..cb9358a --- /dev/null +++ b/src/lm_based_tagger/train_model.py @@ -0,0 +1,127 @@ +import os +import pandas as pd +from sklearn.model_selection import train_test_split +import torch +from transformers import ( + Trainer, + TrainingArguments, + DistilBertTokenizerFast, + DistilBertConfig, + DistilBertForTokenClassification, + DataCollatorForTokenClassification +) +from datasets import Dataset +from src.lm_based_tagger.distilbert_preprocessing import prepare_dataset, tokenize_and_align_labels + +# === Labels & Mappings === +LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] +LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} +ID2LABEL = {i: label for label, i in LABEL2ID.items()} + +def train_lm(script_dir: str): + input_path = os.path.join(script_dir, "input", "tagger_data.tsv") + output_dir = os.path.join(script_dir, "output") + os.makedirs(output_dir, exist_ok=True) + + # 1) Read TSV & build tokens/tags lists + df = pd.read_csv(input_path, sep="\t", dtype=str).dropna(subset=["SPLIT", "GRAMMAR_PATTERN"]) + df = df[df["SPLIT"].str.strip().astype(bool)] + df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split()) + df["tags"] = df["GRAMMAR_PATTERN"].apply(lambda x: x.strip().split()) + df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)] + + # 2) Train/Test split (stratify by CONTEXT) + train_df, test_df = train_test_split( + df, test_size=0.15, random_state=42, stratify=df["CONTEXT"] + ) + + # 3) Upsample low-frequency tags (in training set only) + low_freq_tags = {"CJ", "VM", "PRE", "V"} + low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in low_freq_tags for t in tags))] + train_df = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) + + # 4) Tokenizer + tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") + + # 5) Convert each split into a HF Dataset via the shared prepare_dataset(...) 
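+    #    Illustrative shape of one prepared row (example identifier "get user name" in a
+    #    FUNCTION context; the @system_*, @cvr_*, @sim_* and @nltk_* values are placeholders):
+    #      tokens   = ["@func", "@system_*", "@hung_none", "@cvr_*", "@no_digit", "@sim_*",
+    #                  "@nltk_*", "@pos_0", "get", "@pos_1", "user", "@pos_2", "name"]
+    #      ner_tags = [-100]*7 + [-100, id_v, -100, id_nm, -100, id_n]
+    #    i.e. label IDs (here e.g. V/NM/N) appear only on identifier words, never on
+    #    feature or position tokens.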
+ train_dataset = prepare_dataset(train_df, LABEL2ID) + test_dataset = prepare_dataset(test_df, LABEL2ID) + + # 6) Tokenize + align labels + tokenized_train = train_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False + ) + tokenized_test = test_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False + ) + + # 7) Build config & model using uncased vocab + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(LABEL_LIST), + id2label=ID2LABEL, + label2id=LABEL2ID + ) + model = DistilBertForTokenClassification.from_pretrained( + "distilbert-base-uncased", + config=config + ) + + # 8) Training arguments + training_args = TrainingArguments( + output_dir=output_dir, + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=10, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + logging_dir=os.path.join(output_dir, "logs"), + report_to="none", + seed=42 + ) + + # 9) Collate Data + data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) + + # 10) Macro‐F1 computation + def compute_metrics(eval_pred): + from sklearn.metrics import f1_score + logits, labels = eval_pred + preds = logits.argmax(axis=-1) + + true_preds = [] + true_labels = [] + for pred_row, label_row in zip(preds, labels): + for p, l in zip(pred_row, label_row): + if l != -100: + true_preds.append(p) + true_labels.append(l) + + macro_f1 = f1_score(true_labels, true_preds, average="macro") + return {"eval_macro_f1": macro_f1} + + # 11) Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train, + eval_dataset=tokenized_test, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics + ) + + # 12) Train & save + trainer.train() + trainer.save_model(output_dir) + tokenizer.save_pretrained(output_dir) diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 305e390..fdc42fc 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -3,16 +3,21 @@ import joblib import nltk import pandas as pd -from src.feature_generator import createFeatures, universal_to_custom, custom_to_numeric -from flask import Flask +from flask import Flask, request from waitress import serve from spiral import ronin import json import sqlite3 -from src.create_models import createModel, stable_features, mutable_feature_list +from src.tree_based_tagger.feature_generator import createFeatures, universal_to_custom, custom_to_numeric +from src.tree_based_tagger.create_models import createModel, stable_features, mutable_feature_list +from src.lm_based_tagger.distilbert_tagger import DistilBertTagger + app = Flask(__name__) SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +model_type = None +lm_model = None + class ModelData: def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: """ @@ -28,7 +33,6 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.ModelMethods = modelMethods self.ModelGensimEnglish = modelGensimEnglish self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') class AppCache: def __init__(self, Path) -> None: @@ -127,7 +131,7 @@ def load(self): def find(self, item): return item in self.Words -def 
initialize_model(): +def initialize_model(selected_model_type): """ Initialize and load word vectors for the application, and load a word count DataFrame. @@ -137,23 +141,25 @@ def initialize_model(): Returns: tuple: (ModelData, WORD_COUNT DataFrame) """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + global model_type, lm_model + model_type = selected_model_type + if model_type == "tree_based": + print("Loading word vectors!!") + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) + print("Word vectors loaded!!") + word_count_path = os.path.join("input", "word_count.json") + if os.path.exists(word_count_path): + print(f"Loading word count data from {word_count_path}...") + word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() + word_count_df.columns = ['word', 'log_frequency'] + else: + print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") + word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) + app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + elif model_type == "lm_based": + print("Loading DistilBERT tagger...") + lm_model = DistilBertTagger(SCRIPT_DIR) + print("DistilBERT tagger loaded!") def start_server(temp_config = {}): """ @@ -169,12 +175,13 @@ def start_server(temp_config = {}): None """ print('initializing model...') - initialize_model() + selected_model = temp_config.get("model_type", "tree_based") + initialize_model(selected_model) print("loading cache...") if not os.path.isdir("cache"): os.mkdir("cache") - print("laoding dictionary") + print("loading dictionary") app.english_words = set(w.lower() for w in nltk.corpus.words.words()) #insert english words from words/en.txt @@ -248,6 +255,10 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) cache = AppCache("cache/"+cache_id+".db3") cache.load() + system_name = request.args.get("system_name", default="") + programming_language = request.args.get("language", default="") + data_type = request.args.get("type", default="") + #TODO: update this documentation """ Process a web request to analyze an identifier within a specific context. 
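(A minimal client-side sketch of how the three new query parameters can be supplied; the
route path below is hypothetical, since the @app.route pattern for listen() is outside
this hunk:

    import requests
    resp = requests.get(
        "http://127.0.0.1:5000/probe/getUserName/FUNCTION",   # hypothetical cache_id/identifier/context path
        params={"system_name": "Openfire", "language": "Java", "type": "String"},
    )
    print(resp.json())

The three parameters are read via request.args.get with empty-string defaults and passed
on to the lm-based tagger as feature inputs.)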
@@ -267,7 +278,20 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) # get the start time start_time = time.perf_counter() - + + if model_type == "lm_based": + result = { + "words": [] + } + tags = lm_model.predict(words, identifier_context, programming_language, data_type, system_name) + for word, tag in zip(words, tags): + dictionary = dictionary_lookup(word) + result["words"].append({word: {"tag": tag, "dictionary": dictionary}}) + tag_time = time.perf_counter() - start_time + if cache_id: + AppCache(f"cache/{cache_id}.db3").add(identifier_name, result, identifier_context, tag_time) + return result + # Split identifier_name into words words = ronin.split(identifier_name) diff --git a/src/tree_based_tagger/__init__.py b/src/tree_based_tagger/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/classifier_multiclass.py b/src/tree_based_tagger/classifier_multiclass.py similarity index 64% rename from src/classifier_multiclass.py rename to src/tree_based_tagger/classifier_multiclass.py index 104926f..66378d3 100644 --- a/src/classifier_multiclass.py +++ b/src/tree_based_tagger/classifier_multiclass.py @@ -7,14 +7,19 @@ from sklearn.metrics import f1_score from sklearn.metrics import matthews_corrcoef from sklearn.metrics import make_scorer -from sklearn.metrics import classification_report, precision_recall_fscore_support -from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold, cross_val_predict +from sklearn.metrics import classification_report +from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict from sklearn.model_selection import train_test_split from sklearn.inspection import permutation_importance +from src.tree_based_tagger.feature_generator import custom_to_numeric, universal_to_custom, createFeatures +from src.tree_based_tagger.create_models import createModel, stable_features, mutable_feature_list, columns_to_drop import pandas as pd from enum import Enum -import src.feature_generator import multiprocessing +import os, sqlite3, random +import pandas as pd +import numpy as np +from datetime import datetime class TrainingAlgorithm(Enum): RANDOM_FOREST = "RandomForest" @@ -62,6 +67,138 @@ def __init__(self, X_train, X_test, y_train, y_test, X_train_original, X_test_or self.X_test_original = X_test_original self.labels = labels +def load_config_tree(SCRIPT_DIR): + # Mimic Python-based config instead of JSON + config = { + 'script_dir': SCRIPT_DIR, + 'input_file': os.path.join(SCRIPT_DIR, 'input', 'scanl_tagger_training_db_11_29_2024.db'), + 'sql_statement': 'select * from training_set', + 'identifier_column': "ID", + 'dependent_variable': 'CORRECT_TAG', + 'pyrandom_seed': random.randint(0, 2**32 - 1), + 'trainingSeed': random.randint(0, 2**32 - 1), + 'classifierSeed': random.randint(0, 2**32 - 1), + 'npseed': random.randint(0, 2**32 - 1), + 'independent_variables': stable_features + mutable_feature_list + } + print(config) + return config + +def read_input(sql, features, conn, config): + """ + Read input data from an SQLite database and preprocess it. + + This function reads data from the specified SQL query and database connection, shuffles the rows, and then applies + a preprocessing function called 'createFeatures' to create additional features. + + Args: + sql (str): The SQL query to fetch data from the database. + conn (sqlite3.Connection): The SQLite database connection. + + Returns: + pandas.DataFrame: A DataFrame containing the preprocessed input data. 
+ """ + input_data = pd.read_sql_query(sql, conn) + print(" -- -- -- -- Read " + str(len(input_data)) + " input rows -- -- -- -- ") + print(input_data.columns) + input_data_copy = input_data.copy() + rows = input_data_copy.values.tolist() + random.shuffle(rows) + shuffled_input_data = pd.DataFrame(rows, columns=input_data.columns) + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=config['script_dir']) + input_data = createFeatures(shuffled_input_data, features, modelGensimEnglish=modelGensimEnglish, modelTokens=modelTokens, modelMethods=modelMethods) + return input_data + +def train_tree(config): + """ + Train a part of speech tagger model using specified features and a training dataset. + This function reads data from an SQLite database, preprocesses it, and performs classification using a specified set + of features. The results are written to an output file, including information about the training process and the + distribution of labels in the training data. + Args: + config (dict): A dictionary containing configuration data. + Returns: + None + """ + + # Extract configuration values from the 'config' dictionary + input_file = config['input_file'] + sql_statement = config['sql_statement'] + identifier_column = config['identifier_column'] + dependent_variable = config['dependent_variable'] + pyrandom_seed = config['pyrandom_seed'] + trainingSeed = config['trainingSeed'] + classifierSeed = config['classifierSeed'] + + np.random.seed(config['npseed']) + random.seed(pyrandom_seed) + independent_variables = config['independent_variables'] + + # ############################################################### + print(" -- -- Started: Reading Database -- -- ") + connection = sqlite3.connect(input_file) + df_input = read_input(sql_statement, independent_variables, connection, config) + print(" -- -- Completed: Reading Input -- -- ") + # ############################################################### + + # Create an explicit copy to avoid SettingWithCopyWarning + #independent_variables.remove("EMB_FEATURES") + df_features = df_input[independent_variables].copy() + df_class = df_input[[dependent_variable]].copy() + + category_variables = [] + categorical_columns = ['NLTK_POS', 'PREV_POS', 'NEXT_POS'] + + # Safely handle categorical variables + for category_column in categorical_columns: + if category_column in df_features.columns: + category_variables.append(category_column) + df_features.loc[:, category_column] = df_features[category_column].astype(str) + + # Ensure output directories exist + output_dir = os.path.join(config['script_dir'], 'output') + os.makedirs(output_dir, exist_ok=True) + + filename = os.path.join(output_dir, 'results.txt') + mode = 'a' if os.path.exists(filename) else 'w' + + with open(filename, mode) as results_text_file: + results_text_file.write(datetime.now().strftime("%H:%M:%S") + "\n") + + # Print config in a readable fashion + results_text_file.write("Configuration:\n") + for key, value in config.items(): + results_text_file.write(f"{key}: {value}\n") + results_text_file.write("\n") + + for category_column in category_variables: + # Explicitly handle categorical conversion + unique_values = df_features[category_column].unique() + category_map = {} + for value in unique_values: + print(value) + if value in universal_to_custom: + category_map[value] = custom_to_numeric[universal_to_custom[value]] + else: + category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories + + df_features.loc[:, category_column] = 
df_features[category_column].map(category_map) + + print(" -- -- Distribution of labels in corpus -- -- ") + print(df_class[dependent_variable].value_counts()) + results_text_file.write(f"SQL: {sql_statement}\n") + results_text_file.write(f"Features: {df_features}\n") + + algorithms = [TrainingAlgorithm.XGBOOST] + #pd.set_option('display.max_rows', None) # Show all rows + pd.set_option('display.max_columns', None) # Show all columns + pd.set_option('display.width', None) # Prevent line wrapping + pd.set_option('display.max_colwidth', None) # Show full content of each cell + + print(df_features) + perform_classification(df_features, df_class, results_text_file, + output_dir, algorithms, trainingSeed, + classifierSeed, columns_to_drop) def build_datasets(X, y, output_directory, trainingSeed): # Ensure the output directory exists os.makedirs(output_directory, exist_ok=True) diff --git a/src/create_models.py b/src/tree_based_tagger/create_models.py similarity index 100% rename from src/create_models.py rename to src/tree_based_tagger/create_models.py diff --git a/src/download_code2vec_vectors.py b/src/tree_based_tagger/download_code2vec_vectors.py similarity index 100% rename from src/download_code2vec_vectors.py rename to src/tree_based_tagger/download_code2vec_vectors.py diff --git a/src/feature_generator.py b/src/tree_based_tagger/feature_generator.py similarity index 100% rename from src/feature_generator.py rename to src/tree_based_tagger/feature_generator.py From 347ef4edd46cfea2be70f255be7aaddd4102c48e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Mon, 2 Jun 2025 21:57:21 -0400 Subject: [PATCH 32/51] Prepare to re-add kfold --- src/lm_based_tagger/train_model.py | 67 ++++++++++++++++++-------- src/tree_based_tagger/create_models.py | 45 ----------------- 2 files changed, 47 insertions(+), 65 deletions(-) diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index cb9358a..dfa8b25 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -13,6 +13,9 @@ from datasets import Dataset from src.lm_based_tagger.distilbert_preprocessing import prepare_dataset, tokenize_and_align_labels +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print("Using device:", device) + # === Labels & Mappings === LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} @@ -69,26 +72,50 @@ def train_lm(script_dir: str): config=config ) - # 8) Training arguments - training_args = TrainingArguments( - output_dir=output_dir, - evaluation_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=10, - weight_decay=0.01, - warmup_ratio=0.1, - lr_scheduler_type="cosine", - load_best_model_at_end=True, - metric_for_best_model="eval_macro_f1", - greater_is_better=True, - save_total_limit=1, - logging_dir=os.path.join(output_dir, "logs"), - report_to="none", - seed=42 - ) + if device == "cpu": + # 8) Training arguments + training_args = TrainingArguments( + output_dir=output_dir, + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=10, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + 
logging_dir=os.path.join(output_dir, "logs"), + report_to="none", + seed=42 + ) + else: + training_args = TrainingArguments( + output_dir=output_dir, + eval_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=4, # ↓ reduce to fit in VRAM + per_device_eval_batch_size=4, + gradient_accumulation_steps=4, # simulates batch size of 16 + num_train_epochs=10, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + save_total_limit=1, + metric_for_best_model="eval_macro_f1", # or "eval_loss" if macro F1 isn't computed + greater_is_better=True, # set to False if using loss + logging_dir=os.path.join(output_dir, "logs"), + report_to="none", + seed=42, + fp16=False, + dataloader_pin_memory=False, # no benefit if no CUDA pinning + ) # 9) Collate Data data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) diff --git a/src/tree_based_tagger/create_models.py b/src/tree_based_tagger/create_models.py index 6a13b66..451d697 100644 --- a/src/tree_based_tagger/create_models.py +++ b/src/tree_based_tagger/create_models.py @@ -1,7 +1,5 @@ import gensim.downloader as api -from gensim.models import KeyedVectors as word2vec import json, os -from gensim.models import KeyedVectors import logging #'VERB_SCORE', 'DET_SCORE', 'ENGLISHV_SCORE', 'POSITION_RATIO','METHODV_SCORE', 'CONTAINSLISTVERB' stable_features = ['WORD', 'SPLIT_IDENTIFIER', 'CONTEXT_NUMBER'] #'LANGUAGE' 'PREP_SCORE' 'CONTAINSLISTVERB','CONTAINSCLOSEDSET' @@ -75,47 +73,4 @@ def createModel(pklFile="", rootDir=""): method_txt_path = os.path.join(rootDir, 'code2vec', 'target_vecs.txt') method_native_path = os.path.join(rootDir, 'code2vec', 'target_vecs.kv') - return modelGensimTokens, modelGensimMethods, modelGensimEnglish - - # Helper function to load models safely - def load_model(txt_path, native_path, model_name): - """ - Load a word vector model, converting from text format if necessary. - - Args: - txt_path (str): Path to the text-based word vectors. - native_path (str): Path to the native .kv format file. - model_name (str): Name of the model for logging. - - Returns: - KeyedVectors or None: The loaded model, or None if unavailable. - """ - try: - if os.path.exists(native_path): - logger.info(f"Loading {model_name} from native format...") - return KeyedVectors.load(native_path) - - elif os.path.exists(txt_path): - logger.info(f"Native format for {model_name} not found. Converting from text format...") - model = KeyedVectors.load_word2vec_format(txt_path, binary=False) - try: - model.save(native_path) - logger.info(f"{model_name} vectors converted and saved to {native_path}") - except PermissionError: - logger.warning(f"Permission denied when saving {model_name} to {native_path}. Using in-memory only.") - return model - - else: - logger.warning(f"{model_name} vector file not found at {txt_path} or {native_path}. 
Skipping.") - return None - - except Exception as e: - logger.warning(f"Failed to load {model_name}: {e}") - return None - - # Load models with the new safe function - modelGensimTokens = load_model(token_txt_path, token_native_path, "Token vectors") - modelGensimMethods = load_model(method_txt_path, method_native_path, "Method vectors") - - logger.info("Model loading complete.") return modelGensimTokens, modelGensimMethods, modelGensimEnglish \ No newline at end of file From 2417e49cc4f9758fd785bd282ce2a4683624d5c4 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Tue, 3 Jun 2025 10:42:17 -0400 Subject: [PATCH 33/51] Load model when server runs, listen for url --- main | 34 ++- src/lm_based_tagger/distilbert_tagger.py | 15 +- src/lm_based_tagger/train_model.py | 357 ++++++++++++++++------- src/tag_identifier.py | 145 ++++----- src/tree_based_tagger/create_models.py | 2 +- 5 files changed, 339 insertions(+), 214 deletions(-) diff --git a/main b/main index 86ee7bf..07d795b 100755 --- a/main +++ b/main @@ -4,6 +4,7 @@ import os, argparse from datetime import datetime from src.tree_based_tagger.classifier_multiclass import load_config_tree, train_tree from src.lm_based_tagger.train_model import train_lm +from src.lm_based_tagger.distilbert_tagger import DistilBertTagger from src.tag_identifier import start_server from src.tree_based_tagger.download_code2vec_vectors import * from version import __version__ @@ -69,20 +70,29 @@ if __name__ == "__main__": train_lm(SCRIPT_DIR) elif args.mode == "run": + if args.model_type == "tree_based": + config = load_config_tree() + # Inject overrides + download_files() + config["model_type"] = args.model_type + config["model_dir"] = args.model_dir - # Inject overrides - config["model_type"] = args.model_type - config["model_dir"] = args.model_dir + if args.port: + config["port"] = args.port + if args.protocol: + config["protocol"] = args.protocol + if args.word: + config["word"] = args.word + if args.address: + config["address"] = args.address - if args.port: - config["port"] = args.port - if args.protocol: - config["protocol"] = args.protocol - if args.word: - config["word"] = args.word - if args.address: - config["address"] = args.address + start_server(temp_config=config) + elif args.model_type == "lm_based": + start_server(temp_config={ + 'script_dir': SCRIPT_DIR, + 'model': os.path.join(SCRIPT_DIR, 'output', 'best_model'), + 'model_type':'lm_based' + }) - start_server(temp_config=config) else: parser.print_usage() \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index bf3c4b7..322d3e0 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -89,6 +89,10 @@ def __init__(self, model_path: str): self.model = DistilBertForTokenClassification.from_pretrained(model_path) self.model.eval() + # ── Extract id2label from the saved config.json ── + # model.config.id2label maps string keys ("0", "1", ...) to tag names (e.g. "N", "V", "PRE", ...) 
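+        # (Illustrative: the saved config.json serializes id2label with string keys, e.g.
+        #  {"id2label": {"0": "CJ", "1": "D", ...}}, hence the int() cast so the dict can be
+        #  indexed directly with argmax outputs.)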
+ self.id2label = { int(k): v for k, v in self.model.config.id2label.items() } + def tag_identifier(self, tokens, context, type_str, language, system_name): """ 1) Build the “feature tokens + position tokens + identifier tokens” sequence @@ -97,8 +101,8 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): 4) Align via `word_ids()`, skipping: - Any word_id = None - Any word_id < 9 (because first 9 tokens were “feature tokens” => labels = -100) - - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier‐word)” pair) - 5) Return a list of numeric labels. (If you want strings, you can map via id2label externally.) + - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier-word)” pair) + 5) Return a list of string labels by mapping numeric IDs through `self.id2label`. """ # 1. Re–compute exactly the same feature tokens as in training: @@ -173,6 +177,7 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): pred_labels.append(predictions[idx]) previous_word_idx = word_idx - # Now, pred_labels is a list of numeric IDs (length == len(tokens)), - # in the same order as your original “tokens” list. - return pred_labels + # 5. Map numeric IDs → string tags via self.id2label + pred_tag_strings = [ self.id2label[label_id] for label_id in pred_labels ] + + return pred_tag_strings diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index dfa8b25..5f2db21 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -1,154 +1,297 @@ +# train_model.py + import os +import time +import random + +import numpy as np import pandas as pd -from sklearn.model_selection import train_test_split import torch + +from sklearn.model_selection import train_test_split, KFold +from sklearn.metrics import f1_score, accuracy_score, classification_report + from transformers import ( Trainer, TrainingArguments, DistilBertTokenizerFast, DistilBertConfig, DistilBertForTokenClassification, - DataCollatorForTokenClassification + DataCollatorForTokenClassification, + EarlyStoppingCallback ) + from datasets import Dataset from src.lm_based_tagger.distilbert_preprocessing import prepare_dataset, tokenize_and_align_labels +# If CUDA is available, use it; otherwise fallback to CPU device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device) -# === Labels & Mappings === +# === Random Seeds === +# Match test.py’s seed settings for reproducibility :contentReference[oaicite:0]{index=0} +RAND_STATE = 209 +random.seed(RAND_STATE) +np.random.seed(RAND_STATE) +torch.manual_seed(RAND_STATE) +torch.cuda.manual_seed_all(RAND_STATE) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# === Hyperparameters / Config === +K = 2 # number of CV folds +HOLDOUT_RATIO = 0.15 # 15% held out for final evaluation +EPOCHS = 5 # number of epochs per fold +EARLY_STOP = 2 # patience for early stopping +LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} + +# === Label List & Mappings (unchanged from your original) :contentReference[oaicite:1]{index=1} === LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} ID2LABEL = {i: label for label, i in LABEL2ID.items()} + def train_lm(script_dir: str): + # 1) Paths input_path = os.path.join(script_dir, "input", "tagger_data.tsv") output_dir = os.path.join(script_dir, "output") os.makedirs(output_dir, exist_ok=True) - # 1) 
Read TSV & build tokens/tags lists + # 2) Read the TSV & build “tokens” / “tags” columns :contentReference[oaicite:2]{index=2} df = pd.read_csv(input_path, sep="\t", dtype=str).dropna(subset=["SPLIT", "GRAMMAR_PATTERN"]) df = df[df["SPLIT"].str.strip().astype(bool)] df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split()) df["tags"] = df["GRAMMAR_PATTERN"].apply(lambda x: x.strip().split()) + # Keep only rows where len(tokens) == len(tags) df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)] - # 2) Train/Test split (stratify by CONTEXT) - train_df, test_df = train_test_split( - df, test_size=0.15, random_state=42, stratify=df["CONTEXT"] + # 3) Initial Train/Val Split (15% hold-out) :contentReference[oaicite:3]{index=3} + train_df, val_df = train_test_split( + df, + test_size=HOLDOUT_RATIO, + random_state=RAND_STATE, + stratify=df["CONTEXT"] ) - # 3) Upsample low-frequency tags (in training set only) - low_freq_tags = {"CJ", "VM", "PRE", "V"} - low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in low_freq_tags for t in tags))] - train_df = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) + # 4) Upsample low-frequency tags **in the training set only** :contentReference[oaicite:4]{index=4} + low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in LOW_FREQ_TAGS for t in tags))] + train_df_upsampled = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) - # 4) Tokenizer + # 5) Tokenizer (uncased, matching test.py) tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") - # 5) Convert each split into a HF Dataset via the shared prepare_dataset(...) - train_dataset = prepare_dataset(train_df, LABEL2ID) - test_dataset = prepare_dataset(test_df, LABEL2ID) - - # 6) Tokenize + align labels - tokenized_train = train_dataset.map( - lambda ex: tokenize_and_align_labels(ex, tokenizer), - batched=False - ) - tokenized_test = test_dataset.map( + # 6) Prepare final hold-out “validation” Dataset :contentReference[oaicite:5]{index=5} + val_dataset = prepare_dataset(val_df, LABEL2ID) + tokenized_val = val_dataset.map( lambda ex: tokenize_and_align_labels(ex, tokenizer), batched=False ) - # 7) Build config & model using uncased vocab - config = DistilBertConfig.from_pretrained( - "distilbert-base-uncased", - num_labels=len(LABEL_LIST), - id2label=ID2LABEL, - label2id=LABEL2ID - ) - model = DistilBertForTokenClassification.from_pretrained( - "distilbert-base-uncased", - config=config - ) + # 7) Set up K-Fold + kf = KFold(n_splits=K, shuffle=True, random_state=RAND_STATE) + best_macro_f1 = -1.0 + best_model_dir = None + + fold = 1 + for train_idx, test_idx in kf.split(train_df_upsampled): + print(f"\n=== Fold {fold} ===") + + # 7a) Split the upsampled DataFrame into this fold’s train/test + fold_train_df = train_df_upsampled.iloc[train_idx].reset_index(drop=True) + fold_test_df = train_df_upsampled.iloc[test_idx].reset_index(drop=True) - if device == "cpu": - # 8) Training arguments - training_args = TrainingArguments( - output_dir=output_dir, - evaluation_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=10, - weight_decay=0.01, - warmup_ratio=0.1, - lr_scheduler_type="cosine", - load_best_model_at_end=True, - metric_for_best_model="eval_macro_f1", - greater_is_better=True, - save_total_limit=1, - logging_dir=os.path.join(output_dir, "logs"), - report_to="none", - seed=42 + # 7b) Build HuggingFace Datasets via 
prepare_dataset(...) :contentReference[oaicite:6]{index=6} + fold_train_dataset = prepare_dataset(fold_train_df, LABEL2ID) + fold_test_dataset = prepare_dataset(fold_test_df, LABEL2ID) + + # 7c) Tokenize + align labels (exactly as before) :contentReference[oaicite:7]{index=7} + tokenized_train = fold_train_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False + ) + tokenized_test = fold_test_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False ) - else: - training_args = TrainingArguments( - output_dir=output_dir, - eval_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=4, # ↓ reduce to fit in VRAM - per_device_eval_batch_size=4, - gradient_accumulation_steps=4, # simulates batch size of 16 - num_train_epochs=10, - weight_decay=0.01, - warmup_ratio=0.1, - lr_scheduler_type="cosine", - load_best_model_at_end=True, - save_total_limit=1, - metric_for_best_model="eval_macro_f1", # or "eval_loss" if macro F1 isn't computed - greater_is_better=True, # set to False if using loss - logging_dir=os.path.join(output_dir, "logs"), - report_to="none", - seed=42, - fp16=False, - dataloader_pin_memory=False, # no benefit if no CUDA pinning + + # 8) Build fresh model + config for this fold :contentReference[oaicite:8]{index=8} + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(LABEL_LIST), + id2label=ID2LABEL, + label2id=LABEL2ID ) + model = DistilBertForTokenClassification.from_pretrained( + "distilbert-base-uncased", + config=config + ) + model.to(device) + + # 9) TrainingArguments (with early stopping) :contentReference[oaicite:9]{index=9} + if device.type == "cpu": + training_args = TrainingArguments( + output_dir=os.path.join(output_dir, f"fold_{fold}"), + eval_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=EPOCHS, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + logging_dir=os.path.join(output_dir, "logs", f"fold_{fold}"), + report_to="none", + seed=RAND_STATE + ) + else: + training_args = TrainingArguments( + output_dir=os.path.join(output_dir, f"fold_{fold}"), + eval_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=4, # smaller per-GPU batch size + per_device_eval_batch_size=4, + gradient_accumulation_steps=4, # to simulate batch size = 16 + num_train_epochs=EPOCHS, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + logging_dir=os.path.join(output_dir, "logs", f"fold_{fold}"), + report_to="none", + seed=RAND_STATE, + fp16=False, + dataloader_pin_memory=False + ) - # 9) Collate Data - data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) - - # 10) Macro‐F1 computation - def compute_metrics(eval_pred): - from sklearn.metrics import f1_score - logits, labels = eval_pred - preds = logits.argmax(axis=-1) - - true_preds = [] - true_labels = [] - for pred_row, label_row in zip(preds, labels): - for p, l in zip(pred_row, label_row): - if l != -100: - true_preds.append(p) - true_labels.append(l) - - macro_f1 = f1_score(true_labels, true_preds, average="macro") - return {"eval_macro_f1": macro_f1} - - # 11) Trainer - 
trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, + # 10) Data collator (dynamic padding) :contentReference[oaicite:10]{index=10} + data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) + + # 11) compute_metrics function (macro-F1) :contentReference[oaicite:11]{index=11} + def compute_metrics(eval_pred): + logits, labels = eval_pred + preds = logits.argmax(axis=-1) + + true_preds = [] + true_labels = [] + for pred_row, label_row in zip(preds, labels): + for p, l in zip(pred_row, label_row): + if l != -100: + true_preds.append(p) + true_labels.append(l) + + macro_f1 = f1_score(true_labels, true_preds, average="macro") + return {"eval_macro_f1": macro_f1} + + # 12) Trainer for this fold (with EarlyStopping) :contentReference[oaicite:12]{index=12} + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train, + eval_dataset=tokenized_test, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOP)], + compute_metrics=compute_metrics + ) + # Avoid deprecation warning (explicitly set tokenizer on trainer) + trainer.tokenizer = tokenizer + + # 13) Train this fold + trainer.train() + + # 14) Evaluate on this fold’s held-out split + preds_logits, labels, _ = trainer.predict(tokenized_test) + preds = np.argmax(preds_logits, axis=-1) + + # Convert to (flattened) label strings for F1 + true_labels_list = [ + ID2LABEL[l] + for sent_labels, sent_preds in zip(labels, preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + pred_labels_list = [ + ID2LABEL[p] + for sent_labels, sent_preds in zip(labels, preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + + fold_macro_f1 = f1_score(true_labels_list, pred_labels_list, average="macro") + print(f"Fold {fold} Macro F1: {fold_macro_f1:.4f}") + + # 15) If this fold’s model is the best so far, save it + if fold_macro_f1 > best_macro_f1: + best_macro_f1 = fold_macro_f1 + best_model_dir = os.path.join(output_dir, "best_model") + trainer.save_model(best_model_dir) + tokenizer.save_pretrained(best_model_dir) + + fold += 1 + + # 16) After all folds, report best fold‐score & load best model for final evaluation + print(f"\nBest fold model saved at: {best_model_dir}, Macro F1 = {best_macro_f1:.4f}") + + # 17) Final Evaluation on held-out val_df + best_model = DistilBertForTokenClassification.from_pretrained(best_model_dir) + best_model.to(device) + + # Build a fresh set of TrainingArguments that never runs evaluation epochs: + final_args = TrainingArguments( + output_dir=os.path.join(output_dir, "final_eval"), + per_device_eval_batch_size=16, + eval_strategy="no", + save_strategy="no", + logging_dir=os.path.join(output_dir, "logs", "final_eval"), + report_to="none", + seed=RAND_STATE + ) + val_trainer = Trainer( + model=best_model, + args=final_args, tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics + data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer) + # ← note: no eval_dataset here, because we’ll call .predict(...) 
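The macro-F1 computation above relies on dropping every position labelled -100 (padding, special tokens and feature tokens) before scoring; a small self-contained sketch with invented values:

```
import numpy as np
from sklearn.metrics import f1_score

labels = np.array([[-100, 3, 4, -100], [-100, 1, -100, -100]])   # gold ids, -100 = ignore
preds  = np.array([[   0, 3, 2,    0], [   0, 1,    0,    0]])   # argmax output

true_ids, pred_ids = [], []
for label_row, pred_row in zip(labels, preds):
    for l, p in zip(label_row, pred_row):
        if l != -100:
            true_ids.append(l)
            pred_ids.append(p)

print(f1_score(true_ids, pred_ids, average="macro"))
```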
manually ) - # 12) Train & save - trainer.train() - trainer.save_model(output_dir) - tokenizer.save_pretrained(output_dir) + start_time = time.perf_counter() + val_preds_logits, val_labels, _ = val_trainer.predict(tokenized_val) + end_time = time.perf_counter() + + val_preds = np.argmax(val_preds_logits, axis=-1) + + flat_true = [ + ID2LABEL[l] + for sent_labels, sent_preds in zip(val_labels, val_preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + flat_pred = [ + ID2LABEL[p] + for sent_labels, sent_preds in zip(val_labels, val_preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + + print("\nFinal Evaluation on Held-Out Set:") + print(classification_report(flat_true, flat_pred)) + + # Report inference speed + total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) + total_examples = len(val_dataset) + elapsed = end_time - start_time + print(f"\nInference Time: {elapsed:.2f}s for {total_examples} identifiers ({total_tokens} tokens)") + print(f"Tokens/sec: {total_tokens / elapsed:.2f}") + print(f"Identifiers/sec: {total_examples / elapsed:.2f}") + + final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") + print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}") diff --git a/src/tag_identifier.py b/src/tag_identifier.py index fdc42fc..0158bb1 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -131,7 +131,7 @@ def load(self): def find(self, item): return item in self.Words -def initialize_model(selected_model_type): +def initialize_model(temp_config = {}): """ Initialize and load word vectors for the application, and load a word count DataFrame. @@ -142,7 +142,7 @@ def initialize_model(selected_model_type): tuple: (ModelData, WORD_COUNT DataFrame) """ global model_type, lm_model - model_type = selected_model_type + model_type = temp_config.get("model_type", "tree_based") if model_type == "tree_based": print("Loading word vectors!!") modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) @@ -158,7 +158,7 @@ def initialize_model(selected_model_type): app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) elif model_type == "lm_based": print("Loading DistilBERT tagger...") - lm_model = DistilBertTagger(SCRIPT_DIR) + lm_model = DistilBertTagger(temp_config['model']) print("DistilBERT tagger loaded!") def start_server(temp_config = {}): @@ -176,7 +176,7 @@ def start_server(temp_config = {}): """ print('initializing model...') selected_model = temp_config.get("model_type", "tree_based") - initialize_model(selected_model) + initialize_model(temp_config) print("loading cache...") if not os.path.isdir("cache"): os.mkdir("cache") @@ -239,132 +239,99 @@ def probe(cache_id: str): @app.route('//') @app.route('///') def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> list[dict]: - #check if identifier name has already been used + # --- Cache lookup (unchanged) --- cache = None - #find the existing cache in app.caches or create a new one if it doesn't exist - if cache_id != None: - if os.path.exists("cache/"+cache_id+".db3"): - #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db3") + if cache_id is not None: + if os.path.exists("cache/" + cache_id + ".db3"): + cache = AppCache("cache/" + cache_id + ".db3") data = cache.retrieve(identifier_name, identifier_context) - if data != False: + if data is not False: cache.encounter(identifier_name, identifier_context) return data else: - #create 
the cache - cache = AppCache("cache/"+cache_id+".db3") + cache = AppCache("cache/" + cache_id + ".db3") cache.load() - + + # Pull query‐string parameters system_name = request.args.get("system_name", default="") programming_language = request.args.get("language", default="") data_type = request.args.get("type", default="") - - #TODO: update this documentation - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. - """ print(f"INPUT: {identifier_name} {identifier_context}") - - # get the start time start_time = time.perf_counter() - + + # 1) Split the identifier into tokens for **both** modes + words = ronin.split(identifier_name) + + # 2) If we asked for the LM‐based (DistilBERT) tagger, use it if model_type == "lm_based": - result = { - "words": [] - } - tags = lm_model.predict(words, identifier_context, programming_language, data_type, system_name) + result = { "words": [] } + + tags = lm_model.tag_identifier( + tokens=words, + context=identifier_context, + type_str=data_type, + language=programming_language, + system_name=system_name + ) + for word, tag in zip(words, tags): dictionary = dictionary_lookup(word) - result["words"].append({word: {"tag": tag, "dictionary": dictionary}}) + result["words"].append({ + word: { "tag": tag, "dictionary": dictionary } + }) + tag_time = time.perf_counter() - start_time if cache_id: AppCache(f"cache/{cache_id}.db3").add(identifier_name, result, identifier_context, tag_time) return result - - # Split identifier_name into words - words = ronin.split(identifier_name) - - # # Create initial data frame + + # 3) Else: use the existing tree‐based tagger + # Create initial DataFrame data = pd.DataFrame({ 'WORD': words, 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number + 'CONTEXT_NUMBER': context_to_number(identifier_context), }) - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data + # Build features data = createFeatures( data, mutable_feature_list, modelGensimEnglish=app.model_data.ModelGensimEnglish, ) - - categorical_features = ['NLTK_POS','PREV_POS', 'NEXT_POS'] - category_variables = [] + # Convert any categorical features to numeric + categorical_features = ['NLTK_POS', 'PREV_POS', 'NEXT_POS'] for category_column in categorical_features: if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, 
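A simplified sketch of the categorical mapping applied here; the two dictionaries are stand-ins (the real universal_to_custom and custom_to_numeric are defined elsewhere in the code base), so the exact values are assumptions:

```
# Unknown POS categories fall back to the noun bucket, matching the code above.
universal_to_custom = {"NOUN": "N", "VERB": "V", "ADJ": "NM"}   # stand-in values
custom_to_numeric   = {"N": 1, "V": 2, "NM": 8, "NOUN": 1}      # stand-in values

def map_pos(value: str) -> int:
    if value in universal_to_custom:
        return custom_to_numeric[universal_to_custom[value]]
    return custom_to_numeric["NOUN"]

print([map_pos(v) for v in ["NOUN", "VERB", "X"]])   # [1, 2, 1]
```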
category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier + data[category_column] = data[category_column].astype(str) + unique_vals = data[category_column].unique() + category_map = {} + for val in unique_vals: + if val in universal_to_custom: + category_map[val] = custom_to_numeric[universal_to_custom[val]] + else: + category_map[val] = custom_to_numeric['NOUN'] + data[category_column] = data[category_column].map(category_map) + + # Load classifier and annotate clf = joblib.load(os.path.join(SCRIPT_DIR, '..', 'models', 'model_GradientBoostingClassifier.pkl')) predicted_tags = annotate_identifier(clf, data) - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] + result = { "words": [] } + for i, word in enumerate(words): dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) + result["words"].append({ + word: { "tag": predicted_tags[i], "dictionary": dictionary } + }) - # get time it took to tag the identifier tag_time = time.perf_counter() - start_time - - # append result to cache - if cache_id != None: + if cache_id is not None: cache.add(identifier_name, result, identifier_context, tag_time) return result + def context_to_number(context): """ diff --git a/src/tree_based_tagger/create_models.py b/src/tree_based_tagger/create_models.py index 451d697..e147a7c 100644 --- a/src/tree_based_tagger/create_models.py +++ b/src/tree_based_tagger/create_models.py @@ -1,4 +1,3 @@ -import gensim.downloader as api import json, os import logging #'VERB_SCORE', 'DET_SCORE', 'ENGLISHV_SCORE', 'POSITION_RATIO','METHODV_SCORE', 'CONTAINSLISTVERB' @@ -39,6 +38,7 @@ def createModel(pklFile="", rootDir=""): (modelGensimTokens, modelGensimMethods, modelGensimEnglish). Models that fail to load are set to None. """ + import gensim.downloader as api # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') From 4dcd8d4e3f5a72962729a56e76fa30d9677828d7 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 01:52:33 -0400 Subject: [PATCH 34/51] A half vibe coded mess, but I think it works. Needs a ton of clean up. --- src/lm_based_tagger/distilbert_crf.py | 111 ++++++++++++ .../distilbert_preprocessing.py | 4 +- src/lm_based_tagger/distilbert_tagger.py | 74 +++++--- src/lm_based_tagger/train_model.py | 158 ++++++++++++++---- 4 files changed, 283 insertions(+), 64 deletions(-) create mode 100644 src/lm_based_tagger/distilbert_crf.py diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py new file mode 100644 index 0000000..4fc5486 --- /dev/null +++ b/src/lm_based_tagger/distilbert_crf.py @@ -0,0 +1,111 @@ +# distilbert_crf.py +import torch, os +import torch.nn as nn +from TorchCRF import CRF +from transformers import DistilBertModel, DistilBertConfig + +class DistilBertCRFForTokenClassification(nn.Module): + """ + DistilBERT ➜ dropout ➜ linear projection ➜ CRF. + The CRF layer models label‑to‑label transitions, so the model + is optimised at *sequence* level rather than *token* level. 
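A rough sketch of what the CRF head adds on top of the per-token emission scores, using the same pytorch-crf calls as this class; tensor shapes and values are invented:

```
import torch
from torchcrf import CRF

num_labels = 10
crf = CRF(num_labels, batch_first=True)

emissions = torch.randn(2, 5, num_labels)           # [batch, seq_len, num_labels]
tags      = torch.randint(0, num_labels, (2, 5))    # gold label ids
mask      = torch.ones(2, 5, dtype=torch.bool)

loss = -crf(emissions, tags, mask=mask, reduction="mean")   # sequence-level negative log-likelihood
best_paths = crf.decode(emissions, mask=mask)               # Viterbi paths, one list of ids per sequence
```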
+ """ + def __init__(self, + num_labels: int, + id2label: dict, + label2id: dict, + pretrained_name: str = "distilbert-base-uncased", + dropout_prob: float = 0.1): + super().__init__() + + self.config = DistilBertConfig.from_pretrained( + pretrained_name, + num_labels=num_labels, + id2label=id2label, + label2id=label2id, + ) + self.bert = DistilBertModel.from_pretrained(pretrained_name, config=self.config) + self.dropout = nn.Dropout(dropout_prob) + self.classifier = nn.Linear(self.config.hidden_size, num_labels) + self.crf = CRF(num_labels, batch_first=True) + + def forward(self, + input_ids=None, + attention_mask=None, + labels=None, + **kwargs): + + # Hugging Face occasionally injects helper fields (e.g. num_items_in_batch) + # Filter `kwargs` down to what DistilBertModel.forward actually accepts. + ALLOWED = { + "head_mask", "inputs_embeds", "position_ids", + "output_attentions", "output_hidden_states", "return_dict" + } + bert_kwargs = {k: v for k, v in kwargs.items() if k in ALLOWED} + + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + **bert_kwargs, + ) + # —— Build emissions once —————————————————————————————— + sequence_output = self.dropout(outputs[0]) # [B, T, H] + emission_scores = self.classifier(sequence_output) # [B, T, C] + + seq_len = emission_scores.size(1) # original token length + + # ============================== TRAINING ============================== + if labels is not None: + # 1. Drop [CLS] (idx 0) and [SEP] (idx –1) + emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] + tags = labels[:, 1:-1].clone() # [B, T‑2] + crf_mask = (tags != -100) # True = keep + + # 2. For any position that’s masked‑off ➜ set tag to a valid id (0) + tags[~crf_mask] = 0 + + # 3. Guarantee first timestep is ON for every sequence + first_off = (~crf_mask[:, 0]).nonzero(as_tuple=True)[0] + if len(first_off): + crf_mask[first_off, 0] = True # flip mask to ON + tags[first_off, 0] = 0 # give it tag 0 + + loss = -self.crf(emissions, tags, mask=crf_mask, reduction="mean") + return {"loss": loss, "logits": emission_scores} + + # ============================= INFERENCE ============================== + else: + crf_mask = attention_mask[:, 1:-1].bool() # [B, T‑2] + emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] + best_paths = self.crf.decode(emissions, mask=crf_mask) + return {"logits": emission_scores, + "predictions": best_paths} + + from transformers import DistilBertConfig + @classmethod + def from_pretrained(cls, ckpt_dir, **kw): + from safetensors import safe_open + cfg = DistilBertConfig.from_pretrained(ckpt_dir) + model = cls( + num_labels=cfg.num_labels, + id2label=cfg.id2label, + label2id=cfg.label2id, + pretrained_name=cfg._name_or_path or "distilbert-base-uncased", + **kw, + ) + + weight_path_pt = os.path.join(ckpt_dir, "pytorch_model.bin") + weight_path_safe = os.path.join(ckpt_dir, "model.safetensors") + + if os.path.exists(weight_path_pt): + state = torch.load(weight_path_pt, map_location="cpu") + elif os.path.exists(weight_path_safe): + state = {} + with safe_open(weight_path_safe, framework="pt", device="cpu") as f: + for k in f.keys(): + state[k] = f.get_tensor(k) + else: + raise FileNotFoundError("No weight file found in checkpoint directory.") + + model.load_state_dict(state) + return model \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 874a88a..1043463 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ 
b/src/lm_based_tagger/distilbert_preprocessing.py @@ -143,7 +143,7 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # - First 7 entries → -100 (because they are feature tokens) # - Then for each identifier token, [-100, label2id[tag]] ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [-100] * 7 + ner_tags_with_pos + full_labels = [0] * 7 + ner_tags_with_pos # ← use 0, not -100 rows.append({ "tokens": full_tokens, @@ -163,7 +163,7 @@ def tokenize_and_align_labels(example, tokenizer): - "ner_tags" : List[int] (same length as above) We run `tokenizer(example["tokens"], is_split_into_words=True, truncation=True)`, - then align `word_ids()` with `example["ner_tags"]` exactly as in test.py. + then align `word_ids()` with `example["ner_tags"]` """ tokenized = tokenizer( example["tokens"], diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 322d3e0..4d62bb9 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -4,6 +4,7 @@ import nltk from difflib import SequenceMatcher from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification +from .distilbert_crf import DistilBertCRFForTokenClassification # Make sure we have the same NLTK tagset nltk.download('averaged_perceptron_tagger_eng', quiet=True) @@ -82,16 +83,25 @@ def system_prefix_similarity(first_token, system_name): class DistilBertTagger: def __init__(self, model_path: str): """ - Expects `model_path` to be a folder where the fine-tuned DistilBertForTokenClassification - (and its tokenizer) have been saved via `trainer.save_model(...)` and `tokenizer.save_pretrained(...)`. + `model_path` must contain: + • config.json + • model.safetensors OR pytorch_model.bin + • tokenizer files (tokenizer.json, vocab.txt, …) """ self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path) - self.model = DistilBertForTokenClassification.from_pretrained(model_path) - self.model.eval() - # ── Extract id2label from the saved config.json ── - # model.config.id2label maps string keys ("0", "1", ...) to tag names (e.g. "N", "V", "PRE", ...) - self.id2label = { int(k): v for k, v in self.model.config.id2label.items() } + # Try CRF wrapper first (it can load .safetensors or .bin) + try: + self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path) + except Exception: + # Fallback: plain DistilBERT head (no CRF layer present) + from transformers import DistilBertForTokenClassification + self.model = DistilBertForTokenClassification.from_pretrained(model_path) + + self.model.eval() # inference mode + + # id2label keys can be strings → convert to int + self.id2label = {int(k): v for k, v in self.model.config.id2label.items()} def tag_identifier(self, tokens, context, type_str, language, system_name): """ @@ -138,9 +148,7 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): cvr_token, digit_token, sim_token, - type_token, - lang_token, - nltk_feature + nltk_feature, ] + tokens_with_pos # 2. Tokenize @@ -152,32 +160,44 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): padding=True ) - # 3. Inference + # ─── 3. Inference ─────────────────────────────────────────── with torch.no_grad(): - logits = self.model( + out = self.model( input_ids=encoded["input_ids"], - attention_mask=encoded["attention_mask"] - )[0] + attention_mask=encoded["attention_mask"], + ) - # 4. 
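The input sequence built just above (feature tokens followed by interleaved position and identifier tokens) roughly looks like the sketch below; apart from the @system_, @hung_, @sim_ and @pos_ prefixes taken from the patch, the placeholder spellings are assumptions:

```
words = ["get", "user", "name"]                     # hypothetical, already-split identifier
feature_tokens = ["@unknown", "@system_demo", "@hung_none", "@cvr_mid",
                  "@digit_none", "@sim_none", "@nltk_verb_noun_noun"]   # 7 feature tokens
tokens_with_pos = [tok for i, w in enumerate(words) for tok in (f"@pos_{i}", w)]
input_tokens = feature_tokens + tokens_with_pos
# -> [..., '@pos_0', 'get', '@pos_1', 'user', '@pos_2', 'name']
```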
Take argmax, then align via word_ids() - predictions = torch.argmax(logits, dim=-1).squeeze().tolist() - word_ids = encoded.word_ids() + # One label per *input* token + if isinstance(out, dict) and "predictions" in out: # CRF path + labels_per_token = out["predictions"][0] # list[int] + else: # logits + logits = out[0] if isinstance(out, (tuple, list)) else out + labels_per_token = torch.argmax(logits, dim=-1).squeeze().tolist() - pred_labels = [] - previous_word_idx = None + # ─── 4. Re‑align to identifier words ────────────────────── + pred_labels, previous_word_idx = [], None + word_ids = encoded.word_ids() # same length as labels_per_token for idx, word_idx in enumerate(word_ids): - # Skip if special token (None), or if it's part of the first 9 “feature tokens” - if word_idx is None or word_idx < 9: + # a) skip special tokens ([CLS]/[SEP]) + if word_idx is None: + continue + # b) skip the 7 leading feature tokens + if word_idx < 7: continue - # Skip if it’s the same word_idx as the previous (to avoid sub-token duplicates) + # c) skip every @pos_* placeholder (@pos tokens sit at even + # offsets after the 7 features: 7,9,11, … so (w‑7)%2 == 0) + if (word_idx - 7) % 2 == 0: + continue + # d) skip duplicate word‑pieces if word_idx == previous_word_idx: continue - pred_labels.append(predictions[idx]) + label_idx = idx - 1 # shift because [CLS] was removed + if label_idx < len(labels_per_token): + pred_labels.append(labels_per_token[label_idx]) previous_word_idx = word_idx - # 5. Map numeric IDs → string tags via self.id2label - pred_tag_strings = [ self.id2label[label_id] for label_id in pred_labels ] - - return pred_tag_strings + # Map numeric IDs → tag strings + pred_tag_strings = [self.id2label[i] for i in pred_labels] + return pred_tag_strings \ No newline at end of file diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 5f2db21..0139d25 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -1,5 +1,3 @@ -# train_model.py - import os import time import random @@ -7,6 +5,7 @@ import numpy as np import pandas as pd import torch +from .distilbert_crf import DistilBertCRFForTokenClassification from sklearn.model_selection import train_test_split, KFold from sklearn.metrics import f1_score, accuracy_score, classification_report @@ -29,7 +28,6 @@ print("Using device:", device) # === Random Seeds === -# Match test.py’s seed settings for reproducibility :contentReference[oaicite:0]{index=0} RAND_STATE = 209 random.seed(RAND_STATE) np.random.seed(RAND_STATE) @@ -39,13 +37,13 @@ torch.backends.cudnn.benchmark = False # === Hyperparameters / Config === -K = 2 # number of CV folds +K = 5 # number of CV folds HOLDOUT_RATIO = 0.15 # 15% held out for final evaluation -EPOCHS = 5 # number of epochs per fold +EPOCHS = 10 # number of epochs per fold EARLY_STOP = 2 # patience for early stopping LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} -# === Label List & Mappings (unchanged from your original) :contentReference[oaicite:1]{index=1} === +# === Label List & Mappings === LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} ID2LABEL = {i: label for label, i in LABEL2ID.items()} @@ -57,7 +55,7 @@ def train_lm(script_dir: str): output_dir = os.path.join(script_dir, "output") os.makedirs(output_dir, exist_ok=True) - # 2) Read the TSV & build “tokens” / “tags” columns :contentReference[oaicite:2]{index=2} + # 2) Read the TSV & build “tokens” / “tags” columns 
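The TSV columns are turned into parallel token/tag lists right below; a small sketch with an invented row (the column names match the patch):

```
import pandas as pd

df = pd.DataFrame({
    "SPLIT":           ["get user name"],
    "GRAMMAR_PATTERN": ["V NM N"],
    "CONTEXT":         ["FUNCTION"],
})
df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split())
df["tags"]   = df["GRAMMAR_PATTERN"].apply(lambda x: x.strip().split())
df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)]   # keep aligned rows only
print(df[["tokens", "tags"]].iloc[0].tolist())   # [['get', 'user', 'name'], ['V', 'NM', 'N']]
```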
df = pd.read_csv(input_path, sep="\t", dtype=str).dropna(subset=["SPLIT", "GRAMMAR_PATTERN"]) df = df[df["SPLIT"].str.strip().astype(bool)] df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split()) @@ -65,7 +63,7 @@ def train_lm(script_dir: str): # Keep only rows where len(tokens) == len(tags) df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)] - # 3) Initial Train/Val Split (15% hold-out) :contentReference[oaicite:3]{index=3} + # 3) Initial Train/Val Split (15% hold-out) train_df, val_df = train_test_split( df, test_size=HOLDOUT_RATIO, @@ -73,14 +71,14 @@ def train_lm(script_dir: str): stratify=df["CONTEXT"] ) - # 4) Upsample low-frequency tags **in the training set only** :contentReference[oaicite:4]{index=4} + # 4) Upsample low-frequency tags **in the training set only** low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in LOW_FREQ_TAGS for t in tags))] train_df_upsampled = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) - # 5) Tokenizer (uncased, matching test.py) + # 5) Tokenizer tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") - # 6) Prepare final hold-out “validation” Dataset :contentReference[oaicite:5]{index=5} + # 6) Prepare final hold-out “validation” Dataset val_dataset = prepare_dataset(val_df, LABEL2ID) tokenized_val = val_dataset.map( lambda ex: tokenize_and_align_labels(ex, tokenizer), @@ -100,11 +98,11 @@ def train_lm(script_dir: str): fold_train_df = train_df_upsampled.iloc[train_idx].reset_index(drop=True) fold_test_df = train_df_upsampled.iloc[test_idx].reset_index(drop=True) - # 7b) Build HuggingFace Datasets via prepare_dataset(...) :contentReference[oaicite:6]{index=6} + # 7b) Build HuggingFace Datasets via prepare_dataset(...) fold_train_dataset = prepare_dataset(fold_train_df, LABEL2ID) fold_test_dataset = prepare_dataset(fold_test_df, LABEL2ID) - # 7c) Tokenize + align labels (exactly as before) :contentReference[oaicite:7]{index=7} + # 7c) Tokenize + align labels (exactly as before) tokenized_train = fold_train_dataset.map( lambda ex: tokenize_and_align_labels(ex, tokenizer), batched=False @@ -114,20 +112,23 @@ def train_lm(script_dir: str): batched=False ) - # 8) Build fresh model + config for this fold :contentReference[oaicite:8]{index=8} + # 8) Build fresh model + config for this fold config = DistilBertConfig.from_pretrained( "distilbert-base-uncased", num_labels=len(LABEL_LIST), id2label=ID2LABEL, label2id=LABEL2ID ) - model = DistilBertForTokenClassification.from_pretrained( - "distilbert-base-uncased", - config=config - ) + model = DistilBertCRFForTokenClassification( + num_labels=len(LABEL_LIST), + id2label=ID2LABEL, + label2id=LABEL2ID, + pretrained_name="distilbert-base-uncased", + dropout_prob=0.1 + ).to(device) model.to(device) - # 9) TrainingArguments (with early stopping) :contentReference[oaicite:9]{index=9} + # 9) TrainingArguments (with early stopping) if device.type == "cpu": training_args = TrainingArguments( output_dir=os.path.join(output_dir, f"fold_{fold}"), @@ -172,26 +173,74 @@ def train_lm(script_dir: str): dataloader_pin_memory=False ) - # 10) Data collator (dynamic padding) :contentReference[oaicite:10]{index=10} + # 10) Data collator (dynamic padding) data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) - # 11) compute_metrics function (macro-F1) :contentReference[oaicite:11]{index=11} + # 11) compute_metrics function (macro-F1) + def compute_metrics(eval_pred): - logits, labels = eval_pred - preds = logits.argmax(axis=-1) + """ + 
Works for both: + • Plain classifier logits → argmax along last dim + • CRF Viterbi paths (list/2‑D ndarray) → use directly + Returns: + - eval_macro_f1 + - eval_token_accuracy + - eval_identifier_accuracy + """ + # ── 1. Unpack ──────────────────────────────────────────────────── + if isinstance(eval_pred, tuple): # older HF (<4.38) + preds, labels = eval_pred + else: # EvalPrediction obj + preds = eval_pred.predictions + labels = eval_pred.label_ids + + # ── 2. Convert logits → label IDs if needed ───────────────────── + # * 3‑D tensor : [B, T, C] → argmax(C) + # * 2‑D tensor : already IDs + # * list/obj‑nd : variable‑length decode paths + if isinstance(preds, np.ndarray) and preds.ndim == 3: + preds = np.argmax(preds, axis=-1) # [B, T] + elif isinstance(preds, list): + preds = np.array(preds, dtype=object) # each row is a list + + # ── 3. Accumulate token & identifier stats ────────────────────── + all_true, all_pred, id_correct_flags = [], [], [] - true_preds = [] - true_labels = [] for pred_row, label_row in zip(preds, labels): - for p, l in zip(pred_row, label_row): - if l != -100: - true_preds.append(p) - true_labels.append(l) - - macro_f1 = f1_score(true_labels, true_preds, average="macro") - return {"eval_macro_f1": macro_f1} - - # 12) Trainer for this fold (with EarlyStopping) :contentReference[oaicite:12]{index=12} + ptr = 0 + example_correct = True + + for lbl in label_row: # iterate gold labels + if lbl == -100: # skip padding / specials + continue + + # pick the corresponding prediction + if isinstance(pred_row, (list, np.ndarray)): + pred_lbl = pred_row[ptr] + else: # pred_row is scalar + pred_lbl = pred_row + ptr += 1 + + all_true.append(lbl) + all_pred.append(pred_lbl) + if pred_lbl != lbl: + example_correct = False + + id_correct_flags.append(example_correct) + + # ── 4. Metrics ────────────────────────────────────────────────── + macro_f1 = f1_score(all_true, all_pred, average="macro") + token_acc = accuracy_score(all_true, all_pred) + id_acc = float(sum(id_correct_flags)) / len(id_correct_flags) + + return { + "eval_macro_f1": macro_f1, + "eval_token_accuracy": token_acc, + "eval_identifier_accuracy": id_acc, + } + + # 12) Trainer for this fold (with EarlyStopping) trainer = Trainer( model=model, args=training_args, @@ -242,7 +291,7 @@ def compute_metrics(eval_pred): print(f"\nBest fold model saved at: {best_model_dir}, Macro F1 = {best_macro_f1:.4f}") # 17) Final Evaluation on held-out val_df - best_model = DistilBertForTokenClassification.from_pretrained(best_model_dir) + best_model = DistilBertCRFForTokenClassification.from_pretrained(best_model_dir) best_model.to(device) # Build a fresh set of TrainingArguments that never runs evaluation epochs: @@ -295,3 +344,42 @@ def compute_metrics(eval_pred): final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}") + final_accuracy = accuracy_score(flat_true, flat_pred) + print(f"Final Token-level Accuracy on Held-Out Set: {final_accuracy:.4f}") + + # 18) Write hold-out predictions to CSV so that each row contains + # (tokens, true_tags, pred_tags) for sanity checking. + from .distilbert_tagger import DistilBertTagger + + # Re-instantiate the exact same DistilBERT tagger we saved + tagger = DistilBertTagger(best_model_dir) + + rows = [] + for _, row in val_df.iterrows(): + tokens = row["tokens"] # e.g. ["my", "Identifier", "Name"] + true_tags = row["tags"] # e.g. ["NM", "DT", "DT"] + context = row.get("CONTEXT", "") # e.g. 
"FUNCTION" + type_str = row.get("TYPE", "") # if present; otherwise "" + language = row.get("LANGUAGE", "") # if present; otherwise "" + system_name= row.get("SYSTEM_NAME", "") # if present; otherwise "" + + # `tag_identifier` now returns a list of string labels, not IDs + pred_tags = tagger.tag_identifier(tokens, context, type_str, language, system_name) + + rows.append({ + "tokens": " ".join(tokens), + "true_tags": " ".join(true_tags), + "pred_tags": " ".join(pred_tags) + }) + + preds_df = pd.DataFrame(rows) + csv_path = os.path.join(output_dir, "holdout_predictions.csv") + preds_df.to_csv(csv_path, index=False) + print(f"\nWrote hold-out predictions to: {csv_path}") + + # Now also compute identifier-level accuracy from the “flat_true/flat_pred” folds: + # We need to compare per-example (not flattened) again, so re-run a grouping logic. + df = pd.read_csv(os.path.join(output_dir, "holdout_predictions.csv")) + df["row_correct"] = df["true_tags"] == df["pred_tags"] + id_level_acc = df["row_correct"].mean() + print(f"Final Identifier-level Accuracy on Held-Out Set: {id_level_acc:.4f}") \ No newline at end of file From b30e9286054289a6021e2c0d94953ebf8a76e03b Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 09:59:58 -0400 Subject: [PATCH 35/51] Fix bug with the masking --- src/lm_based_tagger/distilbert_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 1043463..5c905c6 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -143,7 +143,7 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # - First 7 entries → -100 (because they are feature tokens) # - Then for each identifier token, [-100, label2id[tag]] ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [0] * 7 + ner_tags_with_pos # ← use 0, not -100 + full_labels = [-100] * 7 + ner_tags_with_pos # ← use 0, not -100 rows.append({ "tokens": full_tokens, From a84f3adba6b7d3d46d9625e6432eb3481dd6c00e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 13:54:07 -0400 Subject: [PATCH 36/51] Remove system as a feature --- src/lm_based_tagger/distilbert_preprocessing.py | 10 +++++----- src/lm_based_tagger/distilbert_tagger.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 5c905c6..8ee9c4c 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -109,11 +109,11 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # 1. Build 7 feature tokens (context, system, hungarian, cvr, digit, sim, nltk) context_token = CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown") - system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" + # system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" cvr_token = consonant_vowel_ratio_bucket(tokens) digit_token = detect_digit_feature(tokens) - sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" + # sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" # 2. 
NLTK POS tags (universal tagset) nltk_tags = pos_tag(tokens, tagset="universal") @@ -131,11 +131,11 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # 4. Build the “full” token list (7 feature tokens + 2*len(tokens) position‐identifier tokens) full_tokens = [ context_token, - system_token, + # system_token, hungarian_token, cvr_token, digit_token, - sim_token, + # sim_token, nltk_feature, ] + tokens_with_pos @@ -143,7 +143,7 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # - First 7 entries → -100 (because they are feature tokens) # - Then for each identifier token, [-100, label2id[tag]] ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [-100] * 7 + ner_tags_with_pos # ← use 0, not -100 + full_labels = [-100] * 5 + ner_tags_with_pos # ← use 0, not -100 rows.append({ "tokens": full_tokens, diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 4d62bb9..b3aa2fb 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -143,11 +143,11 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): # Build the full input token sequence (exactly what training saw): input_tokens = [ context_token, - system_token, + # system_token, hungarian_token, cvr_token, digit_token, - sim_token, + # sim_token, nltk_feature, ] + tokens_with_pos @@ -183,11 +183,11 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): if word_idx is None: continue # b) skip the 7 leading feature tokens - if word_idx < 7: + if word_idx < 5: continue # c) skip every @pos_* placeholder (@pos tokens sit at even # offsets after the 7 features: 7,9,11, … so (w‑7)%2 == 0) - if (word_idx - 7) % 2 == 0: + if (word_idx - 5) % 2 == 0: continue # d) skip duplicate word‑pieces if word_idx == previous_word_idx: From ca22c5954d60db1f848bcb486568e2cb463504f7 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 16:01:27 -0400 Subject: [PATCH 37/51] Update to pull from huggingface or local based on --local --- main | 19 +++++++---- src/lm_based_tagger/distilbert_crf.py | 43 ++++++++++++++---------- src/lm_based_tagger/distilbert_tagger.py | 21 +++--------- src/tag_identifier.py | 3 +- 4 files changed, 45 insertions(+), 41 deletions(-) diff --git a/main b/main index 07d795b..83cfb18 100755 --- a/main +++ b/main @@ -43,6 +43,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="SCALAR identifier tagger") parser.add_argument("-v", "--version", action="store_true", help="print tagger application version") + parser.add_argument("--local", action="store_true", help="Use local model/tokenizer instead of HuggingFace repo.") # Core run/train model arguments parser.add_argument("--mode", choices=["train", "run"], required=True, help="Choose to 'train' or 'run' the model") parser.add_argument("--model_type", choices=["tree_based", "lm_based"], required=True, help="Specify which model type to use") @@ -88,11 +89,17 @@ if __name__ == "__main__": start_server(temp_config=config) elif args.model_type == "lm_based": - start_server(temp_config={ - 'script_dir': SCRIPT_DIR, - 'model': os.path.join(SCRIPT_DIR, 'output', 'best_model'), - 'model_type':'lm_based' - }) - + if not args.local: + start_server(temp_config={ + 'script_dir': SCRIPT_DIR, + 'model': 'sourceslicer/scalar_lm_best', + 'model_type':'lm_based', + }) + else: + start_server(temp_config={ + 'script_dir': SCRIPT_DIR, + 'model': os.path.join(SCRIPT_DIR, 'output', 
'best_model'), + 'model_type':'lm_based', + }) else: parser.print_usage() \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py index 4fc5486..daa5fd0 100644 --- a/src/lm_based_tagger/distilbert_crf.py +++ b/src/lm_based_tagger/distilbert_crf.py @@ -80,12 +80,13 @@ def forward(self, best_paths = self.crf.decode(emissions, mask=crf_mask) return {"logits": emission_scores, "predictions": best_paths} - - from transformers import DistilBertConfig @classmethod - def from_pretrained(cls, ckpt_dir, **kw): - from safetensors import safe_open - cfg = DistilBertConfig.from_pretrained(ckpt_dir) + def from_pretrained(cls, ckpt_dir, local=False, **kw): + from safetensors.torch import load_file as load_safe_file + from huggingface_hub import hf_hub_download + import os + cfg = DistilBertConfig.from_pretrained(ckpt_dir, local_files_only=local) + model = cls( num_labels=cfg.num_labels, id2label=cfg.id2label, @@ -94,18 +95,24 @@ def from_pretrained(cls, ckpt_dir, **kw): **kw, ) - weight_path_pt = os.path.join(ckpt_dir, "pytorch_model.bin") - weight_path_safe = os.path.join(ckpt_dir, "model.safetensors") + # Attempt to load model.safetensors only + try: + if os.path.isdir(ckpt_dir): + # Load from local directory + weight_path = os.path.join(ckpt_dir, "model.safetensors") + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No model.safetensors found in local path: {weight_path}") + else: + # Load from Hugging Face Hub + weight_path = hf_hub_download( + repo_id=ckpt_dir, + filename="model.safetensors", + local_files_only=local + ) - if os.path.exists(weight_path_pt): - state = torch.load(weight_path_pt, map_location="cpu") - elif os.path.exists(weight_path_safe): - state = {} - with safe_open(weight_path_safe, framework="pt", device="cpu") as f: - for k in f.keys(): - state[k] = f.get_tensor(k) - else: - raise FileNotFoundError("No weight file found in checkpoint directory.") + state_dict = load_safe_file(weight_path, device="cpu") + model.load_state_dict(state_dict) + return model - model.load_state_dict(state) - return model \ No newline at end of file + except Exception as e: + raise RuntimeError(f"Failed to load model.safetensors from {ckpt_dir}: {e}") \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index b3aa2fb..8847487 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -81,26 +81,15 @@ def system_prefix_similarity(first_token, system_name): class DistilBertTagger: - def __init__(self, model_path: str): - """ - `model_path` must contain: - • config.json - • model.safetensors OR pytorch_model.bin - • tokenizer files (tokenizer.json, vocab.txt, …) - """ - self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path) + def __init__(self, model_path: str, local: bool = False): + self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, local_files_only=local) - # Try CRF wrapper first (it can load .safetensors or .bin) try: - self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path) + self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path, local=local) except Exception: - # Fallback: plain DistilBERT head (no CRF layer present) - from transformers import DistilBertForTokenClassification - self.model = DistilBertForTokenClassification.from_pretrained(model_path) - - self.model.eval() # inference mode + self.model = 
DistilBertForTokenClassification.from_pretrained(model_path, local_files_only=local) - # id2label keys can be strings → convert to int + self.model.eval() self.id2label = {int(k): v for k, v in self.model.config.id2label.items()} def tag_identifier(self, tokens, context, type_str, language, system_name): diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 0158bb1..c847aa9 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -158,7 +158,8 @@ def initialize_model(temp_config = {}): app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) elif model_type == "lm_based": print("Loading DistilBERT tagger...") - lm_model = DistilBertTagger(temp_config['model']) + is_local = temp_config.get("local", False) + lm_model = DistilBertTagger(temp_config['model'], local=is_local) print("DistilBERT tagger loaded!") def start_server(temp_config = {}): From e083b390c18db9026b8f48d3cd653ef2eb404ae0 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 16:38:03 -0400 Subject: [PATCH 38/51] Fix requirements and I dunno how the crf imports are working --- requirements.txt | 134 +++++++++----------------- requirements_gpu.txt | 12 --- src/lm_based_tagger/distilbert_crf.py | 4 +- src/lm_based_tagger/train_model.py | 7 -- 4 files changed, 49 insertions(+), 108 deletions(-) delete mode 100644 requirements_gpu.txt diff --git a/requirements.txt b/requirements.txt index 51e31b1..74fc9d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,104 +1,64 @@ -accelerate==1.3.0 -attrs==25.1.0 -beautifulsoup4==4.12.3 -bioc==2.1 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.9 +aiosignal==1.3.2 +attrs==25.3.0 blinker==1.9.0 -boto3==1.36.6 -botocore==1.36.6 -certifi==2024.12.14 -charset-normalizer==3.4.1 -click==8.1.8 -conllu==4.5.3 -contourpy==1.3.1 -cycler==0.12.1 -Deprecated==1.2.17 -docopt==0.6.2 -filelock==3.17.0 -flair==0.15.0 -Flask==3.1.0 -fonttools==4.55.6 -fsspec==2024.12.0 -ftfy==6.3.1 -gdown==5.2.0 -gensim==4.3.3 -huggingface-hub==0.27.1 -humanize==4.11.0 +certifi==2025.4.26 +charset-normalizer==3.4.2 +click==8.2.1 +datasets==3.6.0 +dill==0.3.8 +filelock==3.18.0 +Flask==3.1.1 +frozenlist==1.6.2 +fsspec==2025.3.0 +hf-xet==1.1.3 +huggingface-hub==0.32.4 +humanize==4.12.3 idna==3.10 -iniconfig==2.0.0 -intervaltree==3.1.0 +iniconfig==2.1.0 itsdangerous==2.2.0 -Jinja2==3.1.5 -jmespath==1.0.1 -joblib==1.4.2 -jsonlines==4.0.0 -kiwisolver==1.4.8 -langdetect==1.0.9 -lxml==5.3.0 +Jinja2==3.1.6 +joblib==1.5.1 MarkupSafe==3.0.2 -matplotlib==3.10.0 -more-itertools==10.6.0 -mpld3==0.5.10 mpmath==1.3.0 -networkx==3.4.2 +multidict==6.4.4 +multiprocess==0.70.16 +networkx==3.5 nltk==3.9.1 -numpy==1.26.4 -nvidia-cublas-cu12==12.4.5.8 -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.0.70 -nvidia-cufft-cu12==11.2.1.3 -nvidia-curand-cu12==10.3.5.147 -nvidia-cusolver-cu12==11.6.1.9 -nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.21.5 -nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 -packaging==24.2 +numpy==2.2.6 +packaging==25.0 pandas==2.2.3 -pillow==11.1.0 -plac==1.4.3 -pluggy==1.5.0 -pptree==3.1 -protobuf==5.29.3 -psutil==6.1.1 -pyparsing==3.2.1 -PySocks==1.7.1 -pytest==8.3.4 +plac==1.4.5 +pluggy==1.6.0 +propcache==0.3.1 +pyarrow==20.0.0 +Pygments==2.19.1 +pytest==8.4.0 python-dateutil==2.9.0.post0 -pytorch_revgrad==0.2.0 -pytz==2024.2 +pytorch-crf==0.7.2 +pytz==2025.2 PyYAML==6.0.2 regex==2024.11.6 requests==2.32.3 -s3transfer==0.11.2 -safetensors==0.5.2 
+safetensors==0.5.3 scikit-learn==1.6.1 -scipy==1.13.1 -segtok==1.5.11 -sentencepiece==0.2.0 -setuptools==75.8.0 +scipy==1.15.3 +setuptools==80.9.0 six==1.17.0 -smart-open==7.1.0 -sortedcontainers==2.4.0 -soupsieve==2.6 spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee -sqlitedict==2.1.0 -sympy==1.13.1 -tabulate==0.9.0 -termcolor==2.5.0 -threadpoolctl==3.5.0 -tokenizers==0.21.0 -torch==2.5.1 +sympy==1.14.0 +termcolor==3.1.0 +threadpoolctl==3.6.0 +tokenizers==0.21.1 +torch==2.7.1 tqdm==4.67.1 -transformer-smaller-training-vocab==0.4.0 -transformers==4.48.1 -triton==3.1.0 -typing_extensions==4.12.2 -tzdata==2025.1 -urllib3==2.3.0 +transformers==4.52.4 +triton==3.3.1 +typing_extensions==4.14.0 +tzdata==2025.2 +urllib3==2.4.0 waitress==3.0.2 -wcwidth==0.2.13 Werkzeug==3.1.3 -Wikipedia-API==0.8.1 -wrapt==1.17.2 \ No newline at end of file +xxhash==3.5.0 +yarl==1.20.0 diff --git a/requirements_gpu.txt b/requirements_gpu.txt deleted file mode 100644 index c9a1ba1..0000000 --- a/requirements_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -nvidia-cublas-cu12==12.4.5.8 -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.1.17 -nvidia-cufft-cu12==11.2.1.3 -nvidia-curand-cu12==10.3.5.147 -nvidia-cusolver-cu12==11.6.1.9 -nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.23.4 -nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py index daa5fd0..d359aa0 100644 --- a/src/lm_based_tagger/distilbert_crf.py +++ b/src/lm_based_tagger/distilbert_crf.py @@ -1,7 +1,7 @@ # distilbert_crf.py -import torch, os +import torch +from torchcrf import CRF import torch.nn as nn -from TorchCRF import CRF from transformers import DistilBertModel, DistilBertConfig class DistilBertCRFForTokenClassification(nn.Module): diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 0139d25..ea9c0f8 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -113,12 +113,6 @@ def train_lm(script_dir: str): ) # 8) Build fresh model + config for this fold - config = DistilBertConfig.from_pretrained( - "distilbert-base-uncased", - num_labels=len(LABEL_LIST), - id2label=ID2LABEL, - label2id=LABEL2ID - ) model = DistilBertCRFForTokenClassification( num_labels=len(LABEL_LIST), id2label=ID2LABEL, @@ -126,7 +120,6 @@ def train_lm(script_dir: str): pretrained_name="distilbert-base-uncased", dropout_prob=0.1 ).to(device) - model.to(device) # 9) TrainingArguments (with early stopping) if device.type == "cpu": From 27035665bdd14379a349dbee5ad620066617a592 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 16:57:22 -0400 Subject: [PATCH 39/51] Remove req that won't work on windows --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 74fc9d6..8a4e554 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,7 +54,7 @@ tokenizers==0.21.1 torch==2.7.1 tqdm==4.67.1 transformers==4.52.4 -triton==3.3.1 +# triton==3.3.1 - doesn't work on windows typing_extensions==4.14.0 tzdata==2025.2 urllib3==2.4.0 From e135cd6c8deeec976cb9039e3e53d74cddae1ded Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 17:06:56 -0400 Subject: [PATCH 40/51] Greatly reduce the requirements.txt to just the top level reqs --- requirements.txt | 56 
+----------------------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/requirements.txt b/requirements.txt index 74fc9d6..233014e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,64 +1,10 @@ -aiohappyeyeballs==2.6.1 aiohttp==3.12.9 -aiosignal==1.3.2 -attrs==25.3.0 -blinker==1.9.0 -certifi==2025.4.26 -charset-normalizer==3.4.2 -click==8.2.1 datasets==3.6.0 -dill==0.3.8 -filelock==3.18.0 Flask==3.1.1 -frozenlist==1.6.2 -fsspec==2025.3.0 -hf-xet==1.1.3 -huggingface-hub==0.32.4 -humanize==4.12.3 -idna==3.10 -iniconfig==2.1.0 -itsdangerous==2.2.0 -Jinja2==3.1.6 -joblib==1.5.1 -MarkupSafe==3.0.2 -mpmath==1.3.0 -multidict==6.4.4 -multiprocess==0.70.16 -networkx==3.5 -nltk==3.9.1 -numpy==2.2.6 -packaging==25.0 -pandas==2.2.3 -plac==1.4.5 -pluggy==1.6.0 -propcache==0.3.1 -pyarrow==20.0.0 -Pygments==2.19.1 -pytest==8.4.0 -python-dateutil==2.9.0.post0 +pipdeptree==2.26.1 pytorch-crf==0.7.2 -pytz==2025.2 -PyYAML==6.0.2 -regex==2024.11.6 -requests==2.32.3 -safetensors==0.5.3 scikit-learn==1.6.1 -scipy==1.15.3 -setuptools==80.9.0 -six==1.17.0 spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee -sympy==1.14.0 -termcolor==3.1.0 -threadpoolctl==3.6.0 -tokenizers==0.21.1 torch==2.7.1 -tqdm==4.67.1 transformers==4.52.4 -triton==3.3.1 -typing_extensions==4.14.0 -tzdata==2025.2 -urllib3==2.4.0 waitress==3.0.2 -Werkzeug==3.1.3 -xxhash==3.5.0 -yarl==1.20.0 From 059eeb01e936f45c0e85f0540a0dbc7667a83541 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 19:23:37 -0400 Subject: [PATCH 41/51] Make it so that classification report gets printed to a file --- src/lm_based_tagger/train_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index ea9c0f8..92f5c5e 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -326,6 +326,8 @@ def compute_metrics(eval_pred): print("\nFinal Evaluation on Held-Out Set:") print(classification_report(flat_true, flat_pred)) + with open('holdout_report.txt', 'w') as f: + print(classification_report(flat_true, flat_pred), file=f) # Report inference speed total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) From ecc88550f536ae1e390b73f7426aa17cf68fec07 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 19:30:54 -0400 Subject: [PATCH 42/51] Update readme --- README.md | 161 ++---------------------------------------------------- 1 file changed, 5 insertions(+), 156 deletions(-) diff --git a/README.md b/README.md index fdd99e9..3d9ac8d 100644 --- a/README.md +++ b/README.md @@ -1,162 +1,11 @@ # SCALAR Part-of-speech tagger -This the official release of the SCALAR Part-of-speech tagger -There are two ways to run the tagger. This document describes both ways. +THIS IS AN EXPERIMENTAL VERSION OF SCALAR -1. Using Docker compose (which runs the tagger's built-in server for you) -2. Running the tagger's built-in server without Docker +Install requirements via `pip install -r requirements.txt` -## Current Metrics (this will be updated every time we update/change the model!) 
-| | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | -|------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| -| **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | -| Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | -| Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | +Run via `python3 main --mode run --model_type lm_based` -## Getting Started with Docker +You can attempt to traint it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage -To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` - -Make sure you have Docker and Docker Compose installed: - -https://docs.docker.com/engine/install/ - -https://docs.docker.com/compose/install/ - -``` -git clone git@github.com:SCANL/scanl_tagger.git -cd scanl_tagger -docker compose pull -docker compose up -``` - -## Getting Started without Docker -You will need `python3.12` installed. - -You'll need to install `pip` -- https://pip.pypa.io/en/stable/installation/ - -Set up a virtual environtment: `python -m venv /tmp/tagger` -- feel free to put it somewhere else (change /tmp/tagger) if you prefer - -Activate the virtual environment: `source /tmp/tagger/bin/activate` (you can find how to activate it here if `source` does not work for you -- https://docs.python.org/3/library/venv.html#how-venvs-work) - -After it's installed and your virtual environment is activated, in the root of the repo, run `pip install -r requirements.txt` - -Finally, we require the `token` and `target` vectors from [code2vec](https://github.com/tech-srl/code2vec). The tagger will attempt to automatically download them if it doesn't find them, but you could download them yourself if you like. It will place them in your local directory under `./code2vec/*` - -## Usage - -``` -usage: main [-h] [-v] [-r] [-t] [-a ADDRESS] [--port PORT] [--protocol PROTOCOL] - [--words WORDS] - -options: - -h, --help show this help message and exit - -v, --version print tagger application version - -r, --run run server for part of speech tagging requests - -t, --train run training set to retrain the model - -a ADDRESS, --address ADDRESS - configure server address - --port PORT configure server port - --protocol PROTOCOL configure whether the server uses http or https - --words WORDS provide path to a list of acceptable abbreviations -``` - -`./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: - -http://127.0.0.1:8080/{identifier_name}/{code_context}/{database_name (optional)} - -"database name" specifies an sqlite database to be used for result caching and data collection. If the database specified does not exist, one will be created. 
- -You can check wehther or not a database exists by using the `/probe` route by sending an HTTP request like this: - -http://127.0.0.1:5000/probe/{database_name} - -"code context" is one of: -- FUNCTION -- ATTRIBUTE -- CLASS -- DECLARATION -- PARAMETER - -For example: - -Tag a declaration: ``http://127.0.0.1:8000/numberArray/DECLARATION/database`` - -Tag a function: ``http://127.0.0.1:8000/GetNumberArray/FUNCTION/database`` - -Tag an class: ``http://127.0.0.1:8000/PersonRecord/CLASS/database`` - -#### Note -Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun. - -You will need to have a way to parse code and filter out identifier names if you want to do some on-the-fly analysis of source code. We recommend [srcML](https://www.srcml.org/). Since the actual tagger is a web server, you don't have to use srcML. You could always use other AST-based code representations, or any other method of obtaining identifier information. - - -## Tagset - -**Supported Tagset** -| Abbreviation | Expanded Form | Examples | -|:------------:|:--------------------------------------------:|:--------------------------------------------:| -| N | noun | Disneyland, shoe, faucet, mother | -| DT | determiner | the, this, that, these, those, which | -| CJ | conjunction | and, for, nor, but, or, yet, so | -| P | preposition | behind, in front of, at, under, above | -| NPL | noun plural | Streets, cities, cars, people, lists | -| NM | noun modifier (**noun-adjunct**, adjective) | red, cold, hot, **bit**Set, **employee**Name | -| V | verb | Run, jump, spin, | -| VM | verb modifier (adverb) | Very, loudly, seriously, impatiently | -| D | digit | 1, 2, 10, 4.12, 0xAF | -| PRE | preamble | Gimp, GLEW, GL, G, p, m, b | - -**Penn Treebank to SCALAR tagset** - -| Penn Treebank Annotation | SCALAR Tagset | -|:---------------------------:|:------------------------:| -| Conjunction (CC) | Conjunction (CJ) | -| Digit (CD) | Digit (D) | -| Determiner (DT) | Determiner (DT) | -| Foreign Word (FW) | Noun (N) | -| Preposition (IN) | Preposition (P) | -| Adjective (JJ) | Noun Modifier (NM) | -| Comparative Adjective (JJR) | Noun Modifier (NM) | -| Superlative Adjective (JJS) | Noun Modifier (NM) | -| List Item (LS) | Noun (N) | -| Modal (MD) | Verb (V) | -| Noun Singular (NN) | Noun (N) | -| Proper Noun (NNP) | Noun (N) | -| Proper Noun Plural (NNPS) | Noun Plural (NPL) | -| Noun Plural (NNS) | Noun Plural (NPL) | -| Adverb (RB) | Verb Modifier (VM) | -| Comparative Adverb (RBR) | Verb Modifier (VM) | -| Particle (RP) | Verb Modifier (VM) | -| Symbol (SYM) | Noun (N) | -| To Preposition (TO) | Preposition (P) | -| Verb (VB) | Verb (V) | -| Verb (VBD) | Verb (V) | -| Verb (VBG) | Verb (V) | -| Verb (VBN) | Verb (V) | -| Verb (VBP) | Verb (V) | -| Verb (VBZ) | Verb (V) | - -## Training the tagger -You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This will potentially change in the future. - -## Errors? -Please make an issue if you run into errors - -# Please Cite the Paper(s)! 
- -Newman, Christian, Scholten , Brandon, Testa, Sophia, Behler, Joshua, Banabilah, Syreen, Collard, Michael L., Decker, Michael, Mkaouer, Mohamed Wiem, Zampieri, Marcos, Alomar, Eman Abdullah, Alsuhaibani, Reem, Peruma, Anthony, Maletic, Jonathan I., (2025), “SCALAR: A Part-of-speech Tagger for Identifiers”, in the Proceedings of the 33rd IEEE/ACM International Conference on Program Comprehension - Tool Demonstrations Track (ICPC), Ottawa, ON, Canada, April 27 -28, 5 pages TO APPEAR. - -Christian D. Newman, Michael J. Decker, Reem S. AlSuhaibani, Anthony Peruma, Satyajit Mohapatra, Tejal Vishnoi, Marcos Zampieri, Mohamed W. Mkaouer, Timothy J. Sheldon, and Emily Hill, "An Ensemble Approach for Annotating Source Code Identifiers with Part-of-speech Tags," in IEEE Transactions on Software Engineering, doi: 10.1109/TSE.2021.3098242. - -# Training set -The data used to train this tagger can be found in the most recent database update in the repo -- https://github.com/SCANL/scanl_tagger/blob/master/input/scanl_tagger_training_db_11_29_2024.db - -# Interested in our other work? -Find our other research [at our webpage](https://www.scanl.org/) and check out the [Identifier Name Structure Catalogue](https://github.com/SCANL/identifier_name_structure_catalogue) - -# WordNet -This project uses WordNet to perform a dictionary lookup on the individual words in each identifier: - -Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010 +It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. \ No newline at end of file From 6ea557f9216313ca0505877496507b434ac8b916 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 22:36:47 -0400 Subject: [PATCH 43/51] DRY --- src/lm_based_tagger/distilbert_crf.py | 2 - .../distilbert_preprocessing.py | 110 ++++--------- src/lm_based_tagger/distilbert_tagger.py | 147 +++--------------- 3 files changed, 53 insertions(+), 206 deletions(-) diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py index d359aa0..729c74b 100644 --- a/src/lm_based_tagger/distilbert_crf.py +++ b/src/lm_based_tagger/distilbert_crf.py @@ -52,8 +52,6 @@ def forward(self, sequence_output = self.dropout(outputs[0]) # [B, T, H] emission_scores = self.classifier(sequence_output) # [B, T, C] - seq_len = emission_scores.size(1) # original token length - # ============================== TRAINING ============================== if labels is not None: # 1. 
Drop [CLS] (idx 0) and [SEP] (idx –1) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 8ee9c4c..fee656d 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -1,5 +1,3 @@ -# distilbert_preprocessing.py - import re from nltk import pos_tag import nltk @@ -24,33 +22,40 @@ "CLASS": "@class" } +FEATURES = [ + "context", + "hungarian", + "cvr", + "digit", + "nltk" +] + +FEATURE_FUNCTIONS = { + "context": lambda row, tokens: CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown"), + "hungarian": lambda row, tokens: detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none", + "cvr": lambda row, tokens: consonant_vowel_ratio_bucket(tokens), + "digit": lambda row, tokens: detect_digit_feature(tokens), + "nltk": lambda row, tokens: "@nltk_" + '-'.join(tag.lower() for _, tag in pos_tag(tokens, tagset="universal")) +} + +def get_feature_tokens(row, tokens): + return [FEATURE_FUNCTIONS[feat](row, tokens) for feat in FEATURES] + +NUMBER_OF_FEATURES = len(FEATURES) def detect_hungarian_prefix(first_token): - """ - If the first token starts with 1–3 letters followed by an uppercase or underscore, - return "@hung_". Otherwise "@hung_none". - """ m = re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) if m: return f"@hung_{m.group(1).lower()}" return "@hung_none" - def detect_digit_feature(tokens): - """ - If any token has a digit, return "@has_digit", else "@no_digit". - """ for token in tokens: if any(char.isdigit() for char in token): return "@has_digit" return "@no_digit" - def consonant_vowel_ratio_bucket(tokens): - """ - Compute the average consonant/vowel ratio across all alphabetic tokens, - then bucket into low/mid/high. - """ def ratio(tok): tok_lower = tok.lower() num_vowels = sum(1 for c in tok_lower if c in VOWELS) @@ -69,12 +74,7 @@ def ratio(tok): else: return "@cvr_high" - def system_prefix_similarity(first_token, system_name): - """ - Compute a SequenceMatcher ratio against the system name, then bucket: - >0.9 ➔ "@sim_high", >0.6 ➔ "@sim_mid", >0.3 ➔ "@sim_low", else "@sim_none". - """ if not first_token or not system_name: return "@sim_none" sys_lower = system_name.strip().lower() @@ -89,82 +89,41 @@ def system_prefix_similarity(first_token, system_name): else: return "@sim_none" +def normalize_type(type_str): + ts = type_str.strip().lower() + ts = ts.replace("*", "_ptr") + ts = ts.replace(" ", "_") + return f"@{ts}" + +def normalize_language(lang_str): + return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") def prepare_dataset(df: pd.DataFrame, label2id: dict): - """ - Takes a DataFrame with columns: - - "tokens" : List[str] (split identifier) - - "tags" : List[str] (gold PoS tags, same length as tokens) - - "CONTEXT" : e.g. "FUNCTION", "PARAMETER", etc. - - "SYSTEM_NAME" : string - - Returns a HuggingFace `datasets.Dataset` with two fields: - - "tokens" : List[List[str]] (the FULL token sequence, including exactly 7 feature tokens + position tokens + identifier tokens) - - "ner_tags" : List[List[int]] (the aligned label IDs, with -100 in front for each feature token) - """ rows = [] for _, row in df.iterrows(): tokens = row["tokens"] tags = row["tags"] + feature_tokens = get_feature_tokens(row, tokens) - # 1. 
Build 7 feature tokens (context, system, hungarian, cvr, digit, sim, nltk) - context_token = CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown") - # system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" - hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" - cvr_token = consonant_vowel_ratio_bucket(tokens) - digit_token = detect_digit_feature(tokens) - # sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" - - # 2. NLTK POS tags (universal tagset) - nltk_tags = pos_tag(tokens, tagset="universal") - universal_tags = [tag.lower() for _, tag in nltk_tags] - nltk_feature = f"@nltk_{'-'.join(universal_tags)}" - - # 3. Position tags: interleave with identifier tokens length = len(tokens) - if length == 1: - pos_tokens = ["@pos_2"] - else: - pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] + pos_tokens = ["@pos_2"] if length == 1 else ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] - # 4. Build the “full” token list (7 feature tokens + 2*len(tokens) position‐identifier tokens) - full_tokens = [ - context_token, - # system_token, - hungarian_token, - cvr_token, - digit_token, - # sim_token, - nltk_feature, - ] + tokens_with_pos - - # 5. Build the aligned labels array: - # - First 7 entries → -100 (because they are feature tokens) - # - Then for each identifier token, [-100, label2id[tag]] + full_tokens = feature_tokens + tokens_with_pos ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [-100] * 5 + ner_tags_with_pos # ← use 0, not -100 + full_labels = [-100] * NUMBER_OF_FEATURES + ner_tags_with_pos rows.append({ - "tokens": full_tokens, + "tokens": full_tokens, "ner_tags": full_labels }) return Dataset.from_dict({ - "tokens": [r["tokens"] for r in rows], + "tokens": [r["tokens"] for r in rows], "ner_tags": [r["ner_tags"] for r in rows] }) - def tokenize_and_align_labels(example, tokenizer): - """ - example: a dict with - - "tokens" : List[str] (the full token sequence, including exactly 7 feature tokens) - - "ner_tags" : List[int] (same length as above) - - We run `tokenizer(example["tokens"], is_split_into_words=True, truncation=True)`, - then align `word_ids()` with `example["ner_tags"]` - """ tokenized = tokenizer( example["tokens"], truncation=True, @@ -180,7 +139,6 @@ def tokenize_and_align_labels(example, tokenizer): elif word_id < len(example["ner_tags"]): labels.append(example["ner_tags"][word_id]) else: - # Just in case of truncation labels.append(-100) tokenized["labels"] = labels diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 8847487..394c340 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -1,84 +1,8 @@ -import re import torch from nltk import pos_tag -import nltk -from difflib import SequenceMatcher from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification from .distilbert_crf import DistilBertCRFForTokenClassification - -# Make sure we have the same NLTK tagset -nltk.download('averaged_perceptron_tagger_eng', quiet=True) -nltk.download('universal_tagset', quiet=True) - -VOWELS = set("aeiou") -CONTEXT_MAP = { - "FUNCTION": "@func", - "PARAMETER": "@param", - "ATTRIBUTE": "@attr", - "DECLARATION": "@decl", - "CLASS": "@class" -} - - -def detect_hungarian_prefix(first_token): - m = 
re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) - if m: - return f"@hung_{m.group(1).lower()}" - return "@hung_none" - - -def detect_digit_feature(tokens): - for token in tokens: - if any(char.isdigit() for char in token): - return "@has_digit" - return "@no_digit" - - -def consonant_vowel_ratio_bucket(tokens): - def ratio(tok): - tok_lower = tok.lower() - num_vowels = sum(1 for c in tok_lower if c in VOWELS) - num_consonants = sum(1 for c in tok_lower if c.isalpha() and c not in VOWELS) - return num_consonants / (num_vowels + 1e-5) - - ratios = [ratio(tok) for tok in tokens if tok.isalpha()] - if not ratios: - return "@cvr_none" - avg_ratio = sum(ratios) / len(ratios) - if avg_ratio < 1.5: - return "@cvr_low" - elif avg_ratio < 3.0: - return "@cvr_mid" - else: - return "@cvr_high" - - -def normalize_type(type_str): - ts = type_str.strip().lower() - ts = ts.replace("*", "_ptr") - ts = ts.replace(" ", "_") - return f"@{ts}" - - -def normalize_language(lang_str): - return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") - - -def system_prefix_similarity(first_token, system_name): - if not first_token or not system_name: - return "@sim_none" - sys_lower = system_name.strip().lower() - tok_lower = first_token.strip().lower() - r = SequenceMatcher(None, tok_lower, sys_lower).ratio() - if r > 0.9: - return "@sim_high" - elif r > 0.6: - return "@sim_mid" - elif r > 0.3: - return "@sim_low" - else: - return "@sim_none" - +from .distilbert_preprocessing import * class DistilBertTagger: def __init__(self, model_path: str, local: bool = False): @@ -99,48 +23,24 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): 3) Run the model, take argmax over token logits 4) Align via `word_ids()`, skipping: - Any word_id = None - - Any word_id < 9 (because first 9 tokens were “feature tokens” => labels = -100) + - Any word_id < N (number of feature tokens) => labels = -100 - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier-word)” pair) 5) Return a list of string labels by mapping numeric IDs through `self.id2label`. """ + row = { + "CONTEXT": context, + "SYSTEM_NAME": system_name, + "TYPE": type_str, + "LANGUAGE": language + } + feature_tokens = get_feature_tokens(row, tokens) - # 1. 
Re–compute exactly the same feature tokens as in training: - context_token = CONTEXT_MAP.get(context.strip().upper(), "@unknown") - system_token = f"@system_{system_name.strip().lower().replace(' ', '_')}" - hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" - cvr_token = consonant_vowel_ratio_bucket(tokens) - digit_token = detect_digit_feature(tokens) - sim_token = system_prefix_similarity(tokens[0], system_name) if tokens else "@sim_none" - type_token = normalize_type(type_str) - lang_token = normalize_language(language) - - # Position tags for each identifier token length = len(tokens) - if length == 1: - pos_tokens = ["@pos_2"] - else: - pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] - - # NLTK POS feature - nltk_tags = pos_tag(tokens, tagset="universal") - universal_tags = [tag.lower() for _, tag in nltk_tags] - nltk_feature = f"@nltk_{'-'.join(universal_tags)}" - - # Interleave pos_tokens + identifier tokens + pos_tokens = ["@pos_2"] if length == 1 else ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] - # Build the full input token sequence (exactly what training saw): - input_tokens = [ - context_token, - # system_token, - hungarian_token, - cvr_token, - digit_token, - # sim_token, - nltk_feature, - ] + tokens_with_pos + input_tokens = feature_tokens + tokens_with_pos - # 2. Tokenize encoded = self.tokenizer( input_tokens, is_split_into_words=True, @@ -149,44 +49,35 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): padding=True ) - # ─── 3. Inference ─────────────────────────────────────────── with torch.no_grad(): out = self.model( input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"], ) - # One label per *input* token - if isinstance(out, dict) and "predictions" in out: # CRF path - labels_per_token = out["predictions"][0] # list[int] - else: # logits + if isinstance(out, dict) and "predictions" in out: + labels_per_token = out["predictions"][0] + else: logits = out[0] if isinstance(out, (tuple, list)) else out labels_per_token = torch.argmax(logits, dim=-1).squeeze().tolist() - # ─── 4. Re‑align to identifier words ────────────────────── pred_labels, previous_word_idx = [], None - word_ids = encoded.word_ids() # same length as labels_per_token + word_ids = encoded.word_ids() for idx, word_idx in enumerate(word_ids): - # a) skip special tokens ([CLS]/[SEP]) if word_idx is None: continue - # b) skip the 7 leading feature tokens - if word_idx < 5: + if word_idx < NUMBER_OF_FEATURES: continue - # c) skip every @pos_* placeholder (@pos tokens sit at even - # offsets after the 7 features: 7,9,11, … so (w‑7)%2 == 0) - if (word_idx - 5) % 2 == 0: + if (word_idx - NUMBER_OF_FEATURES) % 2 == 0: continue - # d) skip duplicate word‑pieces if word_idx == previous_word_idx: continue - label_idx = idx - 1 # shift because [CLS] was removed + label_idx = idx - 1 if label_idx < len(labels_per_token): pred_labels.append(labels_per_token[label_idx]) previous_word_idx = word_idx - # Map numeric IDs → tag strings pred_tag_strings = [self.id2label[i] for i in pred_labels] - return pred_tag_strings \ No newline at end of file + return pred_tag_strings From dc5c8a4dfece3a2b769971cff5ea445239e2ae21 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 8 Jun 2025 02:08:07 -0400 Subject: [PATCH 44/51] Remove reliance on NLTK. 
Does not reduce effectiveness of the model, and actually makes it faster --- .../distilbert_preprocessing.py | 2 +- src/lm_based_tagger/train_model.py | 40 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index fee656d..bd10c2b 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -27,7 +27,7 @@ "hungarian", "cvr", "digit", - "nltk" + #"nltk" ] FEATURE_FUNCTIONS = { diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 92f5c5e..2e11a7a 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -48,6 +48,9 @@ LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} ID2LABEL = {i: label for label, i in LABEL2ID.items()} +def dual_print(*args, file, **kwargs): + print(*args, **kwargs) # stdout + print(*args, file=file, **kwargs) # file def train_lm(script_dir: str): # 1) Paths @@ -276,6 +279,7 @@ def compute_metrics(eval_pred): best_macro_f1 = fold_macro_f1 best_model_dir = os.path.join(output_dir, "best_model") trainer.save_model(best_model_dir) + model.config.save_pretrained(best_model_dir) tokenizer.save_pretrained(best_model_dir) fold += 1 @@ -324,24 +328,6 @@ def compute_metrics(eval_pred): if l != -100 ] - print("\nFinal Evaluation on Held-Out Set:") - print(classification_report(flat_true, flat_pred)) - with open('holdout_report.txt', 'w') as f: - print(classification_report(flat_true, flat_pred), file=f) - - # Report inference speed - total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) - total_examples = len(val_dataset) - elapsed = end_time - start_time - print(f"\nInference Time: {elapsed:.2f}s for {total_examples} identifiers ({total_tokens} tokens)") - print(f"Tokens/sec: {total_tokens / elapsed:.2f}") - print(f"Identifiers/sec: {total_examples / elapsed:.2f}") - - final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") - print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}") - final_accuracy = accuracy_score(flat_true, flat_pred) - print(f"Final Token-level Accuracy on Held-Out Set: {final_accuracy:.4f}") - # 18) Write hold-out predictions to CSV so that each row contains # (tokens, true_tags, pred_tags) for sanity checking. 
from .distilbert_tagger import DistilBertTagger @@ -377,4 +363,20 @@ def compute_metrics(eval_pred): df = pd.read_csv(os.path.join(output_dir, "holdout_predictions.csv")) df["row_correct"] = df["true_tags"] == df["pred_tags"] id_level_acc = df["row_correct"].mean() - print(f"Final Identifier-level Accuracy on Held-Out Set: {id_level_acc:.4f}") \ No newline at end of file + + # Report inference speed + total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) + total_examples = len(val_dataset) + elapsed = end_time - start_time + final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") + final_accuracy = accuracy_score(flat_true, flat_pred) + print("\nFinal Evaluation on Held-Out Set:") + with open('holdout_report.txt', 'w') as f: + report = classification_report(flat_true, flat_pred) + dual_print(report, file=f) + dual_print(f"\nInference Time: {elapsed:.2f}s for {total_examples} identifiers ({total_tokens} tokens)", file=f) + dual_print(f"Tokens/sec: {total_tokens / elapsed:.2f}", file=f) + dual_print(f"Identifiers/sec: {total_examples / elapsed:.2f}", file=f) + dual_print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}", file=f) + dual_print(f"Final Token-level Accuracy on Held-Out Set: {final_accuracy:.4f}", file=f) + dual_print(f"Final Identifier-level Accuracy on Held-Out Set: {id_level_acc:.4f}", file=f) \ No newline at end of file From f0671866408a62d2626803232b93063363cdcf0e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 8 Jun 2025 02:19:12 -0400 Subject: [PATCH 45/51] Add current metrics --- README.md | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3d9ac8d..2bf4b7f 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,44 @@ Install requirements via `pip install -r requirements.txt` Run via `python3 main --mode run --model_type lm_based` -You can attempt to traint it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage +You can attempt to train it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage -It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. \ No newline at end of file +It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. 
+
+## Evaluation Results (Held-Out Set)
+
+### Per-Class Metrics
+
+| Label | Precision | Recall | F1-Score | Support |
+|-------|-----------|--------|----------|---------|
+| CJ    | 0.88      | 0.88   | 0.88     | 8       |
+| D     | 0.98      | 0.96   | 0.97     | 52      |
+| DT    | 0.95      | 0.93   | 0.94     | 45      |
+| N     | 0.94      | 0.94   | 0.94     | 418     |
+| NM    | 0.91      | 0.93   | 0.92     | 440     |
+| NPL   | 0.97      | 0.97   | 0.97     | 79      |
+| P     | 0.94      | 0.92   | 0.93     | 79      |
+| PRE   | 0.79      | 0.79   | 0.79     | 68      |
+| V     | 0.89      | 0.84   | 0.86     | 110     |
+| VM    | 0.79      | 0.85   | 0.81     | 13      |
+
+### Aggregate Metrics
+
+| Metric           | Score |
+|------------------|-------|
+| Accuracy         | 0.92  |
+| Macro Avg F1     | 0.90  |
+| Weighted Avg F1  | 0.92  |
+| Total Examples   | 1312  |
+
+### Inference Statistics
+
+- **Inference Time:** 1.74s for 392 identifiers (3746 tokens)
+- **Tokens/sec:** 2157.78
+- **Identifiers/sec:** 225.80
+
+### Final Scores
+
+- **Final Macro F1 on Held-Out Set:** 0.9032
+- **Final Token-level Accuracy:** 0.9223
+- **Final Identifier-level Accuracy:** 0.8291

From 26857a12c4336bb590d9f33ebbcf6082dc55c908 Mon Sep 17 00:00:00 2001
From: Christian Newman
Date: Mon, 9 Jun 2025 22:43:27 -0400
Subject: [PATCH 46/51] Tested tree and lm based run and train. Did some thorough documenting on how the nn code works.

---
 main                                      |   3 +-
 requirements.txt                          |   3 +-
 src/lm_based_tagger/distilbert_crf.py     | 121 ++++++---
 .../distilbert_preprocessing.py           |  73 +++++-
 src/lm_based_tagger/distilbert_tagger.py  |  60 +++--
 src/lm_based_tagger/train_model.py        | 230 ++++++++++++------
 6 files changed, 349 insertions(+), 141 deletions(-)

diff --git a/main b/main
index 83cfb18..536cb71 100755
--- a/main
+++ b/main
@@ -67,12 +67,11 @@ if __name__ == "__main__":
             download_files()
             train_tree(config)
         elif args.model_type == "lm_based":
-            download_files()
             train_lm(SCRIPT_DIR)
 
     elif args.mode == "run":
         if args.model_type == "tree_based":
-            config = load_config_tree()
+            config = load_config_tree(SCRIPT_DIR)
             # Inject overrides
             download_files()
             config["model_type"] = args.model_type
diff --git a/requirements.txt b/requirements.txt
index 233014e..7ed1a05 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,5 +6,6 @@ pytorch-crf==0.7.2
 scikit-learn==1.6.1
 spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee
 torch==2.7.1
-transformers==4.52.4
 waitress==3.0.2
+gensim==4.3.3
+transformers[torch]
diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py
index 729c74b..e1007c5 100644
--- a/src/lm_based_tagger/distilbert_crf.py
+++ b/src/lm_based_tagger/distilbert_crf.py
@@ -6,16 +6,39 @@ class DistilBertCRFForTokenClassification(nn.Module):
     """
-    DistilBERT ➜ dropout ➜ linear projection ➜ CRF.
-    The CRF layer models label‑to‑label transitions, so the model
-    is optimised at *sequence* level rather than *token* level.
+    Token-level classifier that combines DistilBERT with a CRF layer for structured prediction.
+
+    Architecture:
+        input_ids, attention_mask
+            ↓
+        DistilBERT (pretrained encoder)
+            ↓
+        Dropout
+            ↓
+        Linear layer (projects hidden size → num_labels)
+            ↓
+        CRF layer (models sequence-level transitions)
+
+    Training:
+        - Uses negative log-likelihood from CRF as loss.
+        - Learns both emission scores (token-level confidence) and
+          transition scores (label-to-label sequence consistency).
+
+    Inference:
+        - Uses Viterbi decoding to predict the most likely sequence of labels.
+ + Output: + During training: + {"loss": ..., "logits": ...} + During inference: + {"logits": ..., "predictions": List[List[int]]} + + Example input shape: + input_ids: [B, T] — e.g. [16, 128] + attention_mask: [B, T] — 1 for real tokens, 0 for padding + logits: [B, T, C] — C = number of label classes """ - def __init__(self, - num_labels: int, - id2label: dict, - label2id: dict, - pretrained_name: str = "distilbert-base-uncased", - dropout_prob: float = 0.1): + def __init__(self, num_labels: int, id2label: dict, label2id: dict, pretrained_name: str = "distilbert-base-uncased", dropout_prob: float = 0.1): super().__init__() self.config = DistilBertConfig.from_pretrained( @@ -29,11 +52,34 @@ def __init__(self, self.classifier = nn.Linear(self.config.hidden_size, num_labels) self.crf = CRF(num_labels, batch_first=True) - def forward(self, - input_ids=None, - attention_mask=None, - labels=None, - **kwargs): + def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs): + """ + Forward pass for training or inference. + + Args: + input_ids (Tensor): Token IDs of shape [B, T] + attention_mask (Tensor): Attention mask of shape [B, T] + labels (Tensor, optional): Ground-truth labels of shape [B, T]. Required during training. + kwargs: Any additional DistilBERT-compatible inputs (e.g., head_mask, position_ids, etc.) + + Returns: + If labels are provided (training mode): + dict with: + - loss (Tensor): scalar negative log-likelihood from CRF + - logits (Tensor): emission scores of shape [B, T, C] + + If labels are not provided (inference mode): + dict with: + - logits (Tensor): emission scores of shape [B, T, C] + - predictions (List[List[int]]): decoded label IDs from CRF, + one list per sequence, + each of length T-2 (excluding [CLS] and [SEP]) + + Notes: + - logits: [B, T, C], where B = batch size, T = sequence length, C = number of label classes + - predictions: List[List[int]], where each inner list has length T-2 + (i.e., excludes [CLS] and [SEP]) and contains Viterbi-decoded label IDs + """ # Hugging Face occasionally injects helper fields (e.g. num_items_in_batch) # Filter `kwargs` down to what DistilBertModel.forward actually accepts. @@ -48,36 +94,49 @@ def forward(self, attention_mask=attention_mask, **bert_kwargs, ) - # —— Build emissions once —————————————————————————————— - sequence_output = self.dropout(outputs[0]) # [B, T, H] - emission_scores = self.classifier(sequence_output) # [B, T, C] + # 1) Compute per-token emission scores + # Applies dropout to the BERT hidden states, then projects them to label logits. + # Shape: [B, T, C], where B=batch size, T=sequence length, C=number of classes + sequence_output = self.dropout(outputs[0]) + emission_scores = self.classifier(sequence_output) - # ============================== TRAINING ============================== if labels is not None: - # 1. Drop [CLS] (idx 0) and [SEP] (idx –1) - emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] - tags = labels[:, 1:-1].clone() # [B, T‑2] - crf_mask = (tags != -100) # True = keep + # 2) Remove [CLS] and [SEP] special tokens from emissions and labels + # These tokens were added by the tokenizer but are not part of the identifier + emissions = emission_scores[:, 1:-1, :] # [B, T-2, C] + tags = labels[:, 1:-1].clone() # [B, T-2] - # 2. 
For any position that’s masked‑off ➜ set tag to a valid id (0) + # 3) Create a mask: True where label is valid, False where label == -100 + # The CRF will use this to ignore special/padded tokens + crf_mask = (tags != -100) + + # 4) Replace invalid label positions (-100) with a dummy label (e.g., 0) + # This is required because CRF expects a label at every position, even if masked tags[~crf_mask] = 0 - # 3. Guarantee first timestep is ON for every sequence + # 5) Ensure the first token of every sequence is active in the CRF mask + # This avoids CRF errors when the first token is masked out (which breaks decoding) first_off = (~crf_mask[:, 0]).nonzero(as_tuple=True)[0] if len(first_off): - crf_mask[first_off, 0] = True # flip mask to ON - tags[first_off, 0] = 0 # give it tag 0 + crf_mask[first_off, 0] = True + tags[first_off, 0] = 0 # assign a dummy label + # 6) Compute CRF negative log-likelihood loss loss = -self.crf(emissions, tags, mask=crf_mask, reduction="mean") return {"loss": loss, "logits": emission_scores} - # ============================= INFERENCE ============================== else: - crf_mask = attention_mask[:, 1:-1].bool() # [B, T‑2] - emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] + # INFERENCE MODE + + # 2) Remove [CLS] and [SEP] from emissions and build CRF mask from attention + # Only use the inner content of the input sequence + crf_mask = attention_mask[:, 1:-1].bool() # [B, T-2] + emissions = emission_scores[:, 1:-1, :] # [B, T-2, C] + + # 3) Run Viterbi decoding to get best label sequence for each input best_paths = self.crf.decode(emissions, mask=crf_mask) - return {"logits": emission_scores, - "predictions": best_paths} + return {"logits": emission_scores, "predictions": best_paths} + @classmethod def from_pretrained(cls, ckpt_dir, local=False, **kw): from safetensors.torch import load_file as load_safe_file diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index bd10c2b..f386c28 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -1,14 +1,8 @@ import re -from nltk import pos_tag -import nltk from difflib import SequenceMatcher import pandas as pd from datasets import Dataset -# Download once (we’ll just do it quietly here) -nltk.download('averaged_perceptron_tagger_eng', quiet=True) -nltk.download('universal_tagset', quiet=True) - # === Constants === VOWELS = set("aeiou") LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} @@ -27,7 +21,6 @@ "hungarian", "cvr", "digit", - #"nltk" ] FEATURE_FUNCTIONS = { @@ -35,7 +28,6 @@ "hungarian": lambda row, tokens: detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none", "cvr": lambda row, tokens: consonant_vowel_ratio_bucket(tokens), "digit": lambda row, tokens: detect_digit_feature(tokens), - "nltk": lambda row, tokens: "@nltk_" + '-'.join(tag.lower() for _, tag in pos_tag(tokens, tagset="universal")) } def get_feature_tokens(row, tokens): @@ -99,6 +91,38 @@ def normalize_language(lang_str): return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") def prepare_dataset(df: pd.DataFrame, label2id: dict): + """ + Converts a DataFrame of identifier tokens and grammar tags into a HuggingFace Dataset + formatted for NER training with feature and position tokens. 
+ + Each row in the input DataFrame should contain: + - tokens: List[str] (e.g., ['get', 'Employee', 'Name']) + - tags: List[str] (e.g., ['V', 'NM', 'N']) + - CONTEXT: str (e.g., 'function') + + The function adds: + - Feature tokens: ['@hung_get', '@no_digit', '@cvr_mid', '@func'] + - Interleaved position and real tokens: + ['@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name'] + + The NER tags are aligned so that: + - Feature tokens and position markers get label -100 (ignored in loss) + - Real tokens are converted from grammar tags using `label2id` + + Example Input: + df = pd.DataFrame([{ + "tokens": ["get", "Employee", "Name"], + "tags": ["V", "NM", "N"], + "CONTEXT": "function" + }]) + + Example Output: + Dataset with: + tokens: ['@hung_get', '@no_digit', '@cvr_mid', '@func', + '@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name'] + ner_tags: [-100, -100, -100, -100, + -100, 1, -100, 2, -100, 3] # assuming label2id = {"V": 1, "NM": 2, "N": 3} + """ rows = [] for _, row in df.iterrows(): tokens = row["tokens"] @@ -123,9 +147,34 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): "ner_tags": [r["ner_tags"] for r in rows] }) -def tokenize_and_align_labels(example, tokenizer): +def tokenize_and_align_labels(sample, tokenizer): + """ + Tokenizes an example and aligns NER labels with subword tokens. + + The input `example` comes from `prepare_dataset()` and contains: + - tokens: List[str], including feature and position tokens + - ner_tags: List[int], aligned with `tokens`, with -100 for ignored tokens + + This function: + - Uses `is_split_into_words=True` to tokenize each item in `tokens` + - Uses `tokenizer.word_ids()` to map each subword back to its original token index + - Assigns the corresponding label (or -100) for each subword token + + Example Input: + example = { + "tokens": ['@hung_get', '@no_digit', '@cvr_mid', '@func', + '@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name'], + "ner_tags": [-100, -100, -100, -100, + -100, 1, -100, 2, -100, 3] + } + + Assuming 'Employee' is tokenized to ['Em', '##ployee'], + Example Output: + tokenized["labels"] = [-100, -100, -100, -100, + -100, 1, -100, 2, 2, -100, 3] + """ tokenized = tokenizer( - example["tokens"], + sample["tokens"], truncation=True, is_split_into_words=True ) @@ -136,8 +185,8 @@ def tokenize_and_align_labels(example, tokenizer): for word_id in word_ids: if word_id is None: labels.append(-100) - elif word_id < len(example["ner_tags"]): - labels.append(example["ner_tags"][word_id]) + elif word_id < len(sample["ner_tags"]): + labels.append(sample["ner_tags"][word_id]) else: labels.append(-100) diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 394c340..9f89d00 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -1,31 +1,53 @@ import torch -from nltk import pos_tag from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification from .distilbert_crf import DistilBertCRFForTokenClassification from .distilbert_preprocessing import * class DistilBertTagger: + """ + A lightweight wrapper around a DistilBERT+CRF or DistilBERT-only model for tagging identifier tokens + with part-of-speech-like grammar labels (e.g., V, NM, N, etc.). 
+ + Automatically handles: + - Tokenization (with custom feature and position tokens) + - Running the model + - Post-processing the raw logits or CRF predictions + - Aligning subword tokens back to word-level predictions + """ def __init__(self, model_path: str, local: bool = False): + # Load tokenizer from local directory or remote HuggingFace path self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, local_files_only=local) + # Try loading CRF-enhanced model; fallback to plain classifier if not available try: self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path, local=local) except Exception: self.model = DistilBertForTokenClassification.from_pretrained(model_path, local_files_only=local) + # disable dropout, etc. for inference self.model.eval() + + # map label IDs to strings self.id2label = {int(k): v for k, v in self.model.config.id2label.items()} def tag_identifier(self, tokens, context, type_str, language, system_name): """ - 1) Build the “feature tokens + position tokens + identifier tokens” sequence - 2) Tokenize with `is_split_into_words=True` - 3) Run the model, take argmax over token logits - 4) Align via `word_ids()`, skipping: - - Any word_id = None - - Any word_id < N (number of feature tokens) => labels = -100 - - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier-word)” pair) - 5) Return a list of string labels by mapping numeric IDs through `self.id2label`. + Tag a split identifier using the model, returning a sequence of grammar pattern labels (e.g., ["V", "NM", "N"]). + + Steps: + 1) Build full input token list: + [feature tokens] + [@pos_0, w1, @pos_1, w2, ..., @pos_2, wn] + 2) Tokenize using HuggingFace tokenizer with is_split_into_words=True + 3) Run the model forward pass (handles CRF or logits automatically) + 4) Use word_ids() to align predictions back to full words + - Skip special tokens (None) + - Skip feature tokens (index < NUMBER_OF_FEATURES) + - Use only the *second* token in each [@pos_X, word] pair (the word) + - Skip repeated subword tokens (only use the first subtoken per word) + 5) Return a list of string labels corresponding to the original identifier tokens. 
+ + Returns: + List[str]: a list of grammar tags (e.g., ['V', 'NM', 'N']) aligned to `tokens` """ row = { "CONTEXT": context, @@ -33,6 +55,8 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): "TYPE": type_str, "LANGUAGE": language } + + # Step 1: Feature tokens + alternating position/word tokens feature_tokens = get_feature_tokens(row, tokens) length = len(tokens) @@ -41,6 +65,7 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): input_tokens = feature_tokens + tokens_with_pos + # Step 2: Tokenize using word-alignment aware tokenizer encoded = self.tokenizer( input_tokens, is_split_into_words=True, @@ -49,35 +74,40 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): padding=True ) + # Step 3: Forward pass with torch.no_grad(): out = self.model( input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"], ) + # Step 4: Get predictions depending on model type (CRF vs logits) if isinstance(out, dict) and "predictions" in out: labels_per_token = out["predictions"][0] else: logits = out[0] if isinstance(out, (tuple, list)) else out labels_per_token = torch.argmax(logits, dim=-1).squeeze().tolist() + # Step 5: Convert subtoken-level predictions to word-level predictions pred_labels, previous_word_idx = [], None word_ids = encoded.word_ids() for idx, word_idx in enumerate(word_ids): if word_idx is None: - continue + continue # special token (CLS, SEP, PAD, etc.) if word_idx < NUMBER_OF_FEATURES: - continue + continue # feature tokens (shouldn't be labeled) if (word_idx - NUMBER_OF_FEATURES) % 2 == 0: - continue + continue # position tokens (e.g., @pos_0) if word_idx == previous_word_idx: - continue - + continue # skip repeated subword tokens + + # Heuristic: labels lag by 1 position relative to input_ids label_idx = idx - 1 if label_idx < len(labels_per_token): pred_labels.append(labels_per_token[label_idx]) previous_word_idx = word_idx - + + # Step 6: Map label IDs back to string labels pred_tag_strings = [self.id2label[i] for i in pred_labels] return pred_tag_strings diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 2e11a7a..49cd92d 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -14,8 +14,6 @@ Trainer, TrainingArguments, DistilBertTokenizerFast, - DistilBertConfig, - DistilBertForTokenClassification, DataCollatorForTokenClassification, EarlyStoppingCallback ) @@ -52,7 +50,120 @@ def dual_print(*args, file, **kwargs): print(*args, **kwargs) # stdout print(*args, file=file, **kwargs) # file + +# 11) compute_metrics function (macro-F1) +def compute_metrics(eval_pred): + """ + Computes macro-F1, token-level accuracy, and identifier-level accuracy. + + Supports both: + - Raw logits from the model (shape [B, T, C]) + - Viterbi-decoded label paths from CRF models (List[List[int]]) + + Args: + eval_pred: Either a tuple (preds, labels) or a HuggingFace EvalPrediction object. 
+ `preds` can be: + • [B, T, C] logits (e.g., output of a classifier head) + • [B, T] label IDs + • List[List[int]] variable-length decoded paths (CRF) + + Returns: + dict with: + - "eval_macro_f1": F1 averaged over classes (not tokens) + - "eval_token_accuracy": token-level accuracy (ignores -100) + - "eval_identifier_accuracy": percentage of rows where all tokens matched + + Example (logits of shape [B=2, T=3, C=4]): + preds = np.array([ + [ # Example 1 (B=0) + [0.1, 2.5, 0.3, -1.0], # Token 1 → class 1 (NM) + [1.5, 0.4, 0.2, -0.5], # Token 2 → class 0 (V) + [0.3, 0.1, 3.2, 0.0], # Token 3 → class 2 (N) + ], + [ # Example 2 (B=1) + [0.2, 0.1, 0.4, 2.1], # Token 1 → class 3 (P) + [0.9, 1.0, 0.3, 0.0], # Token 2 → class 1 (NM) + [1.1, 1.1, 1.1, 1.1], # Token 3 → tie (say model picks class 0) + ] + ]) + + Converted via argmax(preds, axis=-1): + → [[1, 0, 2], # Example 1 predictions + [3, 1, 0]] # Example 2 predictions + + Gold: [V, NM, N] → label_row = [-100, 1, -100, 2, -100, 3] + Pred: [V, NM, N] → pred_row = [1, 2, 3] + All tokens match → example_correct = True + """ + # 1) Extract predictions and labels + if isinstance(eval_pred, tuple): # older HuggingFace versions + preds, labels = eval_pred + else: # EvalPrediction object + preds = eval_pred.predictions + labels = eval_pred.label_ids + + # 2) Normalize predictions format + # Convert [B, T, C] logits → [B, T] class IDs + if isinstance(preds, np.ndarray) and preds.ndim == 3: + preds = np.argmax(preds, axis=-1) + # Convert CRF list-of-lists → numpy object array + elif isinstance(preds, list): + preds = np.array(preds, dtype=object) + + # 3) Compare predictions to labels, ignoring -100 + all_true, all_pred, id_correct_flags = [], [], [] + + for pred_row, label_row in zip(preds, labels): + ptr = 0 + example_correct = True + + for lbl in label_row: # iterate gold labels + if lbl == -100: # skip padding / specials + continue + + # pick the corresponding prediction + if isinstance(pred_row, (list, np.ndarray)): + pred_lbl = pred_row[ptr] + else: # pred_row is scalar + pred_lbl = pred_row + ptr += 1 + + all_true.append(lbl) + all_pred.append(pred_lbl) + if pred_lbl != lbl: + example_correct = False + + id_correct_flags.append(example_correct) + + # 4) Compute metrics from flattened predictions + macro_f1 = f1_score(all_true, all_pred, average="macro") + token_acc = accuracy_score(all_true, all_pred) + id_acc = float(sum(id_correct_flags)) / len(id_correct_flags) + + return { + "eval_macro_f1": macro_f1, + "eval_token_accuracy": token_acc, + "eval_identifier_accuracy": id_acc, + } + def train_lm(script_dir: str): + """ + Trains a DistilBERT+CRF model using k-fold cross-validation for token-level grammar tagging. + Performs model selection based on macro F1 score, and evaluates the best model on a final hold-out set. + + Input TSV must contain: + - SPLIT: tokenized identifier as space-separated subtokens (e.g., "get Employee Name") + - GRAMMAR_PATTERN: space-separated labels (e.g., "V NM N") + - CONTEXT: usage context string (e.g., FUNCTION, PARAMETER, ...) 
+ + Example input row: + SPLIT="get Employee Name", GRAMMAR_PATTERN="V NM N", CONTEXT="FUNCTION" + + Output: + - Trained model checkpoints (best fold + final eval) + - Hold-out predictions and metrics (saved to output/holdout_predictions.csv) + - Text report of macro-F1, token-level and identifier-level accuracy + """ # 1) Paths input_path = os.path.join(script_dir, "input", "tagger_data.tsv") output_dir = os.path.join(script_dir, "output") @@ -107,11 +218,11 @@ def train_lm(script_dir: str): # 7c) Tokenize + align labels (exactly as before) tokenized_train = fold_train_dataset.map( - lambda ex: tokenize_and_align_labels(ex, tokenizer), + lambda sample: tokenize_and_align_labels(sample, tokenizer), batched=False ) tokenized_test = fold_test_dataset.map( - lambda ex: tokenize_and_align_labels(ex, tokenizer), + lambda sample: tokenize_and_align_labels(sample, tokenizer), batched=False ) @@ -169,74 +280,18 @@ def train_lm(script_dir: str): dataloader_pin_memory=False ) - # 10) Data collator (dynamic padding) + # 10) Define collator that handles dynamic padding + label alignment + # For example, if two tokenized examples have: + # input_ids = [[101, 2121, 5661, 2171, 102], [101, 2064, 102]] + # the collator will pad them to the same length and align + # their attention_mask and labels accordingly. data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) - # 11) compute_metrics function (macro-F1) - - def compute_metrics(eval_pred): - """ - Works for both: - • Plain classifier logits → argmax along last dim - • CRF Viterbi paths (list/2‑D ndarray) → use directly - Returns: - - eval_macro_f1 - - eval_token_accuracy - - eval_identifier_accuracy - """ - # ── 1. Unpack ──────────────────────────────────────────────────── - if isinstance(eval_pred, tuple): # older HF (<4.38) - preds, labels = eval_pred - else: # EvalPrediction obj - preds = eval_pred.predictions - labels = eval_pred.label_ids - - # ── 2. Convert logits → label IDs if needed ───────────────────── - # * 3‑D tensor : [B, T, C] → argmax(C) - # * 2‑D tensor : already IDs - # * list/obj‑nd : variable‑length decode paths - if isinstance(preds, np.ndarray) and preds.ndim == 3: - preds = np.argmax(preds, axis=-1) # [B, T] - elif isinstance(preds, list): - preds = np.array(preds, dtype=object) # each row is a list - - # ── 3. Accumulate token & identifier stats ────────────────────── - all_true, all_pred, id_correct_flags = [], [], [] - - for pred_row, label_row in zip(preds, labels): - ptr = 0 - example_correct = True - - for lbl in label_row: # iterate gold labels - if lbl == -100: # skip padding / specials - continue - - # pick the corresponding prediction - if isinstance(pred_row, (list, np.ndarray)): - pred_lbl = pred_row[ptr] - else: # pred_row is scalar - pred_lbl = pred_row - ptr += 1 - - all_true.append(lbl) - all_pred.append(pred_lbl) - if pred_lbl != lbl: - example_correct = False - - id_correct_flags.append(example_correct) - - # ── 4. Metrics ────────────────────────────────────────────────── - macro_f1 = f1_score(all_true, all_pred, average="macro") - token_acc = accuracy_score(all_true, all_pred) - id_acc = float(sum(id_correct_flags)) / len(id_correct_flags) - - return { - "eval_macro_f1": macro_f1, - "eval_token_accuracy": token_acc, - "eval_identifier_accuracy": id_acc, - } - - # 12) Trainer for this fold (with EarlyStopping) + + # 11) Initialize Trainer for this fold with early stopping + # Trainer handles batching, optimizer, eval, LR scheduling, logging, etc. 
+ # We also assign the tokenizer to `trainer.tokenizer` so that + # it is correctly saved with the model and used during predict(). trainer = Trainer( model=model, args=training_args, @@ -250,10 +305,19 @@ def compute_metrics(eval_pred): # Avoid deprecation warning (explicitly set tokenizer on trainer) trainer.tokenizer = tokenizer - # 13) Train this fold + # 12) Train model on this fold + # During training, the CRF computes loss using both: + # - emission scores (per-token label likelihoods from DistilBERT) + # - transition scores (likelihoods of label sequences) + # It uses the Viterbi algorithm to find the most likely label path + # and compares it to the true label sequence to compute loss. trainer.train() - # 14) Evaluate on this fold’s held-out split + + # 13) Evaluate fold performance on validation split + # We run inference and obtain predictions as either logits (softmax) or Viterbi-decoded paths. + # Here, since we use CRF; 'preds_logits' contains Viterbi sequences of label IDs. + # We then flatten and decode both true and predicted labels for macro-F1 calculation. preds_logits, labels, _ = trainer.predict(tokenized_test) preds = np.argmax(preds_logits, axis=-1) @@ -264,6 +328,7 @@ def compute_metrics(eval_pred): for (l, p) in zip(sent_labels, sent_preds) if l != -100 ] + pred_labels_list = [ ID2LABEL[p] for sent_labels, sent_preds in zip(labels, preds) @@ -274,7 +339,8 @@ def compute_metrics(eval_pred): fold_macro_f1 = f1_score(true_labels_list, pred_labels_list, average="macro") print(f"Fold {fold} Macro F1: {fold_macro_f1:.4f}") - # 15) If this fold’s model is the best so far, save it + # 14) Save model checkpoint if this fold is the best so far + # This ensures we retain the model with highest validation performance if fold_macro_f1 > best_macro_f1: best_macro_f1 = fold_macro_f1 best_model_dir = os.path.join(output_dir, "best_model") @@ -284,14 +350,15 @@ def compute_metrics(eval_pred): fold += 1 - # 16) After all folds, report best fold‐score & load best model for final evaluation + # 15) Final summary after cross-validation + # Reports where the best model is saved and its macro F1 on fold validation data print(f"\nBest fold model saved at: {best_model_dir}, Macro F1 = {best_macro_f1:.4f}") - # 17) Final Evaluation on held-out val_df + # 16) Load best model and prepare for final evaluation on held-out set best_model = DistilBertCRFForTokenClassification.from_pretrained(best_model_dir) best_model.to(device) - # Build a fresh set of TrainingArguments that never runs evaluation epochs: + # Use new TrainingArguments to disable evaluation during predict final_args = TrainingArguments( output_dir=os.path.join(output_dir, "final_eval"), per_device_eval_batch_size=16, @@ -301,6 +368,8 @@ def compute_metrics(eval_pred): report_to="none", seed=RAND_STATE ) + + # Set up Trainer to run inference on hold-out set val_trainer = Trainer( model=best_model, args=final_args, @@ -308,7 +377,8 @@ def compute_metrics(eval_pred): data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer) # ← note: no eval_dataset here, because we’ll call .predict(...) manually ) - + + # 17) Run prediction on hold-out set and record inference time start_time = time.perf_counter() val_preds_logits, val_labels, _ = val_trainer.predict(tokenized_val) end_time = time.perf_counter() @@ -328,8 +398,7 @@ def compute_metrics(eval_pred): if l != -100 ] - # 18) Write hold-out predictions to CSV so that each row contains - # (tokens, true_tags, pred_tags) for sanity checking. 
+ # 18) Output predictions per row to CSV for inspection or error analysis from .distilbert_tagger import DistilBertTagger # Re-instantiate the exact same DistilBERT tagger we saved @@ -364,12 +433,13 @@ def compute_metrics(eval_pred): df["row_correct"] = df["true_tags"] == df["pred_tags"] id_level_acc = df["row_correct"].mean() - # Report inference speed + # Report evaluation metrics and timing info total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) total_examples = len(val_dataset) elapsed = end_time - start_time final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") final_accuracy = accuracy_score(flat_true, flat_pred) + print("\nFinal Evaluation on Held-Out Set:") with open('holdout_report.txt', 'w') as f: report = classification_report(flat_true, flat_pred) From bde70e21afc80ac5c3d762ebe45db7088dd1f5db Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Mon, 9 Jun 2025 23:16:50 -0400 Subject: [PATCH 47/51] Update readme with new arguments and data --- README.md | 201 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 166 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 2bf4b7f..fa1481b 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,180 @@ -# SCALAR Part-of-speech tagger +# SCALAR Part-of-Speech Tagger for Identifiers -THIS IS AN EXPERIMENTAL VERSION OF SCALAR +**SCALAR** is a part-of-speech tagger for source code identifiers. It supports two model types: -Install requirements via `pip install -r requirements.txt` +- **DistilBERT-based model with CRF layer** (Recommended: faster, more accurate) +- Legacy Gradient Boosting model (for compatibility) -Run via `python3 main --mode run --model_type lm_based` +--- -You can attempt to train it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage +## Installation -It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. +Make sure you have `python3.12` installed. 
Then:
+
+```bash
+git clone https://github.com/SCANL/scanl_tagger.git
+cd scanl_tagger
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+---
+
+## Usage
+
+You can run SCALAR in multiple ways:
+
+### CLI (with DistilBERT or GradientBoosting model)
+
+```bash
+python main --mode run --model_type lm_based    # DistilBERT (recommended)
+python main --mode run --model_type tree_based  # Legacy model
+```
+
+Then query like:
+
+```
+http://127.0.0.1:8080/GetValue/FUNCTION
+```
+
+Supports context types:
+- FUNCTION
+- CLASS
+- ATTRIBUTE
+- DECLARATION
+- PARAMETER
+
+---
+
+## Training
+
+You can retrain either model (default parameters are currently hardcoded):
+
+```bash
+python main --mode train --model_type lm_based
+python main --mode train --model_type tree_based
+```
+
+---
+
+## Evaluation Results
+
+### DistilBERT (LM-Based Model) — Recommended
+
+| Metric                  | Score  |
+|-------------------------|--------|
+| **Macro F1**            | 0.9032 |
+| **Token Accuracy**      | 0.9223 |
+| **Identifier Accuracy** | 0.8291 |
+
+| Label | Precision | Recall | F1   | Support |
+|-------|-----------|--------|------|---------|
+| CJ    | 0.88      | 0.88   | 0.88 | 8       |
+| D     | 0.98      | 0.96   | 0.97 | 52      |
+| DT    | 0.95      | 0.93   | 0.94 | 45      |
+| N     | 0.94      | 0.94   | 0.94 | 418     |
+| NM    | 0.91      | 0.93   | 0.92 | 440     |
+| NPL   | 0.97      | 0.97   | 0.97 | 79      |
+| P     | 0.94      | 0.92   | 0.93 | 79      |
+| PRE   | 0.79      | 0.79   | 0.79 | 68      |
+| V     | 0.89      | 0.84   | 0.86 | 110     |
+| VM    | 0.79      | 0.85   | 0.81 | 13      |
+
+**Inference Performance:**
+- Identifiers/sec: 225.8
+
+---
+
+### Gradient Boost Model (Legacy)
+
+| Metric             | Score   |
+|--------------------|---------|
+| Accuracy           | 0.8216  |
+| Balanced Accuracy  | 0.9160  |
+| Weighted Recall    | 0.8216  |
+| Weighted Precision | 0.8245  |
+| Weighted F1        | 0.8220  |
+| Inference Time     | 249.05s |
+
+**Inference Performance:**
+- Identifiers/sec: 8.6
+
+---
+
+## Supported Tagset
+
+| Tag | Meaning                        | Examples                    |
+|-----|--------------------------------|-----------------------------|
+| N   | Noun                           | `user`, `Data`, `Array`     |
+| DT  | Determiner                     | `this`, `that`, `those`     |
+| CJ  | Conjunction                    | `and`, `or`, `but`          |
+| P   | Preposition                    | `with`, `for`, `in`         |
+| NPL | Plural Noun                    | `elements`, `indices`       |
+| NM  | Noun Modifier (adjective-like) | `max`, `total`, `employee`  |
+| V   | Verb                           | `get`, `set`, `delete`      |
+| VM  | Verb Modifier (adverb-like)    | `quickly`, `deeply`         |
+| D   | Digit                          | `1`, `2`, `10`, `0xAF`      |
+| PRE | Preamble / Prefix              | `m`, `b`, `GL`, `p`         |
+
+---
+
+## Docker Support (Legacy only)
+
+For the legacy server, you can also
+
+## Docker Support (Legacy only)
+
+For the legacy server, you can also use Docker:
+
+```bash
+docker compose pull
+docker compose up
+```
+
+---
+
+## Notes
+
+- **Kebab case** is not supported (e.g., `do-something-cool`).
+- Feature and position tokens (e.g., `@pos_0`) are inserted automatically.
+- Internally uses [WordNet](https://wordnet.princeton.edu/) for lexical features.
+- Input must be parsed into identifier tokens. We recommend [srcML](https://www.srcml.org/) but any AST-based parser works.
+
+---
+
+## Citations
+
+Please cite:
+
+```
+@inproceedings{newman2025scalar,
+  author = {Christian Newman and Brandon Scholten and Sophia Testa and others},
+  title = {SCALAR: A Part-of-speech Tagger for Identifiers},
+  booktitle = {ICPC Tool Demonstrations Track},
+  year = {2025}
+}
+
+@article{newman2021ensemble,
+  title={An Ensemble Approach for Annotating Source Code Identifiers with Part-of-speech Tags},
+  author={Newman, Christian and Decker, Michael and AlSuhaibani, Reem and others},
+  journal={IEEE Transactions on Software Engineering},
+  year={2021},
+  doi={10.1109/TSE.2021.3098242}
+}
+```
+
+---
+
+## Training Data
+
+You can find the most recent SCALAR training dataset [here](https://github.com/SCANL/scanl_tagger/blob/master/input/tagger_data.tsv).
+
+---
+
+## More from SCANL
+
+- [SCANL Website](https://www.scanl.org/)
+- [Identifier Name Structure Catalogue](https://github.com/SCANL/identifier_name_structure_catalogue)
+
+---
+
+## Trouble?
+
+Please [open an issue](https://github.com/SCANL/scanl_tagger/issues) if you encounter problems!

From 89748a09bf25154a263fc39b6c3377e52e8968ca Mon Sep 17 00:00:00 2001
From: Christian Newman
Date: Tue, 10 Jun 2025 01:57:40 -0400
Subject: [PATCH 48/51] git workflow

---
 .github/workflows/tests.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 1586a95..3cad9e6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,7 +2,7 @@ name: SCALAR Tagger CI
 
 on:
   push:
-    branches: [ master, develop ]
+    branches: [ master, develop, distilbert ]
   pull_request:
     branches: [ master, develop ]
 
@@ -78,12 +78,12 @@ jobs:
 
     - name: Start tagger server
       run: |
-        ./main -r &
+        python main --mode run --model_type lm_based
         
         # Wait for up to 5 minutes for the service to start and load models
         timeout=300
         while [ $timeout -gt 0 ]; do
-          if curl -s "http://localhost:8080/cache/numberArray/DECLARATION" > /dev/null; then
+          if curl -s "http://localhost:8080/numberArray/DECLARATION" > /dev/null; then
             echo "Service is ready"
             break
           fi
@@ -101,7 +101,7 @@ jobs:
 
     - name: Test tagger endpoint
       run: |
-        response=$(curl -s "http://localhost:8080/cache/numberArray/DECLARATION")
+        response=$(curl -s "http://localhost:8080/numberArray/DECLARATION")
         if [ -z "$response" ]; then
           echo "No response from tagger"
           exit 1

From 28dd47c1c7bbf35362d380a1847425ce67b5fa99 Mon Sep 17 00:00:00 2001
From: Christian Newman
Date: Tue, 10 Jun 2025 10:24:56 -0400
Subject: [PATCH 49/51] Starting to see if I can get Docker up again.
Update requirements with nltk for tree_based model --- Dockerfile | 9 +-------- requirements.txt | 1 + setup.py | 2 +- version.py | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index b747297..fc3234c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,11 @@ FROM python:3.12-slim -#argument to enable GPU accelaration -ARG GPU=false - # Install (and build) requirements COPY requirements.txt /requirements.txt -COPY requirements_gpu.txt /requirements_gpu.txt RUN apt-get clean && rm -rf /var/lib/apt/lists/* && \ apt-get update --fix-missing && \ apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ - if [ "$GPU" = true ]; then \ - pip install -r requirements_gpu.txt; \ - fi && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY . . @@ -77,6 +70,6 @@ CMD date; \ fi; \ date; \ echo "Running..."; \ - /main -r --words words/abbreviationList.csv + /main --mode train --model_type lm_based --words words/abbreviationList.csv ENV TZ=US/Michigan diff --git a/requirements.txt b/requirements.txt index 7ed1a05..0eefe27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036d torch==2.7.1 waitress==3.0.2 gensim==4.3.3 +nltk==3.9.1 transformers[torch] diff --git a/setup.py b/setup.py index 2532143..d96cf39 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,6 @@ ], }, python_requires='>=3.12', - author="Christian Newman", + author="Christian Newman, Anthony Peruma, Brandon Scholten, Syreen Banabilah", description="A machine learning based tagger for source code analysis", ) \ No newline at end of file diff --git a/version.py b/version.py index 2cc2f7f..6003a5c 100644 --- a/version.py +++ b/version.py @@ -1,2 +1,2 @@ -__version__ = "2.1.0" # Changed to match docstring version +__version__ = "2.2.0" # Changed to match docstring version __version_info__ = tuple(int(num) for num in __version__.split(".")) \ No newline at end of file From 4ed9cd3f7e599c9d56cf24fb3f40c24a8780479e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Tue, 10 Jun 2025 10:39:57 -0400 Subject: [PATCH 50/51] add download_files() to lm execution flow for the way we are currently using cache-- this should probably be fixed --- main | 1 + 1 file changed, 1 insertion(+) diff --git a/main b/main index 536cb71..481d5ab 100755 --- a/main +++ b/main @@ -88,6 +88,7 @@ if __name__ == "__main__": start_server(temp_config=config) elif args.model_type == "lm_based": + download_files() if not args.local: start_server(temp_config={ 'script_dir': SCRIPT_DIR, From cd8d94e3cd2d77dd249b90fa7fe96a02bd69c59e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Tue, 10 Jun 2025 11:04:44 -0400 Subject: [PATCH 51/51] Forgot to run process in the bg for github actions --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3cad9e6..2f1eccb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -78,7 +78,7 @@ jobs: - name: Start tagger server run: | - python main --mode run --model_type lm_based + python main --mode run --model_type lm_based & # Wait for up to 5 minutes for the service to start and load models timeout=300