From 259dbb3453882f67e4ea80b1f94ffe5a3b9f58a4 Mon Sep 17 00:00:00 2001 From: SyreenBan Date: Wed, 5 Feb 2025 20:23:58 -0500 Subject: [PATCH 01/51] add split_by_capitals --- tag_identifier.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tag_identifier.py b/tag_identifier.py index 8195c4e..d781b1c 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -8,6 +8,7 @@ from waitress import serve from spiral import ronin import json +import re from create_models import createModel, stable_features, mutable_feature_list app = Flask(__name__) @@ -159,6 +160,11 @@ def start_server(temp_config = {}): serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) data.close() +def split_by_capitals(name: str): + matches = re.finditer(r'[A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+', name) + words = [match.group() for match in matches] + return words + def dictionary_lookup(word): #return true if the word exists in the dictionary (the nltk words corpus) #or if the word is in the list of approved words @@ -218,6 +224,9 @@ def listen(student, identifier_name: str, identifier_context: str) -> List[dict] # Split identifier_name into words words = identifier_name.split('_') + if (len(words) == 1 and identifier_name == words[0]): + words = split_by_capitals(identifier_name) + # # Create initial data frame data = pd.DataFrame({ 'WORD': words, From a8b4ba36a1906b886ed8d5420f1c6b5eb045f3c0 Mon Sep 17 00:00:00 2001 From: SyreenBan Date: Thu, 13 Feb 2025 12:28:33 -0500 Subject: [PATCH 02/51] fix the Splitter --- tag_identifier.py | 726 +++++++++++++++++++++++----------------------- 1 file changed, 359 insertions(+), 367 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index d781b1c..bb89017 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -1,367 +1,359 @@ -import os -import time -import joblib -import nltk -import pandas as pd -from feature_generator import * -from flask import Flask -from waitress import serve -from spiral import ronin -import json -import re -from create_models import createModel, stable_features, mutable_feature_list -app = Flask(__name__) - -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -class ModelData: - def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: - """ - Initialize an instance of the ModelData class with word vector models. - - Args: - ModelTokens: Word vectors model for tokens. - ModelMethods: Word vectors model for methods. - ModelGensimEnglish: Word vectors model for general English words. 
- """ - - self.ModelTokens = modelTokens - self.ModelMethods = modelMethods - self.ModelGensimEnglish = modelGensimEnglish - self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') - -class AppCache: - def __init__(self, Path, Filename) -> None: - self.Cache = {} - self.Path = Path - self.Filename = Filename - - def load(self): - if not os.path.isdir(self.Path): - raise Exception("Cannot load path: "+self.Path) - else: - if not os.path.isfile(self.Path+"/"+self.Filename): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump({}, JSONcache) - JSONcache.close() - JSONcache = open(self.Path+"/"+self.Filename, 'r') - self.Cache = json.load(JSONcache) - JSONcache.close() - - def add(self, identifier, result): - info = result - info.update({"firstEncounter": time.time()}) - info.update({"lastEncounter": time.time()}) - info.update({"count": 1}) - info.update({"version": "SCANL 1.0"}) - self.Cache.update({identifier : info}) - - def encounter(self, identifier): - self.Cache[identifier].update({"lastEncounter": time.time()}) - self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) - self.Cache[identifier].update({"version": "SCANL 1.0"}) - - def save(self): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump(self.Cache, JSONcache) - JSONcache.close() - -class WordList: - def __init__(self, Path): - self.Words = set() - self.Path = Path - - def load(self): - if not os.path.isfile(self.Path): - print("Could not find word list file!") - return - with open(self.Path) as file: - for line in file: - self.Words.add(line[:line.find(',')]) #stop at comma - - def find(self, item): - return item in self.Words - -def initialize_model(): - """ - Initialize and load word vectors for the application, and load a word count DataFrame. - - This function initializes and loads word vectors using the 'createModel' function, and loads word counts - from a JSON file into a Pandas DataFrame for use in the application. - - Returns: - tuple: (ModelData, WORD_COUNT DataFrame) - """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) - -def start_server(temp_config = {}): - """ - Initialize the model and start the server. - - This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using - the waitress `serve` method, allowing incoming HTTP requests to be handled. - - The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to - listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). 
- - Returns: - None - """ - print('initializing model...') - initialize_model() - - print("loading cache...") - if not os.path.isdir("cache"): os.mkdir("cache") - app.cache = AppCache("cache", "cache.json") - app.studentCache = AppCache("cache", "student_cache.json") - app.cache.load() - - print("loading dictionary...") - nltk.download("words") - app.english_words = set(w.lower() for w in nltk.corpus.words.words()) - #insert english words from words/en.txt - if not os.path.exists("words/en.txt"): - print("could not find English words, using WordNet only!") - else: - with open("words/en.txt") as words: - for word in words: - app.english_words.add(word[:-1]) - - print('retrieving server configuration...') - data = open('serve.json') - config = json.load(data) - - server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] - server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] - server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] - - print("loading word list...") - wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] - app.words = WordList(wordListPath) - app.words.load() - - print("Starting server...") - serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) - data.close() - -def split_by_capitals(name: str): - matches = re.finditer(r'[A-Z]+(?![a-z])|[A-Z][a-z]*|[a-z]+', name) - words = [match.group() for match in matches] - return words - -def dictionary_lookup(word): - #return true if the word exists in the dictionary (the nltk words corpus) - #or if the word is in the list of approved words - dictionaryType = "" - dictionary = word.lower() in app.english_words - acceptable = app.words.find(word) - digit = word.isnumeric() - if (dictionary): - dictionaryType = "DW" - elif (acceptable): - dictionaryType = "AW" - elif (digit): - dictionaryType = "DD" - else: - dictionaryType = "UC" - - return dictionaryType - -#TODO: this is not an intuitive way to save cache -@app.route('/') -def save(): - app.cache.save() - app.studentCache.save() - return "successfully saved cache" - -#TODO: use a query string instead for specifying student cache -@app.route('///') -def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: - #check if identifier name has already been used - cache = None; - - if (student == "student"): - cache = app.studentCache - else: - cache = app.cache - - if (identifier_name in cache.Cache.keys()): - cache.encounter(identifier_name) - return cache.Cache[identifier_name] - - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. 
- """ - print(f"INPUT: {identifier_name} {identifier_context}") - - # Split identifier_name into words - words = identifier_name.split('_') - - if (len(words) == 1 and identifier_name == words[0]): - words = split_by_capitals(identifier_name) - - # # Create initial data frame - data = pd.DataFrame({ - 'WORD': words, - 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number - }) - - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data - data = createFeatures( - data, - mutable_feature_list, - modelGensimEnglish=app.model_data.ModelGensimEnglish, - ) - - categorical_features = ['NLTK_POS'] - category_variables = [] - - for category_column in categorical_features: - if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier - clf = joblib.load(os.path.join(SCRIPT_DIR, 'output', 'model_GradientBoostingClassifier.pkl')) - predicted_tags = annotate_identifier(clf, data) - - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] - dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) - - # append result to cache - cache.add(identifier_name, result) - - return result - -def context_to_number(context): - """ - Convert a textual context description to a numerical representation. - - This function takes a context description as a string and maps it to a numerical representation according to a - predefined mapping. - - Args: - context (str): The textual context description. - - Returns: - int: The numerical representation of the context. - - Raises: - ValueError: If the provided context is not one of the predefined values. - - Example: - numeric_context = context_to_number("CLASS") - """ - if context == "ATTRIBUTE": - return 1 - elif context == "CLASS": - return 2 - elif context == "DECLARATION": - return 3 - elif context == "FUNCTION": - return 4 - elif context == "PARAMETER": - return 5 - -def annotate_identifier(clf, data): - """ - Annotate identifier tokens using a trained classifier. - - This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the - classifier to predict labels for the identifier tokens. - - Args: - clf (Classifier): The trained classifier model. - data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should - match the feature names used during training. - - Returns: - np.array: An array of predicted labels for the identifier tokens. 
- """ - # Drop unnecessary columns - data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') - - # Ensure only the features used during training are included - trained_features = clf.feature_names_in_ # Features expected by the classifier - missing_features = set(trained_features) - set(data.columns) - extra_features = set(data.columns) - set(trained_features) - - if missing_features: - raise ValueError(f"The following expected features are missing: {missing_features}") - if extra_features: - print(f"Warning: The following unused features are being ignored: {extra_features}") - data = data[trained_features] - - # Ensure feature order matches the trained model - df_features = data[trained_features] - - print("THESE") - print(df_features) - - print("THOSE") - print(clf.feature_names_in_) - - # Make predictions - y_pred = clf.predict(df_features) - return y_pred +import os +import time +import joblib +import nltk +import pandas as pd +from feature_generator import * +from flask import Flask +from waitress import serve +from spiral import ronin +import json +from create_models import createModel, stable_features, mutable_feature_list + +app = Flask(__name__) + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +class ModelData: + def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: + """ + Initialize an instance of the ModelData class with word vector models. + + Args: + ModelTokens: Word vectors model for tokens. + ModelMethods: Word vectors model for methods. + ModelGensimEnglish: Word vectors model for general English words. + """ + + self.ModelTokens = modelTokens + self.ModelMethods = modelMethods + self.ModelGensimEnglish = modelGensimEnglish + self.wordCount = wordCount + # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') + +class AppCache: + def __init__(self, Path, Filename) -> None: + self.Cache = {} + self.Path = Path + self.Filename = Filename + + def load(self): + if not os.path.isdir(self.Path): + raise Exception("Cannot load path: "+self.Path) + else: + if not os.path.isfile(self.Path+"/"+self.Filename): + JSONcache = open(self.Path+"/"+self.Filename, 'w') + json.dump({}, JSONcache) + JSONcache.close() + JSONcache = open(self.Path+"/"+self.Filename, 'r') + self.Cache = json.load(JSONcache) + JSONcache.close() + + def add(self, identifier, result): + info = result + info.update({"firstEncounter": time.time()}) + info.update({"lastEncounter": time.time()}) + info.update({"count": 1}) + info.update({"version": "SCANL 1.0"}) + self.Cache.update({identifier : info}) + + def encounter(self, identifier): + self.Cache[identifier].update({"lastEncounter": time.time()}) + self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) + self.Cache[identifier].update({"version": "SCANL 1.0"}) + + def save(self): + JSONcache = open(self.Path+"/"+self.Filename, 'w') + json.dump(self.Cache, JSONcache) + JSONcache.close() + +class WordList: + def __init__(self, Path): + self.Words = set() + self.Path = Path + + def load(self): + if not os.path.isfile(self.Path): + print("Could not find word list file!") + return + with open(self.Path) as file: + for line in file: + self.Words.add(line[:line.find(',')]) #stop at comma + + def find(self, item): + return item in self.Words + +def initialize_model(): + """ + Initialize and load word vectors for the application, and load a word count DataFrame. 
+ + This function initializes and loads word vectors using the 'createModel' function, and loads word counts + from a JSON file into a Pandas DataFrame for use in the application. + + Returns: + tuple: (ModelData, WORD_COUNT DataFrame) + """ + print("Loading word vectors!!") + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) + print("Word vectors loaded!!") + + # Load the word count JSON file into a DataFrame + word_count_path = os.path.join("input", "word_count.json") + if os.path.exists(word_count_path): + print(f"Loading word count data from {word_count_path}...") + word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() + word_count_df.columns = ['word', 'log_frequency'] + print("Word count data loaded!") + else: + print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") + word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) + + # Create and store model data + app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + +def start_server(temp_config = {}): + """ + Initialize the model and start the server. + + This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using + the waitress `serve` method, allowing incoming HTTP requests to be handled. + + The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to + listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). + + Returns: + None + """ + print('initializing model...') + initialize_model() + + print("loading cache...") + if not os.path.isdir("cache"): os.mkdir("cache") + app.cache = AppCache("cache", "cache.json") + app.studentCache = AppCache("cache", "student_cache.json") + app.cache.load() + + print("loading dictionary...") + nltk.download("words") + app.english_words = set(w.lower() for w in nltk.corpus.words.words()) + #insert english words from words/en.txt + if not os.path.exists("words/en.txt"): + print("could not find English words, using WordNet only!") + else: + with open("words/en.txt") as words: + for word in words: + app.english_words.add(word[:-1]) + + print('retrieving server configuration...') + data = open('serve.json') + config = json.load(data) + + server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] + server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] + server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] + + print("loading word list...") + wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] + app.words = WordList(wordListPath) + app.words.load() + + print("Starting server...") + serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) + data.close() + +def dictionary_lookup(word): + #return true if the word exists in the dictionary (the nltk words corpus) + #or if the word is in the list of approved words + dictionaryType = "" + dictionary = word.lower() in app.english_words + acceptable = app.words.find(word) + digit = word.isnumeric() + if (dictionary): + dictionaryType = "DW" + elif (acceptable): + dictionaryType = "AW" + elif (digit): + dictionaryType = "DD" + else: + dictionaryType = "UC" + + return dictionaryType + +#TODO: this is not an intuitive way to save cache +@app.route('/') +def save(): + app.cache.save() + app.studentCache.save() + return "successfully 
saved cache" + +#TODO: use a query string instead for specifying student cache +@app.route('///') +def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: + #check if identifier name has already been used + cache = None; + + if (student == "student"): + cache = app.studentCache + else: + cache = app.cache + + if (identifier_name in cache.Cache.keys()): + cache.encounter(identifier_name) + return cache.Cache[identifier_name] + + """ + Process a web request to analyze an identifier within a specific context. + + This route function takes two URL parameters (identifier_name, and identifier_context) from an + incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. + It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. + + Args: + identifier_name (str): The name of the identifier to be analyzed. + identifier_context (str): The context in which the identifier appears. + + Returns: + List[dict]: A list of dictionaries containing words and their predicted POS tags. + """ + print(f"INPUT: {identifier_name} {identifier_context}") + + # Split identifier_name into words + words = ronin.split(identifier_name) + + # # Create initial data frame + data = pd.DataFrame({ + 'WORD': words, + 'SPLIT_IDENTIFIER': ' '.join(words), + 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number + }) + + # create response JSON + # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) + result = { + "words" : [] + } + + # Add features to the data + data = createFeatures( + data, + mutable_feature_list, + modelGensimEnglish=app.model_data.ModelGensimEnglish, + ) + + categorical_features = ['NLTK_POS'] + category_variables = [] + + for category_column in categorical_features: + if category_column in data.columns: + category_variables.append(category_column) + data.loc[:, category_column] = data[category_column].astype(str) + + for category_column in category_variables: + # Explicitly handle categorical conversion + unique_values = data[category_column].unique() + category_map = {} + for value in unique_values: + if value in universal_to_custom: + category_map[value] = custom_to_numeric[universal_to_custom[value]] + else: + category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories + + data.loc[:, category_column] = data[category_column].map(category_map) + + # Convert categorical variables to numeric + # Load and apply the classifier + clf = joblib.load(os.path.join(SCRIPT_DIR, 'output', 'model_GradientBoostingClassifier.pkl')) + predicted_tags = annotate_identifier(clf, data) + + # Combine words and their POS tags into a parseable format + #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] + + for i in range(len(words)): + #check dictionary + dictionary = "UC" #uncategorized + word = words[i] + dictionary = dictionary_lookup(word) + result["words"].append( + { + words[i] : { + "tag" : predicted_tags[i], + "dictionary" : dictionary + } + } + ) + + # append result to cache + cache.add(identifier_name, result) + + return result + +def context_to_number(context): + """ + Convert a textual context description to a numerical representation. + + This function takes a context description as a string and maps it to a numerical representation according to a + predefined mapping. + + Args: + context (str): The textual context description. 
+ + Returns: + int: The numerical representation of the context. + + Raises: + ValueError: If the provided context is not one of the predefined values. + + Example: + numeric_context = context_to_number("CLASS") + """ + if context == "ATTRIBUTE": + return 1 + elif context == "CLASS": + return 2 + elif context == "DECLARATION": + return 3 + elif context == "FUNCTION": + return 4 + elif context == "PARAMETER": + return 5 + +def annotate_identifier(clf, data): + """ + Annotate identifier tokens using a trained classifier. + + This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the + classifier to predict labels for the identifier tokens. + + Args: + clf (Classifier): The trained classifier model. + data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should + match the feature names used during training. + + Returns: + np.array: An array of predicted labels for the identifier tokens. + """ + # Drop unnecessary columns + data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') + + # Ensure only the features used during training are included + trained_features = clf.feature_names_in_ # Features expected by the classifier + missing_features = set(trained_features) - set(data.columns) + extra_features = set(data.columns) - set(trained_features) + + if missing_features: + raise ValueError(f"The following expected features are missing: {missing_features}") + if extra_features: + print(f"Warning: The following unused features are being ignored: {extra_features}") + data = data[trained_features] + + # Ensure feature order matches the trained model + df_features = data[trained_features] + + print("THESE") + print(df_features) + + print("THOSE") + print(clf.feature_names_in_) + + # Make predictions + y_pred = clf.predict(df_features) + return y_pred From d91ed0d654d07af3eff4809f9b881ee0bcdbf92d Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 16 Feb 2025 01:09:30 -0500 Subject: [PATCH 03/51] Change ports in the readme --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1686629..7a5cd64 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,7 @@ options: `./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: -http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context} - -**NOTE: ** On docker, the port is 8080 instead of 5000. +http://127.0.0.1:8080/{cache_selection}/{identifier_name}/{code_context} "cache selection" will save results to a separate cache if it is set to "student" @@ -69,11 +67,11 @@ http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context} For example: -Tag a declaration: ``http://127.0.0.1:5000/cache/numberArray/DECLARATION`` +Tag a declaration: ``http://127.0.0.1:8080/cache/numberArray/DECLARATION`` -Tag a function: ``http://127.0.0.1:5000/cache/GetNumberArray/FUNCTION`` +Tag a function: ``http://127.0.0.1:8080/cache/GetNumberArray/FUNCTION`` -Tag an class: ``http://127.0.0.1:5000/cache/PersonRecord/CLASS`` +Tag an class: ``http://127.0.0.1:8080/cache/PersonRecord/CLASS`` #### Note Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun. 
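A minimal Python client sketch for the route documented in the patch above, assuming the server is reachable on the Docker default of 127.0.0.1:8080 and that responses carry the `words` structure built by `listen()` in `tag_identifier.py`:

```python
import json
from urllib.request import urlopen

# Tag a function name; the path is {cache_selection}/{identifier_name}/{code_context}.
with urlopen("http://127.0.0.1:8080/cache/GetNumberArray/FUNCTION") as response:
    result = json.load(response)

# Each entry maps a word to its predicted POS tag and its dictionary category
# (e.g. "DW" for dictionary word, "UC" for uncategorized).
for entry in result["words"]:
    for word, info in entry.items():
        print(word, info["tag"], info["dictionary"])
```
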
From cfea42ddc88c0a4c8232ff84376a30f88d7e5c16 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 16 Feb 2025 01:10:29 -0500 Subject: [PATCH 04/51] Forgot Git uses master and not main. Updated .yml to master. --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e57f85f..1586a95 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,9 +2,9 @@ name: SCALAR Tagger CI on: push: - branches: [ main, develop ] + branches: [ master, develop ] pull_request: - branches: [ main, develop ] + branches: [ master, develop ] jobs: test-docker: @@ -112,4 +112,4 @@ jobs: uses: actions/cache@v3 with: path: ~/.cache/gensim-data/fasttext-wiki-news-subwords-300* - key: ${{ runner.os }}-fasttext-model \ No newline at end of file + key: ${{ runner.os }}-fasttext-model From d730df7773a0e7c5406fe74aa26be94b14c38cdd Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 16 Feb 2025 11:17:24 -0500 Subject: [PATCH 05/51] Change dockerhub link for now --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a5cd64..83ead68 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ There are two ways to run the tagger. This document describes both ways. ## Getting Started with Docker -To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `srcml/scanl_tagger:latest` +To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` Make sure you have Docker and Docker Compose installed: https://docs.docker.com/engine/install/ From 40a72da0557e87d2ad289f8f2606d3e47915f60a Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 16 Mar 2025 00:11:07 -0400 Subject: [PATCH 06/51] Rewrite AppCache to use sqlite --- requirements.txt | 1 + tag_identifier.py | 143 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 113 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index 74c39c3..f8846a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ scikit_learn==1.3.0 scipy==1.10.1 git+https://github.com/cnewman/spiral.git waitress==2.1.2 +sqlite3 diff --git a/tag_identifier.py b/tag_identifier.py index bb89017..bb6b778 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -8,6 +8,7 @@ from waitress import serve from spiral import ronin import json +import sqlite3 from create_models import createModel, stable_features, mutable_feature_list app = Flask(__name__) @@ -30,41 +31,119 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.wordCount = wordCount # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') +#TODO: rewrite to use an SQL lite database +# class AppCache: +# def __init__(self, Path, Filename) -> None: +# self.Cache = {} +# self.Path = Path +# self.Filename = Filename + +# def load(self): +# if not os.path.isdir(self.Path): +# raise Exception("Cannot load path: "+self.Path) +# else: +# if not os.path.isfile(self.Path+"/"+self.Filename): +# JSONcache = open(self.Path+"/"+self.Filename, 'w') +# json.dump({}, JSONcache) +# JSONcache.close() +# JSONcache = open(self.Path+"/"+self.Filename, 'r') +# self.Cache = json.load(JSONcache) +# JSONcache.close() + +# def add(self, identifier, result): +# info = result +# info.update({"firstEncounter": time.time()}) +# 
info.update({"lastEncounter": time.time()}) +# info.update({"count": 1}) +# info.update({"version": "SCANL 1.0"}) +# self.Cache.update({identifier : info}) + +# def encounter(self, identifier): +# self.Cache[identifier].update({"lastEncounter": time.time()}) +# self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) +# self.Cache[identifier].update({"version": "SCANL 1.0"}) + +# def save(self): +# JSONcache = open(self.Path+"/"+self.Filename, 'w') +# json.dump(self.Cache, JSONcache) +# JSONcache.close() + +#TODO: context should probably be considered when saving tagged names class AppCache: - def __init__(self, Path, Filename) -> None: - self.Cache = {} - self.Path = Path - self.Filename = Filename - - def load(self): - if not os.path.isdir(self.Path): - raise Exception("Cannot load path: "+self.Path) - else: - if not os.path.isfile(self.Path+"/"+self.Filename): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump({}, JSONcache) - JSONcache.close() - JSONcache = open(self.Path+"/"+self.Filename, 'r') - self.Cache = json.load(JSONcache) - JSONcache.close() + def __init__(self, Path) -> None: + self.Path = Path #path to an SQL lite database + + def load(self): + #create connection to database + conn = sqlite3.connect(self.Path) + #create the table of names if it doesn't exist + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS names ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + words TEXT, -- this is a JSON string + firstEncounter INTEGER, + lastEncounter INTEGER, + count INTEGER + ) + ''') + #close the database connection + conn.commit() + conn.close() def add(self, identifier, result): - info = result - info.update({"firstEncounter": time.time()}) - info.update({"lastEncounter": time.time()}) - info.update({"count": 1}) - info.update({"version": "SCANL 1.0"}) - self.Cache.update({identifier : info}) + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #add identifier to table + record = { + "name": identifier, + "words": json.dumps(result["words"]), + "firstEncounter": time.time(), + "lastEncounter": time.time(), + "count": 1 + } + cursor.execute(''' + INSERT INTO names (name, words, firstEncounter, lastEncounter, count) + VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) + ''', record) + #close the database connection + conn.commit() + conn.close() + + def retrieve(self, identifier): + #return a dictionary of the name, or false if not in database + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", identifier) + row = cursor.fetchone() + + if row: + return { + "name": row[0], + "words": json.loads(rows[1]), + "firstEncounter": row[2], + "lastEncounter": row[3], + "count": row[4] + } + else: + return False def encounter(self, identifier): - self.Cache[identifier].update({"lastEncounter": time.time()}) - self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) - self.Cache[identifier].update({"version": "SCANL 1.0"}) - - def save(self): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump(self.Cache, JSONcache) - JSONcache.close() + currentCount = self.retrieve()["count"] + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #update record + cursor.execute(''' + UPDATE names + SET lastEncounter = ?, count = ? + WHERE name = ? 
+ ''', time.time(), currentCount+1, identifier) + #close connection + conn.commit() + conn.close() class WordList: def __init__(self, Path): @@ -186,10 +265,12 @@ def save(): return "successfully saved cache" #TODO: use a query string instead for specifying student cache +#TODO: update to save data to SQL lite instead of updating a JSON +# responses should still be sent in the JSON format @app.route('///') def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used - cache = None; + cache = None if (student == "student"): cache = app.studentCache From fb2ab83bccfe3f614d0eb46ee8eeedfd95615fcc Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 17 Mar 2025 12:54:19 -0400 Subject: [PATCH 07/51] Switch to sqlite --- tag_identifier.py | 123 ++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 63 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index bb6b778..2cf5325 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -29,44 +29,41 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.ModelMethods = modelMethods self.ModelGensimEnglish = modelGensimEnglish self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') - -#TODO: rewrite to use an SQL lite database -# class AppCache: -# def __init__(self, Path, Filename) -> None: -# self.Cache = {} -# self.Path = Path -# self.Filename = Filename - -# def load(self): -# if not os.path.isdir(self.Path): -# raise Exception("Cannot load path: "+self.Path) -# else: -# if not os.path.isfile(self.Path+"/"+self.Filename): -# JSONcache = open(self.Path+"/"+self.Filename, 'w') -# json.dump({}, JSONcache) -# JSONcache.close() -# JSONcache = open(self.Path+"/"+self.Filename, 'r') -# self.Cache = json.load(JSONcache) -# JSONcache.close() - -# def add(self, identifier, result): -# info = result -# info.update({"firstEncounter": time.time()}) -# info.update({"lastEncounter": time.time()}) -# info.update({"count": 1}) -# info.update({"version": "SCANL 1.0"}) -# self.Cache.update({identifier : info}) - -# def encounter(self, identifier): -# self.Cache[identifier].update({"lastEncounter": time.time()}) -# self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) -# self.Cache[identifier].update({"version": "SCANL 1.0"}) - -# def save(self): -# JSONcache = open(self.Path+"/"+self.Filename, 'w') -# json.dump(self.Cache, JSONcache) -# JSONcache.close() + +class CacheIndex: + def __init__(self, Path) -> None: + self.Path = Path + #create a table that just has a single column of cache IDs + conn = sqlite3.connect(Path) + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS caches ( + cache_id TEXT NOT NULL + ) + ''') + conn.commit() + conn.close() + + def add(self, cache_id): + #add cache_id to the table + conn = sqlite3(self.Path) + cursor = conn.cursor() + cursor.execute(''' + INSERT INTO caches (cache_id) VALUES (?) + ''', cache_id) + conn.commit() + conn.close() + + def isCacheExistent(self, cache_id): + conn = sqlite3(self.Path) + cursor = conn.cursor() + cursor.execute(''' + SELECT cache_id FROM caches WHERE cache_id = ? 
+ ''') + row = cursor.fetchone() + if row: return True + else: return False + #TODO: context should probably be considered when saving tagged names class AppCache: @@ -118,6 +115,7 @@ def retrieve(self, identifier): cursor = conn.cursor() cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", identifier) row = cursor.fetchone() + conn.close() if row: return { @@ -205,11 +203,8 @@ def start_server(temp_config = {}): print('initializing model...') initialize_model() - print("loading cache...") - if not os.path.isdir("cache"): os.mkdir("cache") - app.cache = AppCache("cache", "cache.json") - app.studentCache = AppCache("cache", "student_cache.json") - app.cache.load() + print("setting up cache...") + app.caches = {} print("loading dictionary...") nltk.download("words") @@ -258,28 +253,30 @@ def dictionary_lookup(word): return dictionaryType #TODO: this is not an intuitive way to save cache -@app.route('/') -def save(): - app.cache.save() - app.studentCache.save() - return "successfully saved cache" - -#TODO: use a query string instead for specifying student cache -#TODO: update to save data to SQL lite instead of updating a JSON -# responses should still be sent in the JSON format -@app.route('///') -def listen(student, identifier_name: str, identifier_context: str) -> List[dict]: +# @app.route('/') +# def save(): +# app.cache.save() +# app.studentCache.save() +# return "successfully saved cache" + +#TODO: caches should be saved in an SQL lite database +@app.route('///') +def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used cache = None - - if (student == "student"): - cache = app.studentCache - else: - cache = app.cache - - if (identifier_name in cache.Cache.keys()): - cache.encounter(identifier_name) - return cache.Cache[identifier_name] + + #find the existing cache in app.caches or create a new one if it doesn't exist + if cache_id in app.caches: + cache = app.caches[cache_id] + #check if the identifier name is in this cache and return it if so + data = cache.retrieve(identifier_name) + if data != False: + return data + else: + #create the cache and add it to the dictionary of caches + cache = AppCache("cache/"+cache_id+".db") + cache.load() + app.caches[cache_id] = cache """ Process a web request to analyze an identifier within a specific context. From 6c36a6a2005630cac73fc72ff707444aeb8a5d33 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Fri, 21 Mar 2025 18:24:30 -0400 Subject: [PATCH 08/51] Finish initial sqlite implementation --- tag_identifier.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index 2cf5325..6692356 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -59,7 +59,7 @@ def isCacheExistent(self, cache_id): cursor = conn.cursor() cursor.execute(''' SELECT cache_id FROM caches WHERE cache_id = ? 
- ''') + ''', cache_id) row = cursor.fetchone() if row: return True else: return False @@ -204,7 +204,7 @@ def start_server(temp_config = {}): initialize_model() print("setting up cache...") - app.caches = {} + app.cacheIndex = CacheIndex('index.db') print("loading dictionary...") nltk.download("words") @@ -252,23 +252,15 @@ def dictionary_lookup(word): return dictionaryType -#TODO: this is not an intuitive way to save cache -# @app.route('/') -# def save(): -# app.cache.save() -# app.studentCache.save() -# return "successfully saved cache" - -#TODO: caches should be saved in an SQL lite database +#caches should be saved in an SQL lite database @app.route('///') def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used cache = None - #find the existing cache in app.caches or create a new one if it doesn't exist - if cache_id in app.caches: - cache = app.caches[cache_id] + if app.cacheIndex.isCacheExistent(cache_id): #check if the identifier name is in this cache and return it if so + cache = AppCache("cache/"+cache_id+".db") data = cache.retrieve(identifier_name) if data != False: return data @@ -276,7 +268,7 @@ def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict #create the cache and add it to the dictionary of caches cache = AppCache("cache/"+cache_id+".db") cache.load() - app.caches[cache_id] = cache + app.cacheIndex.add(cache_id) """ Process a web request to analyze an identifier within a specific context. From 38cbb03651fac60019b44dd9ddc7a3bf5aeab4be Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Fri, 21 Mar 2025 19:44:39 -0400 Subject: [PATCH 09/51] Remove sqlite3 from requirements --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f8846a2..74c39c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,3 @@ scikit_learn==1.3.0 scipy==1.10.1 git+https://github.com/cnewman/spiral.git waitress==2.1.2 -sqlite3 From 4596af05c41bd39659d9b809cc8635a966cf1bac Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 23 Mar 2025 15:43:50 -0400 Subject: [PATCH 10/51] Fix bugs --- .gitignore | 2 +- requirements.txt | 1 + tag_identifier.py | 18 ++++++++++-------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index cb28750..700916a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ output/ __pycache__/ code2vec/ cache/ -input.txt \ No newline at end of file +input.txt diff --git a/requirements.txt b/requirements.txt index 74c39c3..4caddc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ scikit_learn==1.3.0 scipy==1.10.1 git+https://github.com/cnewman/spiral.git waitress==2.1.2 +protobuf==3.20.3 diff --git a/tag_identifier.py b/tag_identifier.py index 6692356..7db152b 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -46,20 +46,21 @@ def __init__(self, Path) -> None: def add(self, cache_id): #add cache_id to the table - conn = sqlite3(self.Path) + conn = sqlite3.connect(self.Path) cursor = conn.cursor() + #cache_id needs to be by itself in a tuple for some reason? otherwise sqlite freaks out idk cursor.execute(''' INSERT INTO caches (cache_id) VALUES (?) - ''', cache_id) + ''', (cache_id,)) conn.commit() conn.close() def isCacheExistent(self, cache_id): - conn = sqlite3(self.Path) + conn = sqlite3.connect(self.Path) cursor = conn.cursor() cursor.execute(''' SELECT cache_id FROM caches WHERE cache_id = ? 
- ''', cache_id) + ''', (cache_id,)) row = cursor.fetchone() if row: return True else: return False @@ -113,14 +114,14 @@ def retrieve(self, identifier): #return a dictionary of the name, or false if not in database conn = sqlite3.connect(self.Path) cursor = conn.cursor() - cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", identifier) + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) row = cursor.fetchone() conn.close() if row: return { "name": row[0], - "words": json.loads(rows[1]), + "words": json.loads(row[1]), "firstEncounter": row[2], "lastEncounter": row[3], "count": row[4] @@ -204,7 +205,8 @@ def start_server(temp_config = {}): initialize_model() print("setting up cache...") - app.cacheIndex = CacheIndex('index.db') + if not os.path.exists('cache'): os.mkdir('cache') + app.cacheIndex = CacheIndex('cache/index.db') print("loading dictionary...") nltk.download("words") @@ -254,7 +256,7 @@ def dictionary_lookup(word): #caches should be saved in an SQL lite database @app.route('///') -def listen(cache_id, identifier_name: str, identifier_context: str) -> List[dict]: +def listen(cache_id: str, identifier_name: str, identifier_context: str) -> List[dict]: #check if identifier name has already been used cache = None #find the existing cache in app.caches or create a new one if it doesn't exist From 455d5b57e8f9b2c3edbabd4d4e56794ec1db15be Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 23 Mar 2025 15:48:14 -0400 Subject: [PATCH 11/51] Add restart always to compose.yml --- compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/compose.yml b/compose.yml index 63cd5f1..30a003a 100644 --- a/compose.yml +++ b/compose.yml @@ -20,3 +20,4 @@ services: - words:/words ports: - "${PORT-8080}:5000" + restart: always From bec20b9675d8c6d4704a2815c23e7836ac6cfad0 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 23 Mar 2025 19:47:41 -0400 Subject: [PATCH 12/51] Attempt at optional cache, broke everything --- tag_identifier.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index 7db152b..570e57d 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -9,7 +9,7 @@ from spiral import ronin import json import sqlite3 -from create_models import createModel, stable_features, mutable_feature_list +from create_models import createModel, mutable_feature_list app = Flask(__name__) @@ -255,22 +255,24 @@ def dictionary_lookup(word): return dictionaryType #caches should be saved in an SQL lite database -@app.route('///') -def listen(cache_id: str, identifier_name: str, identifier_context: str) -> List[dict]: +@app.route('//') +@app.route('///') +def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> List[dict]: #check if identifier name has already been used cache = None #find the existing cache in app.caches or create a new one if it doesn't exist - if app.cacheIndex.isCacheExistent(cache_id): - #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db") - data = cache.retrieve(identifier_name) - if data != False: - return data - else: - #create the cache and add it to the dictionary of caches - cache = AppCache("cache/"+cache_id+".db") - cache.load() - app.cacheIndex.add(cache_id) + if cache_id != None: + if app.cacheIndex.isCacheExistent(cache_id): + #check if the identifier name is in this cache 
and return it if so + cache = AppCache("cache/"+cache_id+".db") + data = cache.retrieve(identifier_name) + if data != False: + return data + else: + #create the cache and add it to the dictionary of caches + cache = AppCache("cache/"+cache_id+".db") + cache.load() + app.cacheIndex.add(cache_id) """ Process a web request to analyze an identifier within a specific context. @@ -354,7 +356,8 @@ def listen(cache_id: str, identifier_name: str, identifier_context: str) -> List ) # append result to cache - cache.add(identifier_name, result) + if cache_id != None: + cache.add(identifier_name, result) return result From f982cf42f8555a30ea8cec2e26f70df860d23bb6 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Wed, 26 Mar 2025 06:45:53 -0400 Subject: [PATCH 13/51] Fix count --- tag_identifier.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index 570e57d..897d09a 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -130,7 +130,7 @@ def retrieve(self, identifier): return False def encounter(self, identifier): - currentCount = self.retrieve()["count"] + currentCount = self.retrieve(identifier)["count"] #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() @@ -139,7 +139,7 @@ def encounter(self, identifier): UPDATE names SET lastEncounter = ?, count = ? WHERE name = ? - ''', time.time(), currentCount+1, identifier) + ''', (time.time(), currentCount+1, identifier)) #close connection conn.commit() conn.close() @@ -265,6 +265,7 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) if app.cacheIndex.isCacheExistent(cache_id): #check if the identifier name is in this cache and return it if so cache = AppCache("cache/"+cache_id+".db") + cache.encounter(identifier_name) data = cache.retrieve(identifier_name) if data != False: return data From 9e62aa7c37ab16e6de19b86689a40861d435d343 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Wed, 26 Mar 2025 10:23:56 -0400 Subject: [PATCH 14/51] Fix encounter --- tag_identifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tag_identifier.py b/tag_identifier.py index 897d09a..c60adf2 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -265,9 +265,9 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) if app.cacheIndex.isCacheExistent(cache_id): #check if the identifier name is in this cache and return it if so cache = AppCache("cache/"+cache_id+".db") - cache.encounter(identifier_name) data = cache.retrieve(identifier_name) if data != False: + cache.encounter(identifier_name) return data else: #create the cache and add it to the dictionary of caches From bf8d0e500283a629f9ae4ac2f33551592f5c0552 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 29 Mar 2025 13:16:48 -0400 Subject: [PATCH 15/51] Remove use of CacheIndex, add probe route --- tag_identifier.py | 56 +++++++++++------------------------------------ 1 file changed, 13 insertions(+), 43 deletions(-) diff --git a/tag_identifier.py b/tag_identifier.py index c60adf2..2f94e3c 100644 --- a/tag_identifier.py +++ b/tag_identifier.py @@ -30,42 +30,6 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.ModelGensimEnglish = modelGensimEnglish self.wordCount = wordCount -class CacheIndex: - def __init__(self, Path) -> None: - self.Path = Path - #create a table that just has a single column of cache IDs - conn = sqlite3.connect(Path) - cursor = conn.cursor() - cursor.execute(''' - CREATE TABLE IF NOT 
EXISTS caches ( - cache_id TEXT NOT NULL - ) - ''') - conn.commit() - conn.close() - - def add(self, cache_id): - #add cache_id to the table - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - #cache_id needs to be by itself in a tuple for some reason? otherwise sqlite freaks out idk - cursor.execute(''' - INSERT INTO caches (cache_id) VALUES (?) - ''', (cache_id,)) - conn.commit() - conn.close() - - def isCacheExistent(self, cache_id): - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - cursor.execute(''' - SELECT cache_id FROM caches WHERE cache_id = ? - ''', (cache_id,)) - row = cursor.fetchone() - if row: return True - else: return False - - #TODO: context should probably be considered when saving tagged names class AppCache: def __init__(self, Path) -> None: @@ -206,7 +170,6 @@ def start_server(temp_config = {}): print("setting up cache...") if not os.path.exists('cache'): os.mkdir('cache') - app.cacheIndex = CacheIndex('cache/index.db') print("loading dictionary...") nltk.download("words") @@ -254,7 +217,15 @@ def dictionary_lookup(word): return dictionaryType -#caches should be saved in an SQL lite database +#route to check for and create a database if it does not exist already +@app.route('/probe/') +def probe(cache_id: str): + if os.path.exists("cache/"+cache_id+".db3"): + return "Opening existing identifier database..." + else: + return "First request will create identifier database: "+cache_id+"..." + +#route to tag an identifier name @app.route('//') @app.route('///') def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> List[dict]: @@ -262,18 +233,17 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) cache = None #find the existing cache in app.caches or create a new one if it doesn't exist if cache_id != None: - if app.cacheIndex.isCacheExistent(cache_id): + if os.path.exists("cache/"+cache_id+".db3"): #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db") + cache = AppCache("cache/"+cache_id+".db3") data = cache.retrieve(identifier_name) if data != False: cache.encounter(identifier_name) return data else: - #create the cache and add it to the dictionary of caches - cache = AppCache("cache/"+cache_id+".db") + #create the cache + cache = AppCache("cache/"+cache_id+".db3") cache.load() - app.cacheIndex.add(cache_id) """ Process a web request to analyze an identifier within a specific context. From 0e5df4ef54bc89caf628a68fd1935e9ed70eb006 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 21 Apr 2025 13:20:44 -0400 Subject: [PATCH 16/51] Update documentation --- Dockerfile | 4 ++-- README.md | 12 +++++++----- serve.json | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index b9d7ed1..3c31e07 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,8 +2,8 @@ FROM python:3.10-slim # Install (and build) requirements COPY requirements.txt /requirements.txt -RUN apt-get update && \ - apt-get install -y git curl && \ +RUN apt-get update --fix-missing && \ + apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ rm -rf /var/lib/apt/lists/* diff --git a/README.md b/README.md index 859dd10..f7f235b 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ Conosider configuring `PYTHONPATH` as well: export PYTHONPATH=~/path/to/scanl_tagger -Finally, you need to install Spiral, which we use for identifier splitting. 
The current version of Spiral on the official repo has a [problem](https://github.com/casics/spiral/issues/4), so consider installing the one from the link below: - - sudo pip3 install git+https://github.com/cnewman/spiral.git +Install dependencies by running `pip3 install -r requirements.txt` in the root of the repository. Finally, we require the `token` and `target` vectors from [code2vec](https://github.com/tech-srl/code2vec). The tagger will attempt to automatically download them if it doesn't find them, but you could download them yourself if you like. It will place them in your local directory under `./code2vec/*` @@ -50,9 +48,13 @@ options: `./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: -http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context} +http://127.0.0.1:5000/{identifier_name}/{code_context}/{database_name (optional)} + +"database name" specifies an sqlite database to be used for result caching and data collection. If the database specified does not exist, one will be created. + +You can check wehther or not a database exists by using the `/probe` route by sending an HTTP request like this: -"cache selection" will save results to a separate cache if it is set to "student" +http://127.0.0.1:5000/probe/{database_name} "code context" is one of: - FUNCTION diff --git a/serve.json b/serve.json index 84e15c0..261db0b 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", "port": 5000, - "protocol": "https", + "protocol": "http", "words":"" -} \ No newline at end of file +} From 11e45a78d7dc7019e6414c22ff71e4286990c720 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 23 Apr 2025 21:03:04 -0400 Subject: [PATCH 17/51] Create LICENSE --- LICENSE | 674 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 674 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. 
Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. 
In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. From 041c103064e4bf6738461b2a3fd37c3f0b439848 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 23 Apr 2025 21:03:50 -0400 Subject: [PATCH 18/51] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 83ead68..f7794ce 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,12 @@ There are two ways to run the tagger. This document describes both ways. 
## Getting Started with Docker -To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` +To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` Make sure you have Docker and Docker Compose installed: + https://docs.docker.com/engine/install/ + https://docs.docker.com/compose/install/ ``` From 23d28e67f053f062f0f08c37316155d37790f82e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 27 Apr 2025 22:58:03 -0400 Subject: [PATCH 19/51] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index f7794ce..9c8ee5e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,14 @@ # SCALAR Part-of-speech tagger This the official release of the SCALAR Part-of-speech tagger +# Current Metrics (this will be updated every time we update/change the model!) +| | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | +|------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| +| **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | +| Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | +| Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | + + There are two ways to run the tagger. This document describes both ways. 1. Using Docker compose (which runs the tagger's built-in server for you) From 5a398c54b19e98cd8d0a8bb75b4a5ac1199787e0 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 27 Apr 2025 22:59:11 -0400 Subject: [PATCH 20/51] Update README.md --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9c8ee5e..99dcb9d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # SCALAR Part-of-speech tagger This the official release of the SCALAR Part-of-speech tagger +There are two ways to run the tagger. This document describes both ways. + +1. Using Docker compose (which runs the tagger's built-in server for you) +2. Running the tagger's built-in server without Docker + # Current Metrics (this will be updated every time we update/change the model!) | | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | |------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| @@ -8,12 +13,6 @@ This the official release of the SCALAR Part-of-speech tagger | Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | | Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | - -There are two ways to run the tagger. This document describes both ways. - -1. Using Docker compose (which runs the tagger's built-in server for you) -2. 
Running the tagger's built-in server without Docker - ## Getting Started with Docker To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` From 76933c7a64457d5db60d50d22e899d7d0ea64ecb Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 27 Apr 2025 22:59:48 -0400 Subject: [PATCH 21/51] Update header level --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 99dcb9d..c384bae 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ There are two ways to run the tagger. This document describes both ways. 1. Using Docker compose (which runs the tagger's built-in server for you) 2. Running the tagger's built-in server without Docker -# Current Metrics (this will be updated every time we update/change the model!) +## Current Metrics (this will be updated every time we update/change the model!) | | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | |------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| | **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | From 3fce0f676c581313d8b13f674922b30a933a4bb6 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 3 May 2025 17:20:26 -0400 Subject: [PATCH 22/51] Resolve merge conflicts --- README.md | 95 +++++++++++--- requirements.txt | 118 +++++++++++++++--- serve.json | 4 +- .../classifier_multiclass.py | 0 create_models.py => src/create_models.py | 0 .../download_code2vec_vectors.py | 0 .../feature_generator.py | 0 tag_identifier.py => src/tag_identifier.py | 0 8 files changed, 183 insertions(+), 34 deletions(-) rename classifier_multiclass.py => src/classifier_multiclass.py (100%) rename create_models.py => src/create_models.py (100%) rename download_code2vec_vectors.py => src/download_code2vec_vectors.py (100%) rename feature_generator.py => src/feature_generator.py (100%) rename tag_identifier.py => src/tag_identifier.py (100%) diff --git a/README.md b/README.md index f7f235b..79c0453 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,45 @@ # SCALAR Part-of-speech tagger This the official release of the SCALAR Part-of-speech tagger -**NOTE** -There is a fork of SCALAR which was designed to handle parallel http requests and cache SCALAR's output to increase its speed. You can find this version here: https://github.com/brandonscholten/scanl_tagger. These will be combined into a single application in the *very* near future. +There are two ways to run the tagger. This document describes both ways. + +1. Using Docker compose (which runs the tagger's built-in server for you) +2. Running the tagger's built-in server without Docker + +## Current Metrics (this will be updated every time we update/change the model!) 
+| | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | +|------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| +| **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | +| Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | +| Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | ## Getting Started with Docker -To run SCNL tagger in a Docker container you can clone the repository and pull the latest docker impage from `srcml/scanl_tagger:latest` +To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` + +Make sure you have Docker and Docker Compose installed: + +https://docs.docker.com/engine/install/ + +https://docs.docker.com/compose/install/ ``` -git clone https://github.com/brandonscholten/scanl_tagger.git +git clone git@github.com:SCANL/scanl_tagger.git cd scanl_tagger docker compose pull docker compose up ``` -## Setup and Run -You will need `python3.10` installed. +## Getting Started without Docker +You will need `python3.12` installed. -You'll need to install `pip3` +You'll need to install `pip` -- https://pip.pypa.io/en/stable/installation/ -Conosider configuring `PYTHONPATH` as well: +Set up a virtual environtment: `python -m venv /tmp/tagger` -- feel free to put it somewhere else (change /tmp/tagger) if you prefer - export PYTHONPATH=~/path/to/scanl_tagger +Activate the virtual environment: `source /tmp/tagger/bin/activate` (you can find how to activate it here if `source` does not work for you -- https://docs.python.org/3/library/venv.html#how-venvs-work) -Install dependencies by running `pip3 install -r requirements.txt` in the root of the repository. +After it's installed and your virtual environment is activated, in the root of the repo, run `pip install -r requirements.txt` Finally, we require the `token` and `target` vectors from [code2vec](https://github.com/tech-srl/code2vec). The tagger will attempt to automatically download them if it doesn't find them, but you could download them yourself if you like. It will place them in your local directory under `./code2vec/*` @@ -48,7 +63,7 @@ options: `./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: -http://127.0.0.1:5000/{identifier_name}/{code_context}/{database_name (optional)} +http://127.0.0.1:8080/{identifier_name}/{code_context}/{database_name (optional)} "database name" specifies an sqlite database to be used for result caching and data collection. If the database specified does not exist, one will be created. @@ -65,26 +80,73 @@ http://127.0.0.1:5000/probe/{database_name} For example: -Tag a declaration: ``http://127.0.0.1:5000/cache/numberArray/DECLARATION`` +Tag a declaration: ``http://127.0.0.1:8000/cache/numberArray/DECLARATION`` -Tag a function: ``http://127.0.0.1:5000/cache/GetNumberArray/FUNCTION`` +Tag a function: ``http://127.0.0.1:8000/cache/GetNumberArray/FUNCTION`` -Tag an class: ``http://127.0.0.1:5000/cache/PersonRecord/CLASS`` +Tag an class: ``http://127.0.0.1:8000/cache/PersonRecord/CLASS`` #### Note Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun. 
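
As a rough illustration of what the splitter does (a minimal sketch, assuming the Spiral fork listed in `requirements.txt` is installed; exact splits depend on ronin's internal frequency tables), the tagger relies on `ronin.split` to break identifiers into words before tagging:

```
from spiral import ronin

print(ronin.split("GetNumberArray"))  # e.g. ['Get', 'Number', 'Array']
print(ronin.split("employee_name"))   # e.g. ['employee', 'name']
```

Kebab-case identifiers (e.g. `number-array`) are the case the note above warns about: they are not split into separate words and come back as a single entry.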
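
If you want to script requests against a running server, a minimal client sketch is below. It assumes the server is reachable on the address and port configured in `serve.json` (8080 by default in this repository) and uses the `requests` package from `requirements.txt`; the function and variable names here are illustrative, not part of the tagger itself.

```
import requests

BASE_URL = "http://127.0.0.1:8080"  # match the address/port in serve.json

def tag_identifier(name, context, database=None):
    """Ask the running tagger to tag one identifier in a given code context."""
    url = f"{BASE_URL}/{name}/{context}"
    if database is not None:
        # optional third segment: an sqlite database used for caching/data collection
        url = f"{url}/{database}"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()

print(tag_identifier("GetNumberArray", "FUNCTION"))
```

You can check for an existing database in the same way before reusing a name, e.g. `requests.get(f"{BASE_URL}/probe/mydb")`, where `mydb` stands in for whatever database name you plan to use.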
You will need to have a way to parse code and filter out identifier names if you want to do some on-the-fly analysis of source code. We recommend [srcML](https://www.srcml.org/). Since the actual tagger is a web server, you don't have to use srcML. You could always use other AST-based code representations, or any other method of obtaining identifier information. + +## Tagset + +**Supported Tagset** +| Abbreviation | Expanded Form | Examples | +|:------------:|:--------------------------------------------:|:--------------------------------------------:| +| N | noun | Disneyland, shoe, faucet, mother | +| DT | determiner | the, this, that, these, those, which | +| CJ | conjunction | and, for, nor, but, or, yet, so | +| P | preposition | behind, in front of, at, under, above | +| NPL | noun plural | Streets, cities, cars, people, lists | +| NM | noun modifier (**noun-adjunct**, adjective) | red, cold, hot, **bit**Set, **employee**Name | +| V | verb | Run, jump, spin, | +| VM | verb modifier (adverb) | Very, loudly, seriously, impatiently | +| D | digit | 1, 2, 10, 4.12, 0xAF | +| PRE | preamble | Gimp, GLEW, GL, G, p, m, b | + +**Penn Treebank to SCALAR tagset** + +| Penn Treebank Annotation | SCALAR Tagset | +|:---------------------------:|:------------------------:| +| Conjunction (CC) | Conjunction (CJ) | +| Digit (CD) | Digit (D) | +| Determiner (DT) | Determiner (DT) | +| Foreign Word (FW) | Noun (N) | +| Preposition (IN) | Preposition (P) | +| Adjective (JJ) | Noun Modifier (NM) | +| Comparative Adjective (JJR) | Noun Modifier (NM) | +| Superlative Adjective (JJS) | Noun Modifier (NM) | +| List Item (LS) | Noun (N) | +| Modal (MD) | Verb (V) | +| Noun Singular (NN) | Noun (N) | +| Proper Noun (NNP) | Noun (N) | +| Proper Noun Plural (NNPS) | Noun Plural (NPL) | +| Noun Plural (NNS) | Noun Plural (NPL) | +| Adverb (RB) | Verb Modifier (VM) | +| Comparative Adverb (RBR) | Verb Modifier (VM) | +| Particle (RP) | Verb Modifier (VM) | +| Symbol (SYM) | Noun (N) | +| To Preposition (TO) | Preposition (P) | +| Verb (VB) | Verb (V) | +| Verb (VBD) | Verb (V) | +| Verb (VBG) | Verb (V) | +| Verb (VBN) | Verb (V) | +| Verb (VBP) | Verb (V) | +| Verb (VBZ) | Verb (V) | + ## Training the tagger You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This will potentially change in the future. ## Errors? Please make an issue if you run into errors -# Please Cite the Paper! +# Please Cite the Paper(s)! -No paper for now however the current tagger is based on our previous, so you could cite the previous one for now: +Newman, Christian, Scholten , Brandon, Testa, Sophia, Behler, Joshua, Banabilah, Syreen, Collard, Michael L., Decker, Michael, Mkaouer, Mohamed Wiem, Zampieri, Marcos, Alomar, Eman Abdullah, Alsuhaibani, Reem, Peruma, Anthony, Maletic, Jonathan I., (2025), “SCALAR: A Part-of-speech Tagger for Identifiers”, in the Proceedings of the 33rd IEEE/ACM International Conference on Program Comprehension - Tool Demonstrations Track (ICPC), Ottawa, ON, Canada, April 27 -28, 5 pages TO APPEAR. Christian D. Newman, Michael J. Decker, Reem S. AlSuhaibani, Anthony Peruma, Satyajit Mohapatra, Tejal Vishnoi, Marcos Zampieri, Mohamed W. Mkaouer, Timothy J. 
Sheldon, and Emily Hill, "An Ensemble Approach for Annotating Source Code Identifiers with Part-of-speech Tags," in IEEE Transactions on Software Engineering, doi: 10.1109/TSE.2021.3098242. @@ -98,4 +160,3 @@ Find our other research [at our webpage](https://www.scanl.org/) and check out t This project uses WordNet to perform a dictionary lookup on the individual words in each identifier: Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010 - diff --git a/requirements.txt b/requirements.txt index 4caddc9..51e31b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,104 @@ -utils==1.0.1 -flair==0.14.0 +accelerate==1.3.0 +attrs==25.1.0 +beautifulsoup4==4.12.3 +bioc==2.1 +blinker==1.9.0 +boto3==1.36.6 +botocore==1.36.6 +certifi==2024.12.14 +charset-normalizer==3.4.1 +click==8.1.8 +conllu==4.5.3 +contourpy==1.3.1 +cycler==0.12.1 +Deprecated==1.2.17 +docopt==0.6.2 +filelock==3.17.0 +flair==0.15.0 Flask==3.1.0 -gensim==4.3.1 -imbalanced_learn==0.12.2 -imblearn==0.0 -joblib==1.3.1 -nltk==3.8.1 -numpy==1.25.1 -pandas==2.0.3 -Requests==2.32.3 -scikit_learn==1.3.0 -scipy==1.10.1 -git+https://github.com/cnewman/spiral.git -waitress==2.1.2 -protobuf==3.20.3 +fonttools==4.55.6 +fsspec==2024.12.0 +ftfy==6.3.1 +gdown==5.2.0 +gensim==4.3.3 +huggingface-hub==0.27.1 +humanize==4.11.0 +idna==3.10 +iniconfig==2.0.0 +intervaltree==3.1.0 +itsdangerous==2.2.0 +Jinja2==3.1.5 +jmespath==1.0.1 +joblib==1.4.2 +jsonlines==4.0.0 +kiwisolver==1.4.8 +langdetect==1.0.9 +lxml==5.3.0 +MarkupSafe==3.0.2 +matplotlib==3.10.0 +more-itertools==10.6.0 +mpld3==0.5.10 +mpmath==1.3.0 +networkx==3.4.2 +nltk==3.9.1 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +packaging==24.2 +pandas==2.2.3 +pillow==11.1.0 +plac==1.4.3 +pluggy==1.5.0 +pptree==3.1 +protobuf==5.29.3 +psutil==6.1.1 +pyparsing==3.2.1 +PySocks==1.7.1 +pytest==8.3.4 +python-dateutil==2.9.0.post0 +pytorch_revgrad==0.2.0 +pytz==2024.2 +PyYAML==6.0.2 +regex==2024.11.6 +requests==2.32.3 +s3transfer==0.11.2 +safetensors==0.5.2 +scikit-learn==1.6.1 +scipy==1.13.1 +segtok==1.5.11 +sentencepiece==0.2.0 +setuptools==75.8.0 +six==1.17.0 +smart-open==7.1.0 +sortedcontainers==2.4.0 +soupsieve==2.6 +spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee +sqlitedict==2.1.0 +sympy==1.13.1 +tabulate==0.9.0 +termcolor==2.5.0 +threadpoolctl==3.5.0 +tokenizers==0.21.0 +torch==2.5.1 +tqdm==4.67.1 +transformer-smaller-training-vocab==0.4.0 +transformers==4.48.1 +triton==3.1.0 +typing_extensions==4.12.2 +tzdata==2025.1 +urllib3==2.3.0 +waitress==3.0.2 +wcwidth==0.2.13 +Werkzeug==3.1.3 +Wikipedia-API==0.8.1 +wrapt==1.17.2 \ No newline at end of file diff --git a/serve.json b/serve.json index 261db0b..3eeb486 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", - "port": 5000, - "protocol": "http", + "port": 8080, + "protocol": "https", "words":"" } diff --git a/classifier_multiclass.py b/src/classifier_multiclass.py similarity index 100% rename from classifier_multiclass.py rename to src/classifier_multiclass.py diff --git a/create_models.py b/src/create_models.py similarity index 100% rename from create_models.py rename to 
src/create_models.py diff --git a/download_code2vec_vectors.py b/src/download_code2vec_vectors.py similarity index 100% rename from download_code2vec_vectors.py rename to src/download_code2vec_vectors.py diff --git a/feature_generator.py b/src/feature_generator.py similarity index 100% rename from feature_generator.py rename to src/feature_generator.py diff --git a/tag_identifier.py b/src/tag_identifier.py similarity index 100% rename from tag_identifier.py rename to src/tag_identifier.py From 81d140a15308689db1e3946d7d61971bea18767c Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 3 May 2025 19:28:02 -0400 Subject: [PATCH 23/51] Resolve merge conflicts --- src/tag_identifier.py | 1158 ++++++++++++++--------------------------- 1 file changed, 401 insertions(+), 757 deletions(-) diff --git a/src/tag_identifier.py b/src/tag_identifier.py index f690d8f..0af80c1 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -1,757 +1,401 @@ -#original ====================================================================== -import os -import time -import joblib -import nltk -import pandas as pd -from feature_generator import * -from flask import Flask -from waitress import serve -from spiral import ronin -import json -import sqlite3 -from create_models import createModel, mutable_feature_list - -app = Flask(__name__) - -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -class ModelData: - def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: - """ - Initialize an instance of the ModelData class with word vector models. - - Args: - ModelTokens: Word vectors model for tokens. - ModelMethods: Word vectors model for methods. - ModelGensimEnglish: Word vectors model for general English words. - """ - - self.ModelTokens = modelTokens - self.ModelMethods = modelMethods - self.ModelGensimEnglish = modelGensimEnglish - self.wordCount = wordCount - -#TODO: context should probably be considered when saving tagged names -class AppCache: - def __init__(self, Path) -> None: - self.Path = Path #path to an SQL lite database - - def load(self): - #create connection to database - conn = sqlite3.connect(self.Path) - #create the table of names if it doesn't exist - cursor = conn.cursor() - cursor.execute(''' - CREATE TABLE IF NOT EXISTS names ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - name TEXT NOT NULL, - words TEXT, -- this is a JSON string - firstEncounter INTEGER, - lastEncounter INTEGER, - count INTEGER - ) - ''') - #close the database connection - conn.commit() - conn.close() - - def add(self, identifier, result): - #connection setup - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - #add identifier to table - record = { - "name": identifier, - "words": json.dumps(result["words"]), - "firstEncounter": time.time(), - "lastEncounter": time.time(), - "count": 1 - } - cursor.execute(''' - INSERT INTO names (name, words, firstEncounter, lastEncounter, count) - VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) - ''', record) - #close the database connection - conn.commit() - conn.close() - - def retrieve(self, identifier): - #return a dictionary of the name, or false if not in database - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) - row = cursor.fetchone() - conn.close() - - if row: - return { - "name": row[0], - "words": json.loads(row[1]), - "firstEncounter": row[2], - "lastEncounter": row[3], - "count": 
row[4] - } - else: - return False - - def encounter(self, identifier): - currentCount = self.retrieve(identifier)["count"] - #connection setup - conn = sqlite3.connect(self.Path) - cursor = conn.cursor() - #update record - cursor.execute(''' - UPDATE names - SET lastEncounter = ?, count = ? - WHERE name = ? - ''', (time.time(), currentCount+1, identifier)) - #close connection - conn.commit() - conn.close() - -class WordList: - def __init__(self, Path): - self.Words = set() - self.Path = Path - - def load(self): - if not os.path.isfile(self.Path): - print("Could not find word list file!") - return - with open(self.Path) as file: - for line in file: - self.Words.add(line[:line.find(',')]) #stop at comma - - def find(self, item): - return item in self.Words - -def initialize_model(): - """ - Initialize and load word vectors for the application, and load a word count DataFrame. - - This function initializes and loads word vectors using the 'createModel' function, and loads word counts - from a JSON file into a Pandas DataFrame for use in the application. - - Returns: - tuple: (ModelData, WORD_COUNT DataFrame) - """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) - -def start_server(temp_config = {}): - """ - Initialize the model and start the server. - - This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using - the waitress `serve` method, allowing incoming HTTP requests to be handled. - - The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to - listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). 
- - Returns: - None - """ - print('initializing model...') - initialize_model() - - print("setting up cache...") - if not os.path.exists('cache'): os.mkdir('cache') - - print("loading dictionary...") - nltk.download("words") - app.english_words = set(w.lower() for w in nltk.corpus.words.words()) - #insert english words from words/en.txt - if not os.path.exists("words/en.txt"): - print("could not find English words, using WordNet only!") - else: - with open("words/en.txt") as words: - for word in words: - app.english_words.add(word[:-1]) - - print('retrieving server configuration...') - data = open('serve.json') - config = json.load(data) - - server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] - server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] - server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] - - print("loading word list...") - wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] - app.words = WordList(wordListPath) - app.words.load() - - print("Starting server...") - serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) - data.close() - -def dictionary_lookup(word): - #return true if the word exists in the dictionary (the nltk words corpus) - #or if the word is in the list of approved words - dictionaryType = "" - dictionary = word.lower() in app.english_words - acceptable = app.words.find(word) - digit = word.isnumeric() - if (dictionary): - dictionaryType = "DW" - elif (acceptable): - dictionaryType = "AW" - elif (digit): - dictionaryType = "DD" - else: - dictionaryType = "UC" - - return dictionaryType - -#route to check for and create a database if it does not exist already -@app.route('/probe/') -def probe(cache_id: str): - if os.path.exists("cache/"+cache_id+".db3"): - return "Opening existing identifier database..." - else: - return "First request will create identifier database: "+cache_id+"..." - -#route to tag an identifier name -@app.route('//') -@app.route('///') -def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> List[dict]: - #check if identifier name has already been used - cache = None - #find the existing cache in app.caches or create a new one if it doesn't exist - if cache_id != None: - if os.path.exists("cache/"+cache_id+".db3"): - #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db3") - data = cache.retrieve(identifier_name) - if data != False: - cache.encounter(identifier_name) - return data - else: - #create the cache - cache = AppCache("cache/"+cache_id+".db3") - cache.load() - - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. 
- """ - print(f"INPUT: {identifier_name} {identifier_context}") - - # Split identifier_name into words - words = ronin.split(identifier_name) - - # # Create initial data frame - data = pd.DataFrame({ - 'WORD': words, - 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number - }) - - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data - data = createFeatures( - data, - mutable_feature_list, - modelGensimEnglish=app.model_data.ModelGensimEnglish, - ) - - categorical_features = ['NLTK_POS'] - category_variables = [] - - for category_column in categorical_features: - if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier - clf = joblib.load(os.path.join(SCRIPT_DIR, 'output', 'model_GradientBoostingClassifier.pkl')) - predicted_tags = annotate_identifier(clf, data) - - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] - dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) - - # append result to cache - if cache_id != None: - cache.add(identifier_name, result) - - return result - -def context_to_number(context): - """ - Convert a textual context description to a numerical representation. - - This function takes a context description as a string and maps it to a numerical representation according to a - predefined mapping. - - Args: - context (str): The textual context description. - - Returns: - int: The numerical representation of the context. - - Raises: - ValueError: If the provided context is not one of the predefined values. - - Example: - numeric_context = context_to_number("CLASS") - """ - if context == "ATTRIBUTE": - return 1 - elif context == "CLASS": - return 2 - elif context == "DECLARATION": - return 3 - elif context == "FUNCTION": - return 4 - elif context == "PARAMETER": - return 5 - -def annotate_identifier(clf, data): - """ - Annotate identifier tokens using a trained classifier. - - This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the - classifier to predict labels for the identifier tokens. - - Args: - clf (Classifier): The trained classifier model. - data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should - match the feature names used during training. - - Returns: - np.array: An array of predicted labels for the identifier tokens. 
- """ - # Drop unnecessary columns - data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') - - # Ensure only the features used during training are included - trained_features = clf.feature_names_in_ # Features expected by the classifier - missing_features = set(trained_features) - set(data.columns) - extra_features = set(data.columns) - set(trained_features) - - if missing_features: - raise ValueError(f"The following expected features are missing: {missing_features}") - if extra_features: - print(f"Warning: The following unused features are being ignored: {extra_features}") - data = data[trained_features] - - # Ensure feature order matches the trained model - df_features = data[trained_features] - - print("THESE") - print(df_features) - - print("THOSE") - print(clf.feature_names_in_) - - # Make predictions - y_pred = clf.predict(df_features) - return y_pred -#new ========================================================================================== -import os -import time -import joblib -import nltk -import pandas as pd -from src.feature_generator import createFeatures, universal_to_custom, custom_to_numeric -from flask import Flask -from waitress import serve -from spiral import ronin -import json -from src.create_models import createModel, stable_features, mutable_feature_list -app = Flask(__name__) - -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -class ModelData: - def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: - """ - Initialize an instance of the ModelData class with word vector models. - - Args: - ModelTokens: Word vectors model for tokens. - ModelMethods: Word vectors model for methods. - ModelGensimEnglish: Word vectors model for general English words. - """ - - self.ModelTokens = modelTokens - self.ModelMethods = modelMethods - self.ModelGensimEnglish = modelGensimEnglish - self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') - -class AppCache: - def __init__(self, Path, Filename) -> None: - self.Cache = {} - self.Path = Path - self.Filename = Filename - - def load(self): - if not os.path.isdir(self.Path): - raise Exception("Cannot load path: "+self.Path) - else: - if not os.path.isfile(self.Path+"/"+self.Filename): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump({}, JSONcache) - JSONcache.close() - JSONcache = open(self.Path+"/"+self.Filename, 'r') - self.Cache = json.load(JSONcache) - JSONcache.close() - - def add(self, identifier, result): - info = result - info.update({"firstEncounter": time.time()}) - info.update({"lastEncounter": time.time()}) - info.update({"count": 1}) - info.update({"version": "SCANL 1.0"}) - self.Cache.update({identifier : info}) - - def encounter(self, identifier): - self.Cache[identifier].update({"lastEncounter": time.time()}) - self.Cache[identifier].update({"count": self.Cache[identifier]["count"]+1}) - self.Cache[identifier].update({"version": "SCANL 1.0"}) - - def save(self): - JSONcache = open(self.Path+"/"+self.Filename, 'w') - json.dump(self.Cache, JSONcache) - JSONcache.close() - -class WordList: - def __init__(self, Path): - self.Words = set() - self.Path = Path - - def load(self): - if not os.path.isfile(self.Path): - print("Could not find word list file!") - return - with open(self.Path) as file: - for line in file: - self.Words.add(line[:line.find(',')]) #stop at comma - - def find(self, item): - return item in self.Words - -def initialize_model(): - """ - Initialize and load word vectors for the 
application, and load a word count DataFrame. - - This function initializes and loads word vectors using the 'createModel' function, and loads word counts - from a JSON file into a Pandas DataFrame for use in the application. - - Returns: - tuple: (ModelData, WORD_COUNT DataFrame) - """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) - -def start_server(temp_config = {}): - """ - Initialize the model and start the server. - - This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using - the waitress `serve` method, allowing incoming HTTP requests to be handled. - - The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to - listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). - - Returns: - None - """ - print('initializing model...') - initialize_model() - - print("loading cache...") - if not os.path.isdir("cache"): os.mkdir("cache") - app.cache = AppCache("cache", "cache.json") - app.studentCache = AppCache("cache", "student_cache.json") - app.cache.load() - - app.english_words = set(w.lower() for w in nltk.corpus.words.words()) - #insert english words from words/en.txt - if not os.path.exists("words/en.txt"): - print("could not find English words, using WordNet only!") - else: - with open("words/en.txt") as words: - for word in words: - app.english_words.add(word[:-1]) - - print('retrieving server configuration...') - data = open(os.path.join(SCRIPT_DIR, '..', 'serve.json')) - config = json.load(data) - - server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] - server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] - server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] - - print("loading word list...") - wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] - app.words = WordList(wordListPath) - app.words.load() - - print("Starting server...") - serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) - data.close() - -def dictionary_lookup(word): - #return true if the word exists in the dictionary (the nltk words corpus) - #or if the word is in the list of approved words - dictionaryType = "" - dictionary = word.lower() in app.english_words - acceptable = app.words.find(word) - digit = word.isnumeric() - if (dictionary): - dictionaryType = "DW" - elif (acceptable): - dictionaryType = "AW" - elif (digit): - dictionaryType = "DD" - else: - dictionaryType = "UC" - - return dictionaryType - -#TODO: this is not an intuitive way to save cache -@app.route('/') -def save(): - app.cache.save() - app.studentCache.save() - 
return "successfully saved cache" - -#TODO: use a query string instead for specifying student cache -@app.route('///') -def listen(student, identifier_name: str, identifier_context: str) -> list[dict]: - #check if identifier name has already been used - cache = None - - if (student == "student"): - cache = app.studentCache - else: - cache = app.cache - - if (identifier_name in cache.Cache.keys()): - cache.encounter(identifier_name) - return cache.Cache[identifier_name] - - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. - """ - print(f"INPUT: {identifier_name} {identifier_context}") - - # Split identifier_name into words - words = ronin.split(identifier_name) - - # # Create initial data frame - data = pd.DataFrame({ - 'WORD': words, - 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number - }) - - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data - data = createFeatures( - data, - mutable_feature_list, - modelGensimEnglish=app.model_data.ModelGensimEnglish, - ) - - categorical_features = ['NLTK_POS','PREV_POS', 'NEXT_POS'] - category_variables = [] - - for category_column in categorical_features: - if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier - clf = joblib.load(os.path.join(SCRIPT_DIR, '..', 'models', 'model_GradientBoostingClassifier.pkl')) - predicted_tags = annotate_identifier(clf, data) - - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] - dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) - - # append result to cache - cache.add(identifier_name, result) - - return result - -def context_to_number(context): - """ - Convert a textual context description to a numerical representation. - - This function takes a context description as a string and maps it to a numerical representation according to a - predefined mapping. - - Args: - context (str): The textual context description. 
- - Returns: - int: The numerical representation of the context. - - Raises: - ValueError: If the provided context is not one of the predefined values. - - Example: - numeric_context = context_to_number("CLASS") - """ - if context == "ATTRIBUTE": - return 1 - elif context == "CLASS": - return 2 - elif context == "DECLARATION": - return 3 - elif context == "FUNCTION": - return 4 - elif context == "PARAMETER": - return 5 - -def annotate_identifier(clf, data): - """ - Annotate identifier tokens using a trained classifier. - - This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the - classifier to predict labels for the identifier tokens. - - Args: - clf (Classifier): The trained classifier model. - data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should - match the feature names used during training. - - Returns: - np.array: An array of predicted labels for the identifier tokens. - """ - # Drop unnecessary columns - data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') - - # Ensure only the features used during training are included - trained_features = clf.feature_names_in_ # Features expected by the classifier - missing_features = set(trained_features) - set(data.columns) - extra_features = set(data.columns) - set(trained_features) - - if missing_features: - raise ValueError(f"The following expected features are missing: {missing_features}") - if extra_features: - print(f"Warning: The following unused features are being ignored: {extra_features}") - data = data[trained_features] - - # Ensure feature order matches the trained model - df_features = data[trained_features] - - # Make predictions - y_pred = clf.predict(df_features) - return y_pred +import os +import time +import joblib +import nltk +import pandas as pd +from src.feature_generator import createFeatures, universal_to_custom, custom_to_numeric +from flask import Flask +from waitress import serve +from spiral import ronin +import json +import sqlite3 +from src.create_models import createModel, stable_features, mutable_feature_list +app = Flask(__name__) + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +class ModelData: + def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: + """ + Initialize an instance of the ModelData class with word vector models. + + Args: + ModelTokens: Word vectors model for tokens. + ModelMethods: Word vectors model for methods. + ModelGensimEnglish: Word vectors model for general English words. 
+ """ + + self.ModelTokens = modelTokens + self.ModelMethods = modelMethods + self.ModelGensimEnglish = modelGensimEnglish + self.wordCount = wordCount + # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') + +class AppCache: + def __init__(self, Path) -> None: + self.Path = Path + + def load(self): + #create connection to database + conn = sqlite3.connect(self.Path) + #create the table of names if it doesn't exist + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS names ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + words TEXT, -- this is a JSON string + firstEncounter INTEGER, + lastEncounter INTEGER, + count INTEGER + ) + ''') + #close the database connection + conn.commit() + conn.close() + + def add(self, identifier, result): + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #add identifier to table + record = { + "name": identifier, + "words": json.dumps(result["words"]), + "firstEncounter": time.time(), + "lastEncounter": time.time(), + "count": 1 + } + cursor.execute(''' + INSERT INTO names (name, words, firstEncounter, lastEncounter, count) + VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) + ''', record) + #close the database connection + conn.commit() + conn.close() + + def retrieve(self, identifier): + #return a dictionary of the name, or false if not in database + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) + row = cursor.fetchone() + conn.close() + + if row: + return { + "name": row[0], + "words": json.loads(row[1]), + "firstEncounter": row[2], + "lastEncounter": row[3], + "count": row[4] + } + else: + return False + + def encounter(self, identifier): + currentCount = self.retrieve(identifier)["count"] + #connection setup + conn = sqlite3.connect(self.Path) + cursor = conn.cursor() + #update record + cursor.execute(''' + UPDATE names + SET lastEncounter = ?, count = ? + WHERE name = ? + ''', (time.time(), currentCount+1, identifier)) + #close connection + conn.commit() + conn.close() + +class WordList: + def __init__(self, Path): + self.Words = set() + self.Path = Path + + def load(self): + if not os.path.isfile(self.Path): + print("Could not find word list file!") + return + with open(self.Path) as file: + for line in file: + self.Words.add(line[:line.find(',')]) #stop at comma + + def find(self, item): + return item in self.Words + +def initialize_model(): + """ + Initialize and load word vectors for the application, and load a word count DataFrame. + + This function initializes and loads word vectors using the 'createModel' function, and loads word counts + from a JSON file into a Pandas DataFrame for use in the application. + + Returns: + tuple: (ModelData, WORD_COUNT DataFrame) + """ + print("Loading word vectors!!") + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) + print("Word vectors loaded!!") + + # Load the word count JSON file into a DataFrame + word_count_path = os.path.join("input", "word_count.json") + if os.path.exists(word_count_path): + print(f"Loading word count data from {word_count_path}...") + word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() + word_count_df.columns = ['word', 'log_frequency'] + print("Word count data loaded!") + else: + print(f"Word count file not found at {word_count_path}. 
Initializing empty DataFrame.") + word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) + + # Create and store model data + app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + +def start_server(temp_config = {}): + """ + Initialize the model and start the server. + + This function first initializes the model by calling the 'initialize_model' function. Then, it starts the server using + the waitress `serve` method, allowing incoming HTTP requests to be handled. + + The arguments to waitress serve are read from the configuration file `serve.json`. The default option is to + listen for HTTP requests on all interfaces (ip address 0.0.0.0, port 5000). + + Returns: + None + """ + print('initializing model...') + initialize_model() + + print("loading cache...") + if not os.path.isdir("cache"): os.mkdir("cache") + + print("laoding dictionary") + #TODO: if there's issues with uncateogorized words, it's porbably because this is commented out + #nltk.download("words") + app.english_words = set(w.lower() for w in nltk.corpus.words.words()) + #insert english words from words/en.txt + if not os.path.exists("words/en.txt"): + print("could not find English words, using WordNet only!") + else: + with open("words/en.txt") as words: + for word in words: + app.english_words.add(word[:-1]) + + print('retrieving server configuration...') + data = open(os.path.join(SCRIPT_DIR, '..', 'serve.json')) + config = json.load(data) + + server_host = temp_config["address"] if "address" in temp_config.keys() else config["address"] + server_port = temp_config["port"] if "port" in temp_config.keys() else config['port'] + server_url_scheme = temp_config["protocol"] if "protocol" in temp_config.keys() else config["protocol"] + + print("loading word list...") + wordListPath = temp_config["words"] if "words" in temp_config.keys() else config["words"] + app.words = WordList(wordListPath) + app.words.load() + + print("Starting server...") + serve(app, host=server_host, port=server_port, url_scheme=server_url_scheme) + data.close() + +def dictionary_lookup(word): + #return true if the word exists in the dictionary (the nltk words corpus) + #or if the word is in the list of approved words + dictionaryType = "" + dictionary = word.lower() in app.english_words + acceptable = app.words.find(word) + digit = word.isnumeric() + if (dictionary): + dictionaryType = "DW" + elif (acceptable): + dictionaryType = "AW" + elif (digit): + dictionaryType = "DD" + else: + dictionaryType = "UC" + + return dictionaryType + +#route to check for and create a database if it does not exist already +@app.route('/probe/') +def probe(cache_id: str): + if os.path.exists("cache/"+cache_id+".db3"): + return "Opening existing identifier database..." + else: + return "First request will create identifier database: "+cache_id+"..." 
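
#Illustrative usage (comments only, not executed): assuming the server is
#reachable at http://localhost:8080 (scheme, address and port are placeholders;
#the real values come from serve.json), a client could first probe for a
#per-project cache database and then request tags for an identifier in one of
#the contexts accepted by context_to_number() below (ATTRIBUTE, CLASS,
#DECLARATION, FUNCTION or PARAMETER):
#
#    curl http://localhost:8080/probe/myproject
#    curl http://localhost:8080/numberArray/DECLARATION/myproject
#
#The tagging route responds with JSON of the form
#    {"words": [{"number": {"tag": ..., "dictionary": "DW"}},
#               {"Array": {"tag": ..., "dictionary": "DW"}}]}
#where "tag" is the part-of-speech tag predicted by the classifier and
#"dictionary" is the DW/AW/DD/UC code assigned by dictionary_lookup() above;
#"myproject" and "numberArray" are placeholder values used only for illustration.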
+ +#route to tag an identifier name +@app.route('//') +@app.route('///') +def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> list[dict]: + #check if identifier name has already been used + cache = None + #find the existing cache in app.caches or create a new one if it doesn't exist + if cache_id != None: + if os.path.exists("cache/"+cache_id+".db3"): + #check if the identifier name is in this cache and return it if so + cache = AppCache("cache/"+cache_id+".db3") + data = cache.retrieve(identifier_name) + if data != False: + cache.encounter(identifier_name) + return data + else: + #create the cache + cache = AppCache("cache/"+cache_id+".db3") + cache.load() + + #TODO: update this documentation + """ + Process a web request to analyze an identifier within a specific context. + + This route function takes two URL parameters (identifier_name, and identifier_context) from an + incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. + It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. + + Args: + identifier_name (str): The name of the identifier to be analyzed. + identifier_context (str): The context in which the identifier appears. + + Returns: + List[dict]: A list of dictionaries containing words and their predicted POS tags. + """ + print(f"INPUT: {identifier_name} {identifier_context}") + + # Split identifier_name into words + words = ronin.split(identifier_name) + + # # Create initial data frame + data = pd.DataFrame({ + 'WORD': words, + 'SPLIT_IDENTIFIER': ' '.join(words), + 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number + }) + + # create response JSON + # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) + result = { + "words" : [] + } + + # Add features to the data + data = createFeatures( + data, + mutable_feature_list, + modelGensimEnglish=app.model_data.ModelGensimEnglish, + ) + + categorical_features = ['NLTK_POS','PREV_POS', 'NEXT_POS'] + category_variables = [] + + for category_column in categorical_features: + if category_column in data.columns: + category_variables.append(category_column) + data.loc[:, category_column] = data[category_column].astype(str) + + for category_column in category_variables: + # Explicitly handle categorical conversion + unique_values = data[category_column].unique() + category_map = {} + for value in unique_values: + if value in universal_to_custom: + category_map[value] = custom_to_numeric[universal_to_custom[value]] + else: + category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories + + data.loc[:, category_column] = data[category_column].map(category_map) + + # Convert categorical variables to numeric + # Load and apply the classifier + clf = joblib.load(os.path.join(SCRIPT_DIR, '..', 'models', 'model_GradientBoostingClassifier.pkl')) + predicted_tags = annotate_identifier(clf, data) + + # Combine words and their POS tags into a parseable format + #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] + + for i in range(len(words)): + #check dictionary + dictionary = "UC" #uncategorized + word = words[i] + dictionary = dictionary_lookup(word) + result["words"].append( + { + words[i] : { + "tag" : predicted_tags[i], + "dictionary" : dictionary + } + } + ) + + # append result to cache + if cache_id != None: + cache.add(identifier_name, result) + + return result + +def 
context_to_number(context): + """ + Convert a textual context description to a numerical representation. + + This function takes a context description as a string and maps it to a numerical representation according to a + predefined mapping. + + Args: + context (str): The textual context description. + + Returns: + int: The numerical representation of the context. + + Raises: + ValueError: If the provided context is not one of the predefined values. + + Example: + numeric_context = context_to_number("CLASS") + """ + if context == "ATTRIBUTE": + return 1 + elif context == "CLASS": + return 2 + elif context == "DECLARATION": + return 3 + elif context == "FUNCTION": + return 4 + elif context == "PARAMETER": + return 5 + +def annotate_identifier(clf, data): + """ + Annotate identifier tokens using a trained classifier. + + This function takes a trained classifier and a dataset containing features for identifier tokens. It applies the + classifier to predict labels for the identifier tokens. + + Args: + clf (Classifier): The trained classifier model. + data (pd.DataFrame): A DataFrame containing features for identifier tokens. The columns of the DataFrame should + match the feature names used during training. + + Returns: + np.array: An array of predicted labels for the identifier tokens. + """ + # Drop unnecessary columns + data = data.drop(columns=['WORD', 'SPLIT_IDENTIFIER'], errors='ignore') + + # Ensure only the features used during training are included + trained_features = clf.feature_names_in_ # Features expected by the classifier + missing_features = set(trained_features) - set(data.columns) + extra_features = set(data.columns) - set(trained_features) + + if missing_features: + raise ValueError(f"The following expected features are missing: {missing_features}") + if extra_features: + print(f"Warning: The following unused features are being ignored: {extra_features}") + data = data[trained_features] + + # Ensure feature order matches the trained model + df_features = data[trained_features] + + # Make predictions + y_pred = clf.predict(df_features) + return y_pred + From 9b7cd2e24c1a7ea9e4aef6c5f0ad8ce648a50323 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 5 May 2025 20:33:40 -0400 Subject: [PATCH 24/51] Account for context when saving identifiers --- requirements.txt | 2 +- src/tag_identifier.py | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index 00f84f5..5c0ca05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ filelock==3.17.0 flair==0.15.0 Flask==3.1.0 fonttools==4.55.6 -fsspec==2024.12.0 +fsspec==2023.5.0 ftfy==6.3.1 gdown==5.2.0 gensim==4.3.3 diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 0af80c1..6a3a889 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -43,6 +43,7 @@ def load(self): CREATE TABLE IF NOT EXISTS names ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, + context TEXT NOT NULL, words TEXT, -- this is a JSON string firstEncounter INTEGER, lastEncounter INTEGER, @@ -53,31 +54,32 @@ def load(self): conn.commit() conn.close() - def add(self, identifier, result): + def add(self, identifier, result, context): #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() #add identifier to table record = { "name": identifier, + "context": context, "words": json.dumps(result["words"]), "firstEncounter": time.time(), "lastEncounter": time.time(), "count": 1 } cursor.execute(''' - INSERT INTO names (name, 
words, firstEncounter, lastEncounter, count) - VALUES (:name, :words, :firstEncounter, :lastEncounter, :count) + INSERT INTO names (name, context, words, firstEncounter, lastEncounter, count) + VALUES (:name, :context, :words, :firstEncounter, :lastEncounter, :count) ''', record) #close the database connection conn.commit() conn.close() - def retrieve(self, identifier): + def retrieve(self, identifier, context): #return a dictionary of the name, or false if not in database conn = sqlite3.connect(self.Path) cursor = conn.cursor() - cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ?", (identifier,)) + cursor.execute("SELECT name, words, firstEncounter, lastEncounter, count FROM names WHERE name = ? AND context = ?", (identifier, context)) row = cursor.fetchone() conn.close() @@ -92,8 +94,8 @@ def retrieve(self, identifier): else: return False - def encounter(self, identifier): - currentCount = self.retrieve(identifier)["count"] + def encounter(self, identifier, context): + currentCount = self.retrieve(identifier, context)["count"] #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() @@ -171,9 +173,8 @@ def start_server(temp_config = {}): if not os.path.isdir("cache"): os.mkdir("cache") print("laoding dictionary") - #TODO: if there's issues with uncateogorized words, it's porbably because this is commented out - #nltk.download("words") app.english_words = set(w.lower() for w in nltk.corpus.words.words()) + #insert english words from words/en.txt if not os.path.exists("words/en.txt"): print("could not find English words, using WordNet only!") @@ -236,9 +237,9 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) if os.path.exists("cache/"+cache_id+".db3"): #check if the identifier name is in this cache and return it if so cache = AppCache("cache/"+cache_id+".db3") - data = cache.retrieve(identifier_name) + data = cache.retrieve(identifier_name, identifier_context) if data != False: - cache.encounter(identifier_name) + cache.encounter(identifier_name, identifier_context) return data else: #create the cache @@ -329,7 +330,7 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) # append result to cache if cache_id != None: - cache.add(identifier_name, result) + cache.add(identifier_name, result, identifier_context) return result From dc1222cd588b680cc2f3da6611b118d9e820c61d Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Mon, 5 May 2025 21:02:41 -0400 Subject: [PATCH 25/51] Save time to tag an identifier in database --- src/tag_identifier.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 6a3a889..305e390 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -47,14 +47,15 @@ def load(self): words TEXT, -- this is a JSON string firstEncounter INTEGER, lastEncounter INTEGER, - count INTEGER + count INTEGER, + tagTime INTEGER -- time it took to tag the identifier ) ''') #close the database connection conn.commit() conn.close() - def add(self, identifier, result, context): + def add(self, identifier, result, context, tag_time): #connection setup conn = sqlite3.connect(self.Path) cursor = conn.cursor() @@ -65,11 +66,12 @@ def add(self, identifier, result, context): "words": json.dumps(result["words"]), "firstEncounter": time.time(), "lastEncounter": time.time(), - "count": 1 + "count": 1, + "tagTime": tag_time } cursor.execute(''' - INSERT INTO names (name, context, 
words, firstEncounter, lastEncounter, count) - VALUES (:name, :context, :words, :firstEncounter, :lastEncounter, :count) + INSERT INTO names (name, context, words, firstEncounter, lastEncounter, count, tagTime) + VALUES (:name, :context, :words, :firstEncounter, :lastEncounter, :count, :tagTime) ''', record) #close the database connection conn.commit() @@ -263,6 +265,9 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) """ print(f"INPUT: {identifier_name} {identifier_context}") + # get the start time + start_time = time.perf_counter() + # Split identifier_name into words words = ronin.split(identifier_name) @@ -328,9 +333,12 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) } ) + # get time it took to tag the identifier + tag_time = time.perf_counter() - start_time + # append result to cache if cache_id != None: - cache.add(identifier_name, result, identifier_context) + cache.add(identifier_name, result, identifier_context, tag_time) return result From 0ac367b2e516cbefccadcd54395ab59ac4a0592d Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Fri, 9 May 2025 21:41:46 -0400 Subject: [PATCH 26/51] Removed unused dependencies --- Dockerfile | 5 +++-- requirements.txt | 13 ------------- serve.json | 2 +- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 54f54e7..d0e20b1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,11 @@ FROM python:3.12-slim # Install (and build) requirements COPY requirements.txt /requirements.txt -RUN apt-get update --fix-missing && \ +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && \ + apt-get update --fix-missing && \ apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ - rm -rf /var/lib/apt/lists/* + apt-get clean && rm -rf /var/lib/apt/lists/* COPY . . RUN pip install -e . 
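
For reference, a minimal sketch (not part of the patches themselves) of how the tagTime values
recorded by [PATCH 25/51] could be read back from one of the per-project cache databases under
cache/; the file name cache/example.db3 is a placeholder:

    import sqlite3

    conn = sqlite3.connect("cache/example.db3")  # placeholder database name
    cursor = conn.cursor()
    # tagTime stores the seconds measured with time.perf_counter() when the
    # identifier was first tagged; count tracks how often it has been requested.
    cursor.execute("SELECT name, context, count, tagTime FROM names ORDER BY tagTime DESC")
    for name, context, count, tag_time in cursor.fetchall():
        print(f"{name} ({context}): requested {count}x, tagged in {tag_time:.3f}s")
    conn.close()
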
diff --git a/requirements.txt b/requirements.txt index 5c0ca05..450f86d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,18 +42,6 @@ mpmath==1.3.0 networkx==3.4.2 nltk==3.9.1 numpy==1.26.4 -nvidia-cublas-cu12==12.4.5.8 -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.0.70 -nvidia-cufft-cu12==11.2.1.3 -nvidia-curand-cu12==10.3.5.147 -nvidia-cusolver-cu12==11.6.1.9 -nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.21.5 -nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 packaging==24.2 pandas==2.2.3 pillow==11.1.0 @@ -93,7 +81,6 @@ torch==2.5.1 tqdm==4.67.1 transformer-smaller-training-vocab==0.4.0 transformers==4.48.1 -triton==3.1.0 typing_extensions==4.12.2 tzdata==2025.1 urllib3==2.3.0 diff --git a/serve.json b/serve.json index 3eeb486..6ecacbc 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", - "port": 8080, + "port": 5000, "protocol": "https", "words":"" } From 14dd5c1784b5208ddae19ec0cbabfd392e8e76b3 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 10 May 2025 04:34:25 -0400 Subject: [PATCH 27/51] Optional GPU accelaration in Docker --- Dockerfile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d0e20b1..b747297 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,18 @@ FROM python:3.12-slim +#argument to enable GPU accelaration +ARG GPU=false + # Install (and build) requirements COPY requirements.txt /requirements.txt +COPY requirements_gpu.txt /requirements_gpu.txt RUN apt-get clean && rm -rf /var/lib/apt/lists/* && \ apt-get update --fix-missing && \ apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ + if [ "$GPU" = true ]; then \ + pip install -r requirements_gpu.txt; \ + fi && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY . . 
@@ -72,4 +79,4 @@ CMD date; \ echo "Running..."; \ /main -r --words words/abbreviationList.csv -ENV TZ=US/Michigan \ No newline at end of file +ENV TZ=US/Michigan From b4c62d15d1ba53b32cd3eef785ffa6c27b8db31a Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 10 May 2025 04:38:34 -0400 Subject: [PATCH 28/51] Separate GPU accelaration dependencies --- requirements_gpu.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 requirements_gpu.txt diff --git a/requirements_gpu.txt b/requirements_gpu.txt new file mode 100644 index 0000000..801da7d --- /dev/null +++ b/requirements_gpu.txt @@ -0,0 +1,13 @@ +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +triton==3.1.0 \ No newline at end of file From 9eaa5763e4d76be3bc629cfe9fcceb72c4f4b348 Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sat, 10 May 2025 23:02:26 -0400 Subject: [PATCH 29/51] =?UTF-8?q?=E2=80=9CUpdate=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements_gpu.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/requirements_gpu.txt b/requirements_gpu.txt index 801da7d..c9a1ba1 100644 --- a/requirements_gpu.txt +++ b/requirements_gpu.txt @@ -2,12 +2,11 @@ nvidia-cublas-cu12==12.4.5.8 nvidia-cuda-cupti-cu12==12.4.127 nvidia-cuda-nvrtc-cu12==12.4.127 nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.0.70 +nvidia-cudnn-cu12==9.1.1.17 nvidia-cufft-cu12==11.2.1.3 nvidia-curand-cu12==10.3.5.147 nvidia-cusolver-cu12==11.6.1.9 nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.21.5 +nvidia-nccl-cu12==2.23.4 nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 -triton==3.1.0 \ No newline at end of file +nvidia-nvtx-cu12==12.4.127 \ No newline at end of file From 47dbb3b13e25a9092ec8b70833dfff68dcd4aaaa Mon Sep 17 00:00:00 2001 From: Brandon Scholten Date: Sun, 18 May 2025 16:47:24 -0400 Subject: [PATCH 30/51] Update serve.json --- serve.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serve.json b/serve.json index 6ecacbc..3eeb486 100644 --- a/serve.json +++ b/serve.json @@ -1,6 +1,6 @@ { "address": "0.0.0.0", - "port": 5000, + "port": 8080, "protocol": "https", "words":"" } From 73ac7d409674166dc6f5d02fc86759ed02d23145 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Mon, 2 Jun 2025 19:24:39 -0400 Subject: [PATCH 31/51] Add new type of tagger --- input/tagger_data.tsv | 2610 +++++++++++++++++ main | 207 +- requirements.txt | 17 +- src/lm_based_tagger/__init__.py | 0 .../distilbert_preprocessing.py | 187 ++ src/lm_based_tagger/distilbert_tagger.py | 178 ++ src/lm_based_tagger/train_model.py | 127 + src/tag_identifier.py | 74 +- src/tree_based_tagger/__init__.py | 0 .../classifier_multiclass.py | 143 +- src/{ => tree_based_tagger}/create_models.py | 0 .../download_code2vec_vectors.py | 0 .../feature_generator.py | 0 13 files changed, 3353 insertions(+), 190 deletions(-) create mode 100644 input/tagger_data.tsv create mode 100644 src/lm_based_tagger/__init__.py create mode 100644 src/lm_based_tagger/distilbert_preprocessing.py create mode 100644 src/lm_based_tagger/distilbert_tagger.py create mode 100644 src/lm_based_tagger/train_model.py create mode 100644 
src/tree_based_tagger/__init__.py rename src/{ => tree_based_tagger}/classifier_multiclass.py (64%) rename src/{ => tree_based_tagger}/create_models.py (100%) rename src/{ => tree_based_tagger}/download_code2vec_vectors.py (100%) rename src/{ => tree_based_tagger}/feature_generator.py (100%) diff --git a/input/tagger_data.tsv b/input/tagger_data.tsv new file mode 100644 index 0000000..4b6a6df --- /dev/null +++ b/input/tagger_data.tsv @@ -0,0 +1,2610 @@ +TYPE SPLIT CONTEXT GRAMMAR_PATTERN LANGUAGE SYSTEM_NAME +ABCOpt ABC Opt CLASS NM N C++ swift +int64t abs deadline n sec value PARAMETER NM NM NM NM N C++ grpc +AbstractBuildExecution Abstract Build Execution CLASS NM NM N Java jenkins +ofVec3f accelerometer Data PARAMETER NM N C openFrameworks +boolean accepting Tasks PARAMETER NM NPL Java jenkins +AccessorConformanceInfo Accessor Conformance Info CLASS NM NM N C++ swift +Map action To Index Map ATTRIBUTE N P NM N Java antlr4 +ActionTranslator Action Translator CLASS NM N Java antlr4 +KToggleAction action View Show Master Pages ATTRIBUTE NM N V NM NPL C++ calligra +AtomicInteger active Copies ATTRIBUTE NM NPL Java elasticsearch +QString active Tool Id DECLARATION NM NM N C++ calligra +int actual Count ATTRIBUTE NM N Java mockito +int actual Suffix DECLARATION NM N Java junit4 +void Add Bezier Curve FUNCTION V NM N C++ bullet3 +boolean add Bias To Embedding ATTRIBUTE V N P N Java corenlp +void Add Log To Stream FUNCTION V N P N C++ Telegram +void add Object Index FUNCTION V NM N C++ blender +void add Ordered List FUNCTION V NM N Java mockito +void add Unicode Script Codes To Names FUNCTION V NM NM NPL P NPL Java antlr4 +Guid Added Unicode Smp ATTRIBUTE NM NM N C# antlr4 +boolean adding DECLARATION V Java jenkins +grpcresolvedaddress addr PARAMETER N C grpc +AddressLowering Address Lowering CLASS N V C++ swift +int ADDRESS TYPE IPV4 ATTRIBUTE NM NM N Java okhttp +QGradient adjusted Gradient FUNCTION NM N C++ calligra +gsecaeadcrypter aead crypter seal DECLARATION NM NM N C++ grpc +void after Channel FUNCTION P N Java jenkins +List all action Roots DECLARATION DT NM NPL Java antlr4 +vector all Factors PARAMETER DT NPL C++ QuantLib +List all Invocation Matchers DECLARATION DT NM NPL Java mockito +QString all Open Files String FUNCTION DT NM NM N C++ kdevelop +List all Open Indices DECLARATION DT NM NPL Java elasticsearch +Collection all Stubbings ATTRIBUTE DT NPL Java mockito +AltAndContextConfigEqualityComparator Alt And Context Config Equality Comparator CLASS N CJ NM NM NM N Java antlr4 +Map alt Label Ctxs ATTRIBUTE NM NM NPL Java antlr4 +String alt Name PARAMETER NM N Java okhttp +path alternative Cache Name PARAMETER NM NM N C++ irrlicht +gboolean alwayspreview always preview DECLARATION VM V C gimp +ofFloatColor ambient ATTRIBUTE N C openFrameworks +float ambient coefficient PARAMETER NM N C++ bullet3 +QStringList anchor Name DECLARATION NM N C++ calligra +AndroidByteBuddyMockMaker Android Byte Buddy Mock Maker CLASS NM NM NM NM N Java mockito +qreal angle In Radian DECLARATION N P N C++ calligra +GtkWidget animation box DECLARATION NM N C gimp +KPrPredefinedAnimationsLoader animations Data PARAMETER NM NPL C++ calligra +String App ID PARAMETER NM N Java opencv +void append Python Style Escaped Code Point FUNCTION V NM N NM NM N Java antlr4 +void append Sequence FUNCTION V N C++ kdevelop +void apply Margin Insets FUNCTION V NM NPL Java Telegram +ApplyRewriter Apply Rewriter CLASS NM N C++ swift +ArabicSegmenter Arabic Segmenter CLASS NM N Java corenlp +AreaInfoDialog_t Area Info Dialog t 
CLASS NM NM NM N C gimp.idents +AresDnsResolver Ares Dns Resolver CLASS NM NM N C++ grpc +int ares expand name for response FUNCTION PRE V N P N C grpc +Collection arg Mismatch Stubbings PARAMETER NM NM NPL Java mockito +void ARGB Subtract Row C FUNCTION N V NM N C++ Telegram +Class array Class PARAMETER NM N Java junit4 +ArrayCountPropagation Array Count Propagation CLASS NM NM N C++ swift +SILValue Array Struct Value PARAMETER NM NM N C++ swift +void assign Lexer Token Types FUNCTION V NM NM NPL Java antlr4 +Set assigns PARAMETER NPL Java junit4 +int atn State DECLARATION NM N C# antlr4 +int ATOM ATTRIBUTE N Java antlr4 +Attribute Attribute CLASS N C++ openFrameworks +OrderedHashSet attribute Decls ATTRIBUTE NM NPL Java antlr4 +QXmlStreamAttributes attrs DECLARATION NPL C++ calligra +AUDSound AUD Sound envelope FUNCTION PRE NM N C++ blender +Authentication auth Result PARAMETER NM N Java jenkins +uint average In Bucket Used Slot Count ATTRIBUTE N P NM NM NM N C kdevelop +uint8x8t avg 1 DECLARATION N D C opencv +Map avg Map DECLARATION NM N Java corenlp +BusinessDayConvention b d c PARAMETER NM NM N C++ QuantLib +bool b Flip Horizontally PARAMETER PRE V VM C++ openFrameworks +boolean b Is Playing ATTRIBUTE PRE V V Java openFrameworks +bool b Loop ATTRIBUTE PRE N C++ openFrameworks +bool b Multi Fullscreen PARAMETER PRE NM N C++ openFrameworks +void b3 Push Profile Timing FUNCTION PRE V NM N C++ bullet3 +BackgroundParserPrivate Background Parser Private CLASS NM N NM C++ kdevelop +boolean backprop Training PARAMETER NM N Java corenlp +String base Category DECLARATION NM N Java corenlp +BaseLoaderCallback Base Loader Callback CLASS NM NM N Java opencv +btTransform base transform DECLARATION NM N C++ bullet3 +BasicReplicationRequest Basic Replication Request CLASS NM NM N Java elasticsearch +BasketPayoff Basket Payoff CLASS NM N C++ QuantLib +Map beanConfigs bean Configs ATTRIBUTE NM NPL Java jenkins +List befores ATTRIBUTE NPL Java junit4 +guchar best b DECLARATION NM N C gimp +IndexedWord best Node DECLARATION NM N Java corenlp +XIMStyle best Style DECLARATION NM N C++ irrlicht +QString bibliography Type PARAMETER NM N C++ calligra +Constructor biggest Constructor FUNCTION NM N Java mockito +BinaryRule Binary Rule CLASS NM N Java corenlp +String binary Search Bytes FUNCTION NM N NPL Java okhttp +MMatrix bind Matrix DECLARATION NM N C++ ogre +s16 binormal type DECLARATION NM N C++ irrlicht +u32 Bitmap Data Size ATTRIBUTE NM NM N C irrlicht +int BKE lattice index flip FUNCTION PRE NM N V C blender +void BLI str r strip FUNCTION PRE N VM V C blender +unsigned block Nr PARAMETER NM N C++ calligra +BMLog bm log ATTRIBUTE NM N C blender +vector bone Positions PARAMETER NM NPL C++ ogre +int bound Port PARAMETER NM N C# grpc +float Bounding Radius ATTRIBUTE NM N C irrlicht +LongConsumer breaker Consumer PARAMETER NM N Java elasticsearch +BreakpointDataPtr breakpoint ATTRIBUTE N C++ kdevelop +SILValue Bridged Value Fun Arg PARAMETER NM NM NM N C++ swift +quint32 brush Hatch PARAMETER NM N C++ calligra +btCollisionShape bt Collision Shape CLASS PRE NM N C++ bullet3 +class btDeformableContactConstraint bt Deformable Contact Constraint CLASS PRE NM NM N C++ bullet3 +btMaterial bt Material CLASS PRE N C++ bullet3 +BucketCollector Bucket Collector CLASS NM N Java elasticsearch +ParseField BUCKETS PATH FIELD ATTRIBUTE NM NM N Java elasticsearch +u32 buf Num PARAMETER NM N C++ irrlicht +vector buffers ATTRIBUTE NPL C++ Telegram +BuildContext build Context PARAMETER NM N Java antlr4 +void build Group Query 
FUNCTION V NM N Java elasticsearch +void build Lexer Rule Actions FUNCTION V NM NM NPL Java antlr4 +gint build time minor PARAMETER NM NM N C gimp +long build Timestamp ATTRIBUTE NM N Java jenkins +BuildWrapper Build Wrapper CLASS NM N Java jenkins +Path built Url PARAMETER NM N C++ kdevelop +ByteArrayView byte Array View PARAMETER NM NM N C++ kdevelop +uint32t byte Count DECLARATION NM N C Telegram +int8t byte Src DECLARATION NM N C++ Telegram +long bytes Received ATTRIBUTE NPL NM Java okhttp +CAnimatedMeshSceneNode C Animated Mesh Scene Node CLASS PRE NM NM NM N C++ irrlicht +CArchiveLoaderTAR C Archive Loader TAR CLASS PRE NM N NM C++ irrlicht +CAtBackgroundColor C At Background Color CLASS PRE NM NM N C++ openFrameworks +CAtFont C At Font CLASS PRE NM N C++ openFrameworks +CAtGrayScale C At Grayscale CLASS PRE NM N C++ openFrameworks +CBurningShader_Raster_Reference C Burning Shader Raster Reference CLASS PRE NM NM NM N C++ irrlicht +CD3D9RenderTarget C D3D9 Render Target CLASS PRE NM NM N C++ irrlicht +CFileReadCallBack C File Read Callback CLASS PRE NM NM N C++ irrlicht +Chttp2Connector C http2 Connector CLASS PRE NM N C++ grpc +CImage C Image CLASS PRE N C++ irrlicht +CImageWriterJPG C Image Writer JPG CLASS PRE NM N NM C++ irrlicht +ceresproblemt c problem PARAMETER NM N C++ blender +CSkinnedMesh C Skinned Mesh CLASS PRE NM N C++ irrlicht +Sink cache Body Unbuffered DECLARATION NM N NM Java okhttp +IndexShardCacheEntity cache Entity DECLARATION NM N Java elasticsearch +QString cache file PARAMETER NM N C++ kdevelop +CacheStrategy Cache Strategy CLASS NM N Java okhttp +CachedRegionTracker Cached Region Tracker CLASS NM NM N Java Telegram +BytecodeGenerator caching Mock Bytecode Generator ATTRIBUTE NM NM NM N Java mockito +u32 calc LUT FUNCTION V N C++ opencv +bool calibration Phase ATTRIBUTE NM N C++ QuantLib +CallErrorExtensions Call Error Extensions CLASS NM NM NPL C# grpc +long call Id PARAMETER NM N Java okhttp +float cam Rot Z DECLARATION NM NM N C++ bullet3 +CameraData Camera Data CLASS NM N C++ irrlicht +GimpCanvasItem canvas item ATTRIBUTE NM N C gimp +vector cap Times DECLARATION NM NPL C++ QuantLib +CapletVarianceCurve Caplet Variance Curve CLASS NM NM N C++ QuantLib +sharedptr cat Risk ATTRIBUTE NM N C++ QuantLib +CategoryFilterFactory Category Filter Factory CLASS NM NM N Java junit4 +Matcher cause Matcher PARAMETER NM N Java junit4 +grpcsslcertificateconfigreloadstatus cb result DECLARATION NM N C++ grpc +CborXContent Cbor X Content CLASS NM NM N Java elasticsearch +FrameIterator cell Cursor DECLARATION NM N C++ calligra +CertificateChainCleaner certificate Chain Cleaner ATTRIBUTE NM NM N Java okhttp +char Ch DECLARATION N C++ grpc +ChangeScroll Change Scroll CLASS V N C++ calligra +ChapterTocFrame Chapter Toc Frame CLASS NM NM N Java Telegram +stbttbuf char strings ATTRIBUTE NM NPL C bullet3 +bool check GL Support FUNCTION V NM N C++ openFrameworks +QSet check Next DECLARATION V N C++ kdevelop +void check Not Interface FUNCTION V VM N Java mockito +void check Not Local FUNCTION V VM N Java mockito +void check Sign FUNCTION V N C++ QuantLib +void check T FUNCTION V N C++ QuantLib +void Check Writeable FUNCTION V NM C# grpc +Set child Categories DECLARATION NM NPL Java junit4 +ChildLocation Child Location CLASS NM N C gimp.idents +Runnable child Statement PARAMETER NM N Java junit4 +ChineseTreebankParserParams Chinese Treebank Parser Params CLASS NM NM NM NPL Java corenlp +void choose Document FUNCTION V N C++ kdevelop +QList chosen Overrides ATTRIBUTE NM NPL C++ 
kdevelop +CipherSuite Cipher Suite CLASS NM N Java okhttp +auto clang Can Ty DECLARATION PRE NM N C++ swift +Annotation[] class Annotations FUNCTION NM NPL Java junit4 +ClassLoaders Class Loaders CLASS NM NPL Java mockito +byte[] classfile Buffer PARAMETER NM N Java mockito +ClauseMatrix Clause Matrix CLASS NM N C++ swift +void Clear Active ID FUNCTION V NM N C++ bullet3 +OkHttpClient client ATTRIBUTE N Java okhttp +HandshakeCertificates client Certificates DECLARATION NM NPL Java okhttp +int client Number ATTRIBUTE NM N Java corenlp +bool clip Dist Bug DECLARATION NM NM N C++ ogre +Clock Clock CLASS N C++ openFrameworks +ClusterUpdateSettingsResponse Cluster Update Settings Response CLASS NM NM NM N Java elasticsearch +List cmd Lines DECLARATION NM NPL Java jenkins +CmdOutputOperationType Cmd Output Operation Type CLASS NM NM NM N C++ ogre +String cmd Str PARAMETER NM N Java corenlp +sharedptr cms Pricer DECLARATION NM N C++ QuantLib +String code Exception Message DECLARATION NM NM N Java okhttp +CodeGeneratorPrivate Code Generator Private CLASS NM N NM C++ kdevelop +int code Point To PARAMETER NM N P Java antlr4 +int[] CODES ATTRIBUTE NPL Java okhttp +long col Length Left DECLARATION NM NM N C++ bullet3 +KoColorConversionSystem color Conversion System ATTRIBUTE NM NM N C++ calligra +ColorMap Color Map CLASS NM N C++ calligra +QMap color Map PARAMETER NM N C++ opencv +uint colored Count ATTRIBUTE NM N C kdevelop +CombinePaintMaskToCanvasBufferToPaintBufAlpha Combine Paint Mask To Canvas Buffer To Paint Buf Alpha CLASS NM NM N P NM N P NM NM N C++ gimp.idents +Command_t Command t CLASS NM N C gimp.idents +CompMask Comp Mask CLASS NM N C++ gimp.idents +Field compare Fields By Name DECLARATION V NPL P N Java mockito +int compare Inverse And Forward Dynamics FUNCTION V NM CJ NM NPL C++ bullet3 +TValue comparison Value PARAMETER NM N C# antlr4 +CompilerItem Compiler Item CLASS NM N C++ kdevelop +bool compiling Contexts DECLARATION V NPL C kdevelop +BatchCompletionDelegate Completion Handler I Unary Response Client Callback ATTRIBUTE NM N NM NM NM NM N C# grpc +PixelComponentType component Type ATTRIBUTE NM N C ogre +List concrete Follower Indices PARAMETER NM NM NPL Java elasticsearch +mat4 cone translation DECLARATION NM N C++ openFrameworks +ConfigureTimeouts Configure Timeouts CLASS NM NPL Java okhttp +serverconnectionstate connection state PARAMETER NM N C++ grpc +ConstParameterFloat Const Parameter Float CLASS NM NM N C++ ogre +ContainerTabBar Container Tab Bar CLASS NM NM N C++ kdevelop +ContentPath Content Path CLASS NM N Java elasticsearch +String content Type String PARAMETER NM NM N Java okhttp +int context Length ATTRIBUTE NM N Java junit4 +ContextTokenListIndexedGetterDecl Context Token List Indexed Getter Decl CLASS NM NM NM NM NM N Java antlr4 +QList context Url List FUNCTION NM NM N C++ kdevelop +ContextualizeClosures Contextualize Closures CLASS V NPL C++ swift +QVector control Points DECLARATION NM NPL C++ calligra +ControlledObject Controlled Object CLASS NM N C++ blender +ControllerValue Controller Value CLASS NM N C++ ogre +string Convert To Php Namespace FUNCTION V P NM N C++ grpc +b3ConvexPolyhedronData convex Shapes PARAMETER NM NPL C bullet3 +CookieJar cookie Jar ATTRIBUTE NM N Java okhttp +ActionBarMenuItem copy Item DECLARATION NM N Java Telegram +GsrProcessCore core ATTRIBUTE N C++ QuantLib +opusint64[] corr QC DECLARATION NM N C Telegram +sharedptr coterminal Model DECLARATION NM N C++ QuantLib +char[] cp 1254 DECLARATION N D C++ calligra +CppGeneratorServices Cpp 
Generator Services CLASS NM NM NPL C# grpc +CqEventQueue Cq Event Queue CLASS NM NM N C++ grpc +int create Duplicate Change Id FUNCTION V NM NM N C++ calligra +ParameterPtr create In Indices FUNCTION V NM NPL C++ ogre +bool create metadata array FUNCTION V NM N C++ grpc +Buffer create Sub Buffer FUNCTION V NM N C++ opencv +grpc_server_credentials* creds_ creds ATTRIBUTE NPL C grpc +CRFNonLinearLogConditionalObjectiveFunction CRF Non Linear Log Conditional Objective Function CLASS NM NM NM NM NM NM N Java corenlp +GeglNode crop node ATTRIBUTE NM N C gimp +Exception curr Thread Exception DECLARATION NM NM N Java junit4 +Mat current Charuco Corners DECLARATION NM NM NPL C++ opencv +String current Filename ATTRIBUTE NM N Java corenlp +InvocationOnMock current Invocation PARAMETER NM N Java mockito +int16_t current_median current median PARAMETER NM N C Telegram +int current Open Shards DECLARATION NM NM NPL Java elasticsearch +String current Prefix PARAMETER NM N Java jenkins +Matrix current Root PARAMETER NM N C++ QuantLib +ISceneNode Current Scene Node ATTRIBUTE NM NM N C++ irrlicht +vector current Sequence DECLARATION NM N C++ QuantLib +int current Slide FUNCTION NM N C++ calligra +gint curvatures height PARAMETER NM N C gimp +vector curves FUNCTION NPL C++ QuantLib +DBusProxy D Bus Proxy CLASS NM NM N C++ kdevelop +float d inf DECLARATION NM N Java Telegram +Mat D mat PARAMETER NM N C++ opencv +DobjPoints D obj Points CLASS NM NM NPL C gimp.idents +D3D11UnsupportedGpuProgram D3D11 Unsupported Gpu Program CLASS NM NM NM N C++ ogre +DataAccessRepositoryPrivate Data Access Repository Private CLASS NM NM N NM C++ kdevelop +FrameworkMethod data Point Method PARAMETER NM NM N Java junit4 +GimpDebugPolicy debug policy ATTRIBUTE NM N C gimp +DeclarationContextPrivate Declaration Context Private CLASS NM N NM C++ kdevelop +DeclarationItem Declaration Item CLASS NM N C++ kdevelop +void declare Component FUNCTION V N C++ kdevelop +Object deep Stub FUNCTION NM N Java mockito +guchar default PARAMETER N C gimp +Answer defaultAnswer default Answer ATTRIBUTE NM N Java mockito +AutoConstantEntry default Auto Entry PARAMETER NM NM N C++ ogre +BlockedItem DEFAULT BLOCKED ITEM COMPARATOR DECLARATION NM NM NM N Java jenkins +ParametersRunnerFactory DEFAULT FACTORY ATTRIBUTE NM N Java junit4 +int DEFAULT FLAGS ATTRIBUTE NM NPL Java elasticsearch +DefaultInjectionEngine Default Injection Engine CLASS NM NM N Java mockito +void default instance void ATTRIBUTE NM NM N C opencv +ColorManagedLook default look DECLARATION NM N C blender +Matx default mat x PARAMETER NM NM N C++ opencv +DefaultMockingDetails Default Mocking Details CLASS NM NM NPL Java mockito +bool default open PARAMETER NM N C bullet3 +String default Role PARAMETER NM N Java jenkins +DefaultSslRootsOverride Default Ssl Roots Override CLASS NM NM NPL NM C# grpc +Define Define CLASS N C++ calligra +int delay agnostic enabled ATTRIBUTE NM N V C Telegram +DeleteResponse delete FUNCTION V Java elasticsearch +void Delete Input FUNCTION V N C grpc +AbstractCoreLabel dep H PARAMETER NM N Java corenlp +ST dependencies ST DECLARATION NM N Java antlr4 +QString DEPOT MESSAGE START DECLARATION NM NM N C++ kdevelop +freenectdepthcb depth cb ATTRIBUTE NM N C openFrameworks +bool depth Stencil As Texture ATTRIBUTE NM N P N C openFrameworks +DeserializationContext Deserialization Context CLASS NM N C# grpc +dimension2d desktop Size PARAMETER NM N C++ irrlicht +GimpVector2 dest points PARAMETER NM NPL C gimp +gboolean destroy with parent PARAMETER V P N C gimp +bool 
Detect Leaks ATTRIBUTE V NPL C grpc +Strictness determine Strictness FUNCTION V N Java mockito +InstallState DEVELOPMENT ATTRIBUTE N Java jenkins +DeviceHandler Device Handler CLASS NM N C++ opencv +DialogElements Dialog Elements CLASS NM NPL C gimp.idents +void dialog info update FUNCTION NM N V C gimp +Real discount Factor PARAMETER NM N C++ QuantLib +DiskLruCache Disk Lru Cache CLASS NM NM N Java okhttp +DispatchMaskBufferIterator Dispatch Mask Buffer Iterator CLASS NM NM NM N C++ gimp.idents +DispatchPaintMask Dispatch Paint Mask CLASS NM NM N C++ gimp.idents +DisposeType dispose PARAMETER N C gimp +DiscountFactor dividend Discount Mother FUNCTION NM NM N C++ QuantLib +bool do Caps PARAMETER V NPL C++ QuantLib +HttpResponse do Forward FUNCTION V V Java jenkins +HttpResponse do Install Status FUNCTION V NM N Java jenkins +void do Login Entry FUNCTION V NM N Java jenkins +bool do Print After FUNCTION V V P C++ swift +DockWidgetArea docking Area PARAMETER NM N C++ kdevelop +DOTGenerator DOT Generator CLASS NM N Java antlr4 +int down sample PARAMETER NM N C Telegram +Real drift term DECLARATION NM N C++ QuantLib +String driver Type Name PARAMETER NM NM N C++ ogre +OutputArray dst map 2 PARAMETER NM N D C++ opencv +float dst Saturation DECLARATION NM N C calligra +int dst Type 1 DECLARATION NM N D C++ opencv +DualConInputReader Dual Con Input Reader CLASS NM NM NM N C++ blender +void dump cemd cmd FUNCTION V NM N C openFrameworks +void dump Core Map To String Builder FUNCTION V NM N P NM N Java corenlp +void dump Frame buffer Formats FUNCTION V NM NM NPL C irrlicht +DvbSubtitleReader Dvb Subtitle Reader CLASS NM NM N Java Telegram +int dynamic Table Byte Count ATTRIBUTE NM NM NM N Java okhttp +int dynamic Table Index FUNCTION NM NM N Java okhttp +DynamicTexturedCubeDemo Dynamic Textured Cube Demo CLASS NM NM NM N C++ bullet3 +bool echo path change PARAMETER V NM N C Telegram +gint edit count ATTRIBUTE NM N C gimp +unsigned Edit Length PARAMETER NM N C++ swift +Style effective Style FUNCTION NM N C++ calligra +GLboolean EGLEW ANDROID frame buffer target ATTRIBUTE PRE PRE NM NM N C blender +PFNCREATEPLATFORMWINDOWSURFACE eglew Create Platform Window Surface ATTRIBUTE PRE V NM NM N C blender +GLboolean EGLEW KHR stream fifo ATTRIBUTE PRE PRE NM N C blender +GParamSpec element spec DECLARATION NM N C gimp +VersionNumber EMBEDDED VERSION ATTRIBUTE NM N Java jenkins +ManagedValue emit Address FUNCTION V N C++ swift +FileObserver[] EMPTY DIRECTORY ATTRIBUTE NM N Java elasticsearch +void enable Background Opacity FUNCTION V NM N C++ calligra +void enable Materials FUNCTION V NPL C++ openFrameworks +freenectdeviceflags enabled subdevices ATTRIBUTE NM NPL C openFrameworks +List encoded Path Segments ATTRIBUTE NM NM NPL Java okhttp +List encoded Values PARAMETER NM N Java okhttp +boolean end Of Input DECLARATION N P N Java antlr4 +ImmutableTextSnapshotRef End Snapshot PARAMETER NM N C++ swift +int enum Count DECLARATION NM N Java junit4 +String env Name PARAMETER NM N Java antlr4 +EnvVarsSlaveInfo_DisplayName Env Vars Slave Info Display Name CLASS NM NM NM NM NM N Java jenkins +Real EPS PARAMETER N C++ QuantLib +EqualsBuilder equals Builder DECLARATION NM N Java mockito +int equals Offset DECLARATION NM N Java okhttp +T err ret PARAMETER NM N C++ Telegram +uint error Mark Type DECLARATION NM NM N C++ kdevelop +ESetTextureActive esa PARAMETER N C irrlicht +Date event 0 DECLARATION N D C++ QuantLib +ArrayList exception Channels ATTRIBUTE NM NPL Java Telegram +int excess Workload PARAMETER NM N Java 
jenkins +Executor Executor CLASS N Java jenkins +List executors DECLARATION NPL Java jenkins +path existing symlink PARAMETER NM N C++ grpc +Set expand Headers From Request FUNCTION V NPL P N Java elasticsearch +int expect rows DECLARATION NM NPL C++ opencv +ExpectedException Expected Exception CLASS NM N Java junit4 +String expected String DECLARATION NM N Java junit4 +ExplicitEulerScheme Explicit Euler Scheme CLASS NM NM N C++ QuantLib +ExpressionFinder Expression Finder CLASS NM N C++ swift +ClusteredBitVector extra Inhabitants Mask DECLARATION NM NM N C++ swift +List extra Interfaces DECLARATION NM NPL Java mockito +TimeSeries extract Component FUNCTION V N C++ QuantLib +boolean extract Events ATTRIBUTE V NPL Java corenlp +Class extract Raw Type Of FUNCTION V NM NM P Java mockito +int f Context Length ATTRIBUTE PRE NM N Java junit4 +Real f Cos DECLARATION NM N C++ ogre +bool f curve Found DECLARATION NM N V C++ ogre +FakeMetaMethod f m m DECLARATION NM NM N C++ kdevelop +Matcher f Matcher ATTRIBUTE PRE N Java junit4 +float f Ptr Out DECLARATION PRE NM N C++ openFrameworks +Real f Tolerance PARAMETER NM N C++ ogre +int face index DECLARATION NM N C blender +Class factory Class DECLARATION NM N Java junit4 +bool fast Load Success DECLARATION VM V N C++ ogre +FdBlackScholesVanillaEngine Fd Black Scholes Vanilla Engine CLASS NM NM NM NM N C++ QuantLib +auto feature Iterator DECLARATION NM N C++ antlr4 +FeedAdapter FEED ADAPTER ATTRIBUTE NM N Java jenkins +void fence FUNCTION N C++ opencv +unsigned field Offset Vector DECLARATION NM NM N C++ swift +SourceFile File ATTRIBUTE N C++ swift +FileChannel file Channel PARAMETER NM N Java okhttp +RepeatedField file Descriptor Proto ATTRIBUTE NM NM N C# grpc +FileItemDelegate File Item Delegate CLASS NM NM N C++ calligra +FileOperator File Operator CLASS NM N Java okhttp +FilePathFilter File Path Filter CLASS NM NM N Java jenkins +FileSystemArchive File System Archive CLASS NM NM N C++ ogre +String filter Spec DECLARATION NM N Java junit4 +List filtered Children ATTRIBUTE NM NPL Java junit4 +Response final Response PARAMETER NM N Java elasticsearch +Metadata find Hashable Base Type FUNCTION V NM NM N C++ swift +String find Source Subdir FUNCTION V NM N Java antlr4 +int fine priority PARAMETER NM N C Telegram +AlertDialog fingerprint Dialog ATTRIBUTE NM N Java Telegram +boolean finished Normally DECLARATION V VM Java elasticsearch +int first Space DECLARATION NM N Java okhttp +gchar first type label PARAMETER NM NM N C gimp +ICameraSceneNode fixed Cam DECLARATION NM N C++ irrlicht +Frequency fixed Leg Frequency DECLARATION NM NM N C++ QuantLib +FixedObject Fixed Object CLASS NM N C++ blender +int flag ATTRIBUTE N C blender +Pair flags Classifier Pair PARAMETER NM NM N Java corenlp +float Float FUNCTION N C++ bullet3 +DayCounter float Day Counter ATTRIBUTE NM NM N C++ QuantLib +vector folder Names PARAMETER NM NPL C++ openFrameworks +MetadataResponse follow Component FUNCTION V N C++ swift +Request followUp DECLARATION N Java okhttp +FootnotesPosition footnotes Position ATTRIBUTE NM N C++ calligra +bool force Direct PARAMETER V N C++ kdevelop +bool forked iter DECLARATION V N C blender +String format Display Name FUNCTION V NM N Java junit4 +boolean format OK DECLARATION N NM Java antlr4 +FormattedText Formatted Text CLASS NM N Java mockito +opusint16[] frame PARAMETER N C Telegram +Frame2 Frame 2 CLASS N D C++ blender +float[] frame pixel coords PARAMETER NM NM NPL C blender +pointer free cell ATTRIBUTE NM N C gimp +int[] free Positions DECLARATION 
NM NPL Java corenlp +BytesReference from Byte Buffers FUNCTION P NM NPL Java elasticsearch +btVector3 from Local Aabb Min DECLARATION P NM NM N C++ bullet3 +boolean from Server PARAMETER P N Java Telegram +gchar full path PARAMETER NM N C gimp +InputArray Func PARAMETER N C++ opencv +IDocument future Active Doc DECLARATION NM NM N C++ kdevelop +Real gauss Lobatto Eps PARAMETER NM NM N C++ QuantLib +String GEN SUBJ ATTRIBUTE NM N Java corenlp +bool generate Rule Bypass Transitions DECLARATION V NM NM NPL C# antlr4 +void Generate Service Descriptor Property FUNCTION V NM NM N C++ grpc +String generated Token On Creation DECLARATION NM N P N Java jenkins +GenericEmissiveClosure Generic Emissive Closure CLASS NM NM N C++ blender +GenericTypeExtractor Generic Type Extractor CLASS NM NM N Java mockito +GeometryInterface Geometry Interface CLASS NM N C++ bullet3 +Object[] get Actual Values FUNCTION V NM NPL Java junit4 +String get Artificial Op Prec Rule FUNCTION V NM NM NM N Java antlr4 +Side get Binary Side FUNCTION V NM N Java corenlp +String get Commit Id FUNCTION V NM N Java jenkins +long get Completed FUNCTION V NM Java elasticsearch +ConstantReference get Constant Reference For Protocol Descriptor FUNCTION V NM N P NM N C++ swift +void get Controller Transform FUNCTION V NM N C++ bullet3 +TextureAtlasAttib get Default Atlasing Attributes FUNCTION V NM NM NPL C++ ogre +InstallState get Default Install State FUNCTION V NM NM N Java jenkins +ColourValue get Diffuse Colour FUNCTION V NM N C++ ogre +String get Display Path FUNCTION V NM N Java jenkins +bool Get DMF Header FUNCTION V NM N C irrlicht +List get Error Listeners FUNCTION V NM NPL Java antlr4 +Long get Failure Timestamp FUNCTION V NM N Java junit4 +GetHomeDirectory Get Home Directory CLASS V NM N Java jenkins +String get Id For Name FUNCTION V N P N Java elasticsearch +int get Initial Window Size FUNCTION V NM NM N Java okhttp +Descriptor get Item Type Descriptor FUNCTION V NM NM N Java jenkins +ematrix6 get Jf FUNCTION V N C++ blender +Tree get Leftmost Descendant FUNCTION V NM N Java corenlp +double Get Longitude FUNCTION V N C# grpc +int get Max Shingle Diff FUNCTION V NM NM N Java elasticsearch +int Get Meta Index FUNCTION V NM N C Telegram +FunctionType get Msg Send Super Ty FUNCTION V NM NM NM N C++ swift +int get Number Of Transitions FUNCTION V N P NPL Java antlr4 +long get Operations Reads FUNCTION V NM NPL Java elasticsearch +String get Phrase 1 FUNCTION V N D Java corenlp +int get PID No Exceptions FUNCTION V N DT NPL Java corenlp +Vector3 get Plane Point FUNCTION V NM N C ogre +Method get Protocol Method PARAMETER NM NM N Java okhttp +Object[] get Raw Arguments FUNCTION V NM NPL Java mockito +float get Red Adjust 2 FUNCTION V NM N D C++ ogre +double Get Related View Data Row Double FUNCTION V NM NM NM N NM C++ grpc +String GET SOURCE NAME ATTRIBUTE NM NM N Java elasticsearch +Real get Top Border Size FUNCTION V NM NM N C++ ogre +GetTotalDiskSpace Get Total Disk Space CLASS V NM NM N Java jenkins +path get Working Directory FUNCTION V NM N C++ irrlicht +int Get XCR 0 FUNCTION V N D C++ Telegram +void gimp canvas rectangle set property FUNCTION PRE NM N V N C gimp +gboolean gimp devices check change FUNCTION PRE NPL V V C gimp +gboolean gimp eraser default FUNCTION PRE N NM C gimp +void gimp filter tool set gyroscope FUNCTION PRE NM N V N C gimp +GimpThumbnail gimp image file get thumbnail FUNCTION PRE NM N V N C gimp +void gimp param drawable id init FUNCTION PRE NM NM N V C gimp +void gimp selection tool class init 
FUNCTION PRE NM NM N V C gimp +void gimp status bar progress canceled FUNCTION PRE NM NM N V C gimp +void gimp value set int32 array FUNCTION PRE N V NM N C gimp +gboolean gimp wire compare FUNCTION PRE N V C gimp +GLConfigAttribs GL Config Attribs CLASS PRE NM NPL C++ ogre +PFNGLINDEXMASKPROC glew Index Mask ATTRIBUTE PRE NM N C blender +PFNGLRENDERMODEPROC glew Render Mode ATTRIBUTE PRE NM N C blender +PFNGLVALIDATEPROGRAMPIPELINEPROC glew Validate Program Pipeline ATTRIBUTE PRE V NM N C ogre +ObjectValue global Scope FUNCTION NM N C++ kdevelop +GlobalsAsMembersTableReaderInfo Globals As Members Table Reader Info CLASS NPL P NPL NM NM N C++ swift +GLXConfigurator GLX Configurator CLASS NM N C++ ogre +int gpencil primitive modal FUNCTION NM NM N C blender +QList group Boxes ATTRIBUTE NM NPL C++ kdevelop +GroupByContext Group By Context CLASS N P N Java elasticsearch +String group Role Attribute PARAMETER NM NM N Java jenkins +string grouping PARAMETER N C++ grpc +void grpc chttp2 mark stream writable FUNCTION PRE PRE V N NM C++ grpc +void grpc json writer value string FUNCTION PRE NM NM NM N C++ grpc +void grpc sock addr make wildcards FUNCTION PRE NM N V NPL C++ grpc +Hpack H pack CLASS NM N Java okhttp +void H Wnd ATTRIBUTE N NM C irrlicht +bool Handle If FUNCTION V CJ C++ ogre +QPointF handle Pos PARAMETER NM N C++ calligra +DenseSet handled Boxes ATTRIBUTE NM NPL C++ swift +HandshakeMode Handshake Mode CLASS NM N Java elasticsearch +bool has Composite Op FUNCTION V NM N C++ calligra +bool has View Relative Texture Coordinate Generation FUNCTION V NM NM NM NM N C++ ogre +bool have Mask DECLARATION V N C++ opencv +HDRListener HDR Listener CLASS NM N C++ ogre +HeatVisionListener Heat Vision Listener CLASS NM NM N C++ ogre +gint height int ATTRIBUTE NM N C gimp +FastVectorHighlighter highlighter PARAMETER N Java elasticsearch +QAction history Action ATTRIBUTE NM N C++ kdevelop +HtmlFile html File PARAMETER NM N C++ calligra +Http1ExchangeCodec Http1 Exchange Codec CLASS NM NM N Java okhttp +int http2 Error Code PARAMETER NM NM N Java okhttp +IllegalArgumentException i ar e PARAMETER NM NM N Java junit4 +IReadFile I Read File CLASS NM NM N C++ irrlicht +SILDeclRef i var Initializer PARAMETER NM NM N C++ swift +gint32 ico load layer FUNCTION PRE V N C gimp +IcyDecoder Icy Decoder CLASS NM N Java Telegram +long id A PARAMETER NM N Java jenkins +int ID LUMNINANCE MIN ATTRIBUTE NM NM N Java Telegram +Map id Mention DECLARATION N NM Java corenlp +GQuark identifier quark DECLARATION NM N C gimp +long idle Delay ATTRIBUTE NM N Java jenkins +IdpConfiguration idp Configuration DECLARATION NM N Java elasticsearch +int idx u DECLARATION NM N C blender +vector im out shape ATTRIBUTE NM NM N C++ opencv +ImageManager Image Manager CLASS NM N C++ blender +void imb stereo3d read interlace FUNCTION PRE NM V N C blender +stbi_uc img buffer original ATTRIBUTE NM N NM C ogre +IplImage[] img stub DECLARATION NM N C++ opencv +FREEIMAGETYPE img Type DECLARATION NM N C++ openFrameworks +String IMPLICIT GROUP KEY DECLARATION NM NM N Java elasticsearch +Grammar import G PARAMETER NM N Java antlr4 +wchart in PARAMETER N C++ bullet3 +float in buffer PARAMETER NM N C openFrameworks +Strategy in Cache DECLARATION P N Java Telegram +VerificationMode inOrderWrappedVerificationMode in Order Wrapped Verification Mode ATTRIBUTE P N NM NM N Java mockito +bool in SCC FUNCTION P N C++ swift +boolean inbound PARAMETER NM Java okhttp +IndexData index Data ATTRIBUTE NM NPL C ogre +String INDEX PREFIX WITH TEMPLATE ATTRIBUTE NM N 
P N Java elasticsearch +int index Tensor PARAMETER NM N C++ opencv +int index To Loc Format ATTRIBUTE N P NM N C bullet3 +Integer[] indexes Of Suspicious Args DECLARATION NPL P NM NPL Java mockito +IntPtr inertial Frame DECLARATION NM N C# bullet3 +InetSocketAddress inet Socket Address ATTRIBUTE NM NM N Java okhttp +Real inflation Leg NPV FUNCTION NM NM N C++ QuantLib +void init For Group FUNCTION V P N Java Telegram +Throwable INITIALIZATION ERROR ATTRIBUTE NM N Java mockito +void Initialize Dual Graph FUNCTION V NM N C++ bullet3 +float inner Alpha ATTRIBUTE NM N Java Telegram +Radian inner Angle PARAMETER NM N C++ ogre +auto inst Results DECLARATION NM NPL C++ swift +InstallUncaughtExceptionHandler Install Uncaught Exception Handler CLASS V NM NM N Java jenkins +QString institution PARAMETER N C++ calligra +string int Hex String PARAMETER NM NM N C++ openFrameworks +Type[] interface Bounds DECLARATION NM NPL Java mockito +Internal Internal CLASS NM Java okhttp +InternalFFMpegRegister Internal FFMpeg Register CLASS NM NM N C++ opencv +Real inv Flight K 2 DECLARATION NM NM N D C++ QuantLib +float inv unit scale DECLARATION NM NM N C blender +Interval INVALID ATTRIBUTE NM Java antlr4 +String INVALID HOST ATTRIBUTE NM N Java okhttp +int INVALID STATE NUMBER ATTRIBUTE NM NM N Java antlr4 +InverseDynamicsExample Inverse Dynamics Example CLASS NM NM N C++ bullet3 +bool Invert Success ATTRIBUTE V N C++ swift +Vector3 inverted Direction DECLARATION NM N C++ ogre +OutputStream ios PARAMETER N Java jenkins +int is a empty DECLARATION V DT N C grpc +boolean is Android FUNCTION V N Java mockito +boolean is Blocked By Shutdown FUNCTION V V P N Java jenkins +BitVector Is Bridged Argument ATTRIBUTE V NM N C++ swift +bool is canon FUNCTION V N C blender +boolean is Conscrypt Preferred FUNCTION V N V Java okhttp +boolean is Dependency Changed FUNCTION V N V Java antlr4 +ScorePhraseMeasures IS FIRST CAPITAL ATTRIBUTE V NM N Java corenlp +boolean is First Frame ATTRIBUTE V NM N Java okhttp +bool is Friend PARAMETER V N C++ kdevelop +bool is Generic Type Disambiguating Token FUNCTION V NM NM NM N C++ swift +bool Is Indirect Result ATTRIBUTE V NM N C swift +bool is Node A Left Child Leaf DECLARATION V N DT NM NM N C++ bullet3 +int is partial ATTRIBUTE V N C opencv +bool Is Return Bridged ATTRIBUTE V N NM C++ swift +bool is Stdlib Module FUNCTION V NM N C++ swift +bool is Sum Supported FUNCTION V N V C++ opencv +bool is Trained DECLARATION V V C++ opencv +IsVariadic is V PARAMETER V NM C++ swift +bool is Vaild PARAMETER V NM C++ ogre +bool is Valid Trailing Closure FUNCTION V NM NM N C++ swift +double items per second DECLARATION NPL P N C++ grpc +JarURLConnection jar URL Connection DECLARATION NM NM N Java jenkins +JFlexDummyLexer JFlex Dummy Lexer CLASS PRE NM N Java corenlp +JntArrayAcc Jnt Array Acc CLASS NM NM N C++ blender +u32 joint Start DECLARATION NM N C++ irrlicht +bool keep Aspect ATTRIBUTE V N C++ calligra +List keep Readability Only On Descendants FUNCTION V NM VM P NPL Java jenkins +QString key PARAMETER N C++ kdevelop +int key Begin DECLARATION NM N Java okhttp +queue key Events Copy DECLARATION NM NM N C++ openFrameworks +KeyFrame key Frame 2 PARAMETER NM N D C++ ogre +Setting KEY PASSWORD PROFILES ATTRIBUTE NM NM NPL Java elasticsearch +KeyStatus Key Status CLASS NM N Java Telegram +Keysym key sym ATTRIBUTE NM N C ogre +map kinects Copy DECLARATION NM N C++ openFrameworks +KoMainWindowPrivate Ko Main Window Private CLASS PRE NM N NM C++ calligra +KoPathPointRemoveCommandPrivate Ko Path Point 
Remove Command Private CLASS PRE NM NM NM N NM C++ calligra +KoRgbU8InvertColorTransformation Ko Rgb U8 Invert Color Transformation CLASS PRE NM NM NM NM N C++ calligra +KoSectionEndPrivate Ko Section End Private CLASS PRE NM N NM C++ calligra +KoShadowStylePrivate Ko Shadow Style Private CLASS PRE NM N NM C++ calligra +ListBase l b layer PARAMETER NM NM N C blender +LabelAction Label Action CLASS NM N C++ calligra +gchar label casefold DECLARATION NM N C gimp +LabelDrawingWidget Label Drawing Widget CLASS NM NM N C++ calligra +int label Op PARAMETER NM N Java antlr4 +LabelElementPair label Pair PARAMETER NM N Java antlr4 +Pattern label Regex ATTRIBUTE NM N Java corenlp +int[] labels PARAMETER NPL C++ opencv +String labels File PARAMETER NM N Java corenlp +HashMap language To Rules Files ATTRIBUTE N P NM NPL Java corenlp +Array last Gradient FUNCTION NM N C++ QuantLib +Size last Saved Step ATTRIBUTE NM NM N C++ QuantLib +bool last Token Was Delete Or Default DECLARATION NM N V N CJ N C++ kdevelop +String LATENCY ARG ATTRIBUTE NM N Java elasticsearch +GimpValueArray layer get composite mode invoker FUNCTION NM V NM NM N C gimp +GeglNode layer mask source node ATTRIBUTE NM NM NM N C gimp +LayerParameter Layer Parameter CLASS NM N C++ opencv +vector layer sizes DECLARATION NM NPL C++ opencv +LayoutData Layout Data CLASS NM N C++ calligra +double[] learned Lop Expert Weights 2 D PARAMETER NM NM NM NPL D N Java corenlp +Lease Lease CLASS N Java jenkins +int led Color PARAMETER NM N Java Telegram +LeftRecursiveRuleFunction Left Recursive Rule Function CLASS NM NM NM N Java antlr4 +int left Sisters Buffer DECLARATION NM NM N Java corenlp +int len r PARAMETER NM N C++ bullet3 +LessDummyGuiHelper Less Dummy Gui Helper CLASS NM NM NM N C++ bullet3 +unsigned lhs Component DECLARATION NM N C++ swift +LightAttenuationValue Light Attenuation Value CLASS NM NM N C++ ogre +vector3df light Dim PARAMETER NM N C++ irrlicht +double line 1 grad DECLARATION NM D N C gimp +int line index mask len PARAMETER NM NM NM N C blender +int line Y DECLARATION NM N C++ calligra +c8[] Link Name ATTRIBUTE NM N C irrlicht +QUrl link URL DECLARATION NM N C++ calligra +ListLevel List Level CLASS NM N C++ calligra +ResultBucket literal Bucket PARAMETER NM N C++ swift +int loaded mentions count DECLARATION NM NM N Java Telegram +LocalRef Local Ref CLASS NM N Java antlr4 +vector locations ATTRIBUTE NPL C++ QuantLib +bool locking PARAMETER V C++ grpc +LogMixedLinearCubicInterpolation Log Mixed Linear Cubic Interpolation CLASS NM NM NM NM N C++ QuantLib +Logger LOGGER ATTRIBUTE N Java jenkins +String[] logger Name Parts DECLARATION NM NM NPL Java jenkins +Real lower Boundary Factor FUNCTION NM NM N C++ QuantLib +int lower tail DECLARATION NM N C++ calligra +String lp Binary Path Name PARAMETER PRE NM NM N Java jenkins +LsmBasisSystem Lsm Basis System CLASS NM NM N C++ QuantLib +LVLCurrency LVL Currency CLASS NM N C++ QuantLib +VcsAnnotation m annotation ATTRIBUTE PRE N C++ kdevelop +b3Scalar m contact Motion 1 ATTRIBUTE PRE NM N D C++ bullet3 +int m count Activities ATTRIBUTE PRE NM NPL Java openFrameworks +bool m execute On Host ATTRIBUTE PRE V P N C++ kdevelop +int m FBO Height ATTRIBUTE PRE NM N Java opencv +uint8 mFirstRenderQueue m First Render Queue ATTRIBUTE PRE NM NM N C++ ogre +b3OpenCLArray m gpu Rays ATTRIBUTE PRE NM NPL C++ bullet3 +bool m is indx present ATTRIBUTE PRE V NM N C++ opencv +AtomicBoolean m is Worker Done ATTRIBUTE PRE V N NM Java openFrameworks +Cursor m last Changed Location ATTRIBUTE PRE NM NM N C++ 
kdevelop +float m Line Dash Offset ATTRIBUTE PRE NM NM N C++ openFrameworks +int m num Visual Shapes Copied ATTRIBUTE PRE NM NM NM N C bullet3 +Mode m paste Mode ATTRIBUTE PRE NM N C++ calligra +char[] m post Fix ATTRIBUTE PRE NM N C bullet3 +boolean m Preview Started ATTRIBUTE PRE N NM Java opencv +MultiBodyTree m reference ATTRIBUTE PRE N C++ bullet3 +UserDataRequestArgs m remove User Data Response Args ATTRIBUTE PRE NM NM NM NM NPL C bullet3 +Resources m Resources ATTRIBUTE PRE NPL Java opencv +streambuf m sbuf ATTRIBUTE PRE N C++ ogre +Pass m Shadow Receiver Pass ATTRIBUTE PRE NM NM N C ogre +Quaternion m Sky Box Orientation ATTRIBUTE PRE NM NM N C ogre +uint8 m Sky Plane Render Queue ATTRIBUTE PRE NM NM NM N C ogre +QStringList m text Types ATTRIBUTE PRE NM NPL C++ kdevelop +b3TransformChangeNotificationArgs m transform Change Args ATTRIBUTE PRE NM NM NPL C bullet3 +ParameterPtr m VS Out Light Position ATTRIBUTE PRE NM NM NM N C ogre +Queue m weaver ATTRIBUTE PRE N C++ kdevelop +int m window Width ATTRIBUTE PRE NM N C++ bullet3 +CodeCompletionWorker m worker ATTRIBUTE PRE N C++ kdevelop +ZoomController m zoom Controller ATTRIBUTE PRE NM N C++ kdevelop +MainClass Main Class CLASS NM N C# grpc +String make HTML Table FUNCTION V NM N Java corenlp +String MANIFEST FILE PREFIX ATTRIBUTE NM NM N Java elasticsearch +guchar mapped color PARAMETER NM N C gimp +vector mapped labels DECLARATION NM NPL C++ opencv +Maps Maps CLASS NPL Java corenlp +void mark As Fetching FUNCTION V P V Java elasticsearch +MaskComponents Mask Components CLASS NM NPL C++ gimp.idents +boolean match By IP FUNCTION V P N Java elasticsearch +ExpectedExceptionMatcherBuilder matcher Builder ATTRIBUTE NM N Java junit4 +boolean matches Any Parent Categories FUNCTION V DT NM NPL Java junit4 +long max Age Seconds Long DECLARATION NM NM NPL NM Java okhttp +sharedptr max basket DECLARATION NM N C++ QuantLib +int max Buffer Size ATTRIBUTE NM NM N C++ opencv +MaxCore Max Core CLASS NM N Java junit4 +int max Draw Buffers FUNCTION NM NM NPL C++ openFrameworks +quint64 max File Open ATTRIBUTE NM N NM C++ kdevelop +sizet max input size DECLARATION NM NM N C++ grpc +int max Intermediate Cas PARAMETER NM NM NPL Java okhttp +int max Requests Per Host PARAMETER NM NPL P N Java okhttp +double max scale f PARAMETER NM NM N C++ opencv +int max width ATTRIBUTE NM N C++ openFrameworks +sizet max Work Group Size ATTRIBUTE NM NM NM N C++ opencv +void maximize All FUNCTION V DT C++ openFrameworks +int media Chunk Index PARAMETER NM NM N Java Telegram +MediaChunkIterator Media Chunk Iterator CLASS NM NM N Java Telegram +uint melanin ofs DECLARATION NM N C blender +MentionDetectionEvaluator Mention Detection Evaluator CLASS NM NM N Java corenlp +String merged Type DECLARATION NM N Java elasticsearch +MetadataSnapshot metadata Snapshot FUNCTION NM N Java elasticsearch +Class method Handles DECLARATION NM NPL Java mockito +auto method index DECLARATION NM N C++ grpc +String[] method Name Prefixes PARAMETER NM NM NPL Java junit4 +double Min Error PARAMETER NM N C++ opencv +int min Font Size ATTRIBUTE NM NM N Java corenlp +int min Fresh PARAMETER NM N Java okhttp +int min Fresh Seconds PARAMETER NM NM NPL Java okhttp +TransducerGraph minimized Random FA DECLARATION NM NM N Java corenlp +MINIMUM_SUPPORTED_VERSION MINIMUM SUPPORTED VERSION CLASS NM NM N Java jenkins +Set missing Classes DECLARATION NM NPL Java elasticsearch +vector mkt Factors PARAMETER NM NPL C++ QuantLib +MockCreationSettings mock Creation Settings PARAMETER NM NM NPL Java mockito 
+MockCreationValidator Mock Creation Validator CLASS NM NM N Java mockito +Method mock Method DECLARATION NM N Java mockito +MockReference mock Ref ATTRIBUTE NM N Java mockito +MockitoAssertionError Mockito Assertion Error CLASS PRE NM N Java mockito +ModificationInterface mod Iface DECLARATION NM N C++ kdevelop +Sezpoz module Finder ATTRIBUTE NM N Java jenkins +ModuleHandler Module Handler CLASS NM N Java mockito +MonoPInvokeCallbackAttribute Mono P Invoke Callback Attribute CLASS NM NM NM NM N C# grpc +int MORE ATTRIBUTE DT Java antlr4 +TsurgeonPattern move RBT surgeon ATTRIBUTE NM NM N Java corenlp +MediaHandler movie Media Handler DECLARATION NM NM N C++ openFrameworks +Mp4Extractor Mp4 Extractor CLASS NM N Java Telegram +MP4Input MP4 Input CLASS NM N Java Telegram +MpegAudioReader Mpeg Audio Reader CLASS NM NM N Java Telegram +mutex mtx ATTRIBUTE N C++ opencv +MultiCubicSpline MultiCubic Spline CLASS NM N C++ QuantLib +MultipleFailureException Multiple Failure Exception CLASS NM NM N Java junit4 +bool multiple Occurences ATTRIBUTE NM NPL C++ calligra +MultiPolygon MultiPolygon CLASS N Java elasticsearch +ThreadMXBean mx Bean DECLARATION NM N Java junit4 +NormalDistribution n d ATTRIBUTE NM N C++ QuantLib +gint32 n layers PARAMETER NM NPL C gimp +int nrepeats n repeats DECLARATION NM NPL C++ opencv +float n Shininess PARAMETER NM N C++ openFrameworks +vector3d n vector PARAMETER NM N C irrlicht +LinearLayout name Container ATTRIBUTE NM N Java Telegram +List named Writeables PARAMETER NM NPL Java elasticsearch +bool Nav Visible ATTRIBUTE N NM C bullet3 +int nbr gaps DECLARATION NM NPL C blender +int nd Formula DECLARATION NM N C++ calligra +int nearest point DECLARATION NM N C blender +bool need fallback DECLARATION V N C blender +guint neighbor pos DECLARATION NM N C gimp +NetStateRuleDefaultTypeInternal Net State Rule Default Type Internal CLASS NM NM NM NM N NM C++ opencv +Object network Security Policy DECLARATION NM NM N Java okhttp +NewAggregateBuilderMap New Aggregate Builder Map CLASS NM NM NM N C++ swift +Exchange new Exchange FUNCTION NM N Java okhttp +MappedFieldType new Field Type PARAMETER NM NM N Java elasticsearch +List new Files PARAMETER NM NPL Java corenlp +gint new image height DECLARATION NM NM N C gimp +int new order DECLARATION NM N C blender +String new Rule Text DECLARATION NM NM N Java antlr4 +SILType new Sil Type DECLARATION NM NM N C++ swift +QList new Strokes ATTRIBUTE NM NPL C++ calligra +VerificationMode new Verification Mode PARAMETER NM NM N Java mockito +long new Warning Header Size DECLARATION NM NM NM N Java elasticsearch +float new Y ATTRIBUTE NM N C++ calligra +QRegularExpression next Fragment Expression DECLARATION NM NM N C++ kdevelop +char next func PARAMETER NM N C blender +int next Giphy Search Offset ATTRIBUTE NM NM NM N Java Telegram +string next Line PARAMETER NM N C++ kdevelop +byte[] next Search DECLARATION NM N Java elasticsearch +ParameterSignature next Unassigned FUNCTION NM NM Java junit4 +bool nla invert combine value FUNCTION NM V NM N C blender +ShaderOutput node find output by name FUNCTION N V N P N C++ blender +NodeShape Node Shape CLASS NM N C++ blender +CSGNoiseSource noise 1 DECLARATION N D C++ ogre +JsonArrayBuilder non Greedy States Builder DECLARATION NM NM NM N Java antlr4 +sizet nonce length ATTRIBUTE NM N C++ grpc +void notify Touch Down FUNCTION V NM N C++ openFrameworks +MockitoException null Passed To Verify No More Interactions FUNCTION N V P V VM DT NPL Java mockito +NullProgram Null Program CLASS NM N C++ ogre 
+Class nullable Class PARAMETER NM N Java junit4 +boolean nulls Ok FUNCTION NPL NM Java junit4 +Num Num CLASS N C# grpc +int num Active Contexts PARAMETER NM NM NPL C++ bullet3 +u16 Num Active Tris ATTRIBUTE NM NM NPL C++ irrlicht +gint num axis events ATTRIBUTE NM NM NPL C gimp +u32 num body parts ATTRIBUTE NM NM NPL C irrlicht +int num cols ATTRIBUTE NM NPL C blender +Size num Factors PARAMETER NM NPL C++ QuantLib +int num Files PARAMETER NM NPL C++ openFrameworks +u32 num groups ATTRIBUTE NM NPL C irrlicht +int num Keys PARAMETER NM NPL Java corenlp +gint num light ATTRIBUTE NM N C gimp +sizet num metadata DECLARATION NM N C++ grpc +int num Outs ATTRIBUTE NM NPL Java openFrameworks +sizet num primes DECLARATION NM NPL C Telegram +int num States DECLARATION NM NPL Java corenlp +int num Tess Face Data ATTRIBUTE NM NM NM N C blender +int16t num Vec Per Segment DECLARATION NM N P N C Telegram +int num Verts In A DECLARATION NM NPL P N C bullet3 +Size number Elementary Vegas ATTRIBUTE NM NM NPL C++ QuantLib +u32 number Of Joysticks DECLARATION N P NPL C++ irrlicht +u16 Number Start ATTRIBUTE NM N C irrlicht +String number Str PARAMETER NM N Java jenkins +QString numbering Path DECLARATION NM N C++ calligra +OAuthSession OAuth Session CLASS NM N Java okhttp +OAuthSessionFactory OAuth Session Factory CLASS NM NM N Java okhttp +ObjectProjection obj Proj DECLARATION NM N C++ swift +Object object With To String FUNCTION N P P N Java junit4 +OdfSymbolType odf Symbol Type ATTRIBUTE NM NM N C++ calligra +OFAndroidLifeCycleHelper OF Android Life Cycle Helper CLASS PRE NM NM NM N Java openFrameworks +OFAndroidObject OF Android Object CLASS PRE NM N Java openFrameworks +OFAndroidSoundPlayer OF Android Sound Player CLASS PRE NM NM N Java openFrameworks +OFAndroidWindow OF Android Window CLASS PRE NM N Java openFrameworks +OFOrientationListener OF Orientation Listener CLASS PRE NM N Java openFrameworks +string of To Binary FUNCTION PRE P N C++ openFrameworks +grpcclosure on connect PARAMETER P N C grpc +void on Group Call Key Sent FUNCTION P NM NM N NM Java Telegram +Void on Implies FUNCTION P V Java jenkins +OnItemLongClickListener on Item Long Click Listener ATTRIBUTE P NM NM NM N Java Telegram +boolean one Document DECLARATION NM N Java corenlp +Notification ongoing Call Notification ATTRIBUTE NM NM N Java Telegram +int OP CODE CONTINUATION ATTRIBUTE NM NM N Java okhttp +OpPool Op Pool CLASS NM N C++ opencv +String[] open Class Tags DECLARATION NM NM NPL Java corenlp +DeclTable operator Method Decls PARAMETER NM NM NPL C++ swift +bool optimize Identity Cast Composition FUNCTION V NM NM N C++ swift +OrderWith order With DECLARATION V P Java junit4 +Request order With FUNCTION V P Java junit4 +List ordered Invocations PARAMETER NM NPL Java mockito +float ori W DECLARATION NM N C++ bullet3 +MockingDetails original Mocking Details ATTRIBUTE NM NM NPL Java mockito +Set original Set PARAMETER NM N Java elasticsearch +long other Data Len Bits ATTRIBUTE NM NM NM NPL Java Telegram +gdouble other side x ATTRIBUTE NM NM N C gimp +char out buf PARAMETER NM N C irrlicht +double[] out d G DECLARATION N NM NM C Telegram +char[] out table PARAMETER NM N C bullet3 +TestRule outer Rule PARAMETER NM N Java junit4 +T output array PARAMETER NM N C++ grpc +OutputDelegatePrivate Output Delegate Private CLASS NM N NM C++ kdevelop +int overlap PARAMETER N C Telegram +auto overriden Function It DECLARATION NM NM N C++ kdevelop +short own flags PARAMETER NM NPL C blender +unzglobalinfo p global info 32 PARAMETER PRE NM N D C 
bullet3 +long packet Sample Count PARAMETER NM NM N Java Telegram +Optional packet sent ATTRIBUTE NM N C grpc +auto PAI Arg DECLARATION NM N C++ swift +GtkWidget paint radio DECLARATION NM N C gimp +bool palette poll FUNCTION N V C blender +sizet palette size PARAMETER NM N C++ bullet3 +ParallelComputer Parallel Computer CLASS NM N Java junit4 +ParameterDef param Def DECLARATION NM N C ogre +Assignments parameter Assignment PARAMETER NM N Java junit4 +ParameterSignature Parameter Signature CLASS NM N Java junit4 +String PARENT ATTRIBUTE N Java jenkins +vector parent field PARAMETER NM N C++ opencv +boolean parent Had Big Change PARAMETER N V NM N Java elasticsearch +List parent Pairs FUNCTION NM NPL Java corenlp +Attribute parse Attribute Def FUNCTION V NM N Java antlr4 +long parse Expires FUNCTION V NPL Java okhttp +ParseJobPrivate Parse Job Private CLASS NM N NM C++ kdevelop +long parse Max Age FUNCTION V NM N Java okhttp +List parserErrors parser Errors ATTRIBUTE NM NPL Java junit4 +bool parsing PARAMETER V C++ calligra +PartDocumentPrivate Part Document Private CLASS NM N NM C++ kdevelop +int parts Size DECLARATION NM N Java elasticsearch +MachineInstContainer Pass Machine Instructions PARAMETER NM NM NPL C ogre +string passphrase DECLARATION N C++ openFrameworks +PatchCoordBuffer Patch Coord Buffer CLASS NM NM N C++ blender +QString path With Native Separators FUNCTION N P NM NPL C kdevelop +PatternsAnnotations Patterns Annotations CLASS NM NPL Java corenlp +Real pd Sum DECLARATION NM N C++ QuantLib +char pem key PARAMETER NM N C++ grpc +bool pen Loaded PARAMETER N V C++ calligra +PhysicsClientExample Physics Client Example CLASS NM NM N C++ bullet3 +int pi Hash PARAMETER PRE N C Telegram +c8 pickup ATTRIBUTE N C irrlicht +PiecewiseConstantAbcdVariance Piecewise Constant Abcd Variance CLASS NM NM NM N C++ QuantLib +s32 pixel Width ATTRIBUTE NM N C irrlicht +string[] platform String DECLARATION NM N C++ openFrameworks +void png do strip channel FUNCTION NM V V N C irrlicht +PNGAPI png get row bytes FUNCTION NM V NM NPL C irrlicht +PNGAPI png get rows FUNCTION NM V NPL C irrlicht +PNGAPI png get unknown chunks FUNCTION NM V NM NPL C irrlicht +void png init palette transformations FUNCTION NM V NM NPL C irrlicht +void png read IDAT data FUNCTION NM V NM N C irrlicht +void pnm load raw pfm FUNCTION NM V NM N C gimp +Point3_ Point 3 CLASS N D C++ opencv +int point index PARAMETER NM N C blender +PointerMap Pointer Map CLASS NM N C++ grpc +String polling Log PARAMETER NM N Java jenkins +PostFile Post File CLASS NM N Java okhttp +PostScriptDocument Post Script Document CLASS NM NM N Java antlr4 +Map pre Map PARAMETER P N Java corenlp +PredicateWrapper Predicate Wrapper CLASS NM N C++ blender +String PREF FONT ATTRIBUTE NM N Java corenlp +aiVector3D present Scaling DECLARATION NM N C++ openFrameworks +void presentation Start From First FUNCTION N V P NM C++ calligra +int prev num hooks DECLARATION NM NM NPL C++ grpc +IntervalSet prev Property ATTRIBUTE NM N Java antlr4 +int prev Signal Bar Count DECLARATION NM NM NM N C++ Telegram +String previous Caption ATTRIBUTE NM N Java Telegram +QModelIndex previous Index PARAMETER NM N C++ calligra +Real previous Initial Value PARAMETER NM NM N C++ QuantLib +bool previous Is Valid PARAMETER N V NM C++ calligra +int previous Stream Id DECLARATION NM NM N Java okhttp +D3D11PRIMITIVETOPOLOGY prim Type DECLARATION NM N C++ ogre +PrintEvents Print Events CLASS NM NPL Java okhttp +int print Features Up to ATTRIBUTE V NPL VM P Java corenlp +PrintLabelFlag 
print label flag PARAMETER NM NM N C++ opencv +boolean print t PARAMETER V N C gimp +ProblemReporterFactory Problem Reporter Factory CLASS NM NM N C++ kdevelop +Process Process CLASS N C++ gimp.idents +sharedptr process Helper FUNCTION NM N C++ QuantLib +Features processing Level DECLARATION NM N C++ kdevelop +Bool progress out DECLARATION V N C irrlicht +bool progressive PARAMETER N C++ blender +ProjectControllerPrivate Project Controller Private CLASS NM N NM C++ kdevelop +QString project file PARAMETER NM N C++ kdevelop +Path projects Dir PARAMETER NM N C++ kdevelop +List promises PARAMETER NPL Java okhttp +String pronoun PARAMETER N Java corenlp +GimpColorProfile proof profile ATTRIBUTE NM N C gimp +IntervalSet property Interval Set PARAMETER NM NM N Java antlr4 +int provider Code DECLARATION NM N Java Telegram +PublishResponse Publish Response CLASS NM N Java elasticsearch +void push Reset Later FUNCTION V N VM Java okhttp +PutWatchRequest put Watch Request PARAMETER NM NM N Java elasticsearch +PyObject pybullet compute View Matrix FUNCTION PRE V NM N C bullet3 +sendrequest q tail ATTRIBUTE NM N C grpc +QRDetect QR Detect CLASS NM N C++ opencv +Quad Quad CLASS N C++ blender +long query Timeout In Ms PARAMETER NM N P NPL Java elasticsearch +GrammarAST question AST PARAMETER NM N Java antlr4 +Quote Quote CLASS N C++ QuantLib +RangeInRevision r PARAMETER N C++ kdevelop +camhdr r hdr DECLARATION NM N C openFrameworks +float radius PARAMETER N C gimp +sharedptr random Walk PARAMETER NM N C++ QuantLib +void rate Pointer PARAMETER NM N C++ bullet3 +Object raw Arguments ATTRIBUTE NM NPL Java mockito +List raw Extra Interfaces DECLARATION NM NM NPL Java mockito +GPUVertBufRaw raw nor DECLARATION NM N C blender +SILValue RC Identity ATTRIBUTE NM N C swift +float rcp len 2 DECLARATION NM N D C bullet3 +ReadBitstream Read Bitstream CLASS V N C++ opencv +void read Element Text Span FUNCTION V NM NM N C++ calligra +ReadBufferOperation read Operation PARAMETER NM N C++ blender +void read Pass FUNCTION V N C++ irrlicht +List read Response FUNCTION V N Java okhttp +FLACbool read subframe FUNCTION V N C Telegram +char Read Text File FUNCTION V NM N C++ ogre +void read White Space FUNCTION V NM N Java corenlp +int reader Flags PARAMETER NM NPL Java mockito +void reapply Filter FUNCTION V N C++ calligra +String received Token Signature DECLARATION NM NM N Java jenkins +ReconstructUpdateCallback Reconstruct Update Callback CLASS NM NM N C++ blender +RecordHeader Record Header CLASS NM N C++ calligra +int recorded Matchers Size DECLARATION NM NM N Java mockito +u32 rectangle Index DECLARATION NM N C++ irrlicht +JSONObject reduced Json DECLARATION NM N Java jenkins +int reduction Indices ATTRIBUTE NM NPL C++ opencv +QPointF ref Point Offset Percent ATTRIBUTE NM NM NM N C++ calligra +Map referee Set Map PARAMETER NM NM N Java jenkins +List reference Index Meta Datas ATTRIBUTE NM NM NM NPL Java elasticsearch +SourceRange Reference Range PARAMETER NM N C++ swift +String REFRESH INTERVAL IN MILLIS ATTRIBUTE NM N P NPL Java elasticsearch +MeanMetric refresh Metric PARAMETER NM N Java elasticsearch +RefutablePatternInitialization Refutable Pattern Initialization CLASS NM NM N C++ swift +void register With Volatility Spread FUNCTION V P NM N C++ QuantLib +f32 relative contrast PARAMETER NM N C++ irrlicht +Int32 rem F DECLARATION NM N C irrlicht +void remap Nearest Neighbor FUNCTION V NM N C++ opencv +Set remote Cluster Names PARAMETER NM NM NPL Java elasticsearch +RemotingDiagnostics Remoting Diagnostics CLASS NM 
NPL Java jenkins +void remove Imported Parent Contexts FUNCTION V NM NM NPL C++ kdevelop +void render result exr file end FUNCTION V NM NM NM N C blender +void Render Text FUNCTION V N C++ bullet3 +auto REPL Module DECLARATION NM N C++ swift +StringSet Replace Text Context ATTRIBUTE NM NM N C++ swift +sizet Replacement Length ATTRIBUTE NM N C swift +void report No Setter Found FUNCTION V DT N V Java mockito +void repress Ref At Loc FUNCTION V N P N C++ swift +uint8t request bytes DECLARATION NM NPL C++ grpc +Map requested Plugins DECLARATION NM NPL Java jenkins +void require Client Auth FUNCTION V NM N Java okhttp +int res Width PARAMETER NM N C++ openFrameworks +vector resamplers ATTRIBUTE NPL C++ Telegram +SILFunction Reserve Fn PARAMETER NM N C++ swift +void reset Meta Class Cache FUNCTION V NM NM N Java jenkins +void Reset Token Stats FUNCTION V NM NPL C Telegram +void resize Linear Open CV FUNCTION V NM NM N C++ opencv +void resize Nearest Neighbor FUNCTION V NM N C++ opencv +ResolvedFailedException Resolved Failed Exception CLASS NM NM N Java jenkins +ResponseHandlers Response Handlers CLASS NM NPL Java elasticsearch +RestClient Rest Client CLASS NM N C++ openFrameworks +boolean resume PARAMETER V Java Telegram +Predicate retain Function PARAMETER NM N Java corenlp +restrict rets PARAMETER NPL C blender +bool return Path PARAMETER NM N C++ irrlicht +Object returned Value ATTRIBUTE NM N Java mockito +ReturnsEmptyValues Returns Empty Values CLASS V NM NPL Java mockito +Answer RETURNS SELF ATTRIBUTE V N Java mockito +void rgb 2 rgb565 FUNCTION N P N C++ opencv +guchar[] rgb real DECLARATION NM N C gimp +void rgbx 2 bgrx FUNCTION N P N C++ opencv +Real risky Annuity ATTRIBUTE NM N C++ QuantLib +RiskyBond Risky Bond CLASS NM N C++ QuantLib +RollingFrictionDemo Rolling Friction Demo CLASS NM NM N C++ bullet3 +PointerRNA root ptr DECLARATION NM N C blender +Vector rot axis PARAMETER NM N C++ blender +gchar rotate desc ATTRIBUTE NM N C gimp +vector3df rotation Per Second PARAMETER N P N C++ irrlicht +ExtendedBounds rounded Bounds DECLARATION NM NPL Java elasticsearch +int row limit ATTRIBUTE NM N C++ grpc +RSComputeOperation RS Compute Operation CLASS NM NM N C++ ogre +RSStencilOperation RS Stencil Operation CLASS NM NM N C++ ogre +ofRtAudioSoundStream rt Stream Ptr DECLARATION NM NM N C++ openFrameworks +RtmSession rtm Session DECLARATION NM N Java okhttp +RuleMemberValidator Rule Member Validator CLASS NM NM N Java junit4 +Set rule Options ATTRIBUTE NM NPL Java antlr4 +RulePropertyRef_ctx Rule Property Ref ctx CLASS NM NM NM N Java antlr4 +RulePropertyRef_start Rule Property Ref start CLASS NM NM NM N Java antlr4 +RuleVersionAttribute Rule Version Attribute CLASS NM NM N C# antlr4 +List rules Of New Chain DECLARATION N P NM N Java junit4 +RunAfterParams Run After Params CLASS V NM NPL Java junit4 +RunBeforeParams Run Before Params CLASS V NM NPL Java junit4 +String RUN DIST CMD PROP ATTRIBUTE NM NM NM N Java corenlp +void run Methods FUNCTION V NPL Java junit4 +ParametersRunnerFactory runner Factory PARAMETER NM N Java junit4 +Runner runner Override ATTRIBUTE NM N Java junit4 +RunnerScheduler Runner Scheduler CLASS NM N Java junit4 +Real running Log Average DECLARATION NM NM N C++ QuantLib +Object runtime Mx Bean ATTRIBUTE NM NM N Java junit4 +string s Tracking System Name DECLARATION PRE NM NM N C++ bullet3 +TerrainLayerSamplerList samplers ATTRIBUTE NPL C ogre +QString sanitize Path FUNCTION V N C++ kdevelop +Dst saturated cast FUNCTION NM N C Telegram +void save As Quadratic Png 
FUNCTION V P NM N C++ calligra +bool save Dual Cells PARAMETER V NM NPL C ogre +char scene Node Type Name PARAMETER NM NM NM N C++ irrlicht +double[] score Pos Prev DECLARATION NM N NM Java corenlp +QHash script Event Action Factories ATTRIBUTE NM NM NM NPL C++ calligra +ScrollIdForNode Scroll Id For Node CLASS NM N P N Java elasticsearch +Map search Profile Results PARAMETER NM NM NPL Java elasticsearch +GimpHueRange secondary range PARAMETER NM N C gimp +SegmenterCoreAnnotations Segmenter Core Annotations CLASS NM NM NPL Java corenlp +SeiReader Sei Reader CLASS NM N Java Telegram +int selected Account ATTRIBUTE NM N Java Telegram +GeglRectangle selection bounds DECLARATION NM NPL C gimp +GtkWidget selection width label ATTRIBUTE NM NM N C gimp +void send Serial Config FUNCTION V NM N C++ openFrameworks +void send String FUNCTION V N Java openFrameworks +List sentence List DECLARATION NM N Java corenlp +QColor separator Color PARAMETER NM N C++ calligra +SerializedForm Serialized Form CLASS NM N Java junit4 +char server list PARAMETER NM N C++ grpc +ServerSafeHandle Server Safe Handle CLASS NM NM N C# grpc +KConfigGroup session Config FUNCTION NM N C++ kdevelop +OAuthSessionFactory session Factory ATTRIBUTE NM N Java okhttp +char session ticket key ATTRIBUTE NM NM N C grpc +void Set Add Faces Points FUNCTION V V NM NPL C bullet3 +Builder set Canonical Mention Begin FUNCTION V NM NM N Java corenlp +void set Custom Uniform 1 f FUNCTION V NM N D NM C++ openFrameworks +clint set Destructor Callback FUNCTION V NM N C++ opencv +void set Display Index FUNCTION V NM N C++ ogre +CreationSettings set Extra Interfaces FUNCTION V NM NPL Java mockito +void set Frame Pen FUNCTION V NM N C++ calligra +void set Invert FUNCTION V N C++ antlr4 +void set Layer Texture Name FUNCTION V NM NM N C++ ogre +Action set Prev Ctx Action DECLARATION NM NM NM N Java antlr4 +void set Project Naming Strategy FUNCTION V NM NM N Java jenkins +Method set Protocol Method PARAMETER NM NM N Java okhttp +void set Tiling FUNCTION V N C++ ogre +SettingManager Setting Manager CLASS NM N C++ irrlicht +ofSoundStreamSettings settings PARAMETER NPL C++ openFrameworks +void setup Bounding Box Vertices FUNCTION V NM NM NPL C++ ogre +sha2void sha256 hash FUNCTION NM N C++ irrlicht +void shader data to shader globals FUNCTION NM NPL P NM NPL C++ blender +ShiftReduceTrainOptions Shift Reduce Train Options CLASS NM NM NM NPL Java corenlp +vector shift Values ATTRIBUTE NM NPL C++ QuantLib +bool show tags ATTRIBUTE V NPL C++ blender +SiblingAlignInfo Sibling Info PARAMETER NM N C++ swift +void silk bw expander 32 FUNCTION NM NM N D C Telegram +auto simple Fn Ty DECLARATION NM NM N C++ swift +SimplePressure Simple Pressure CLASS NM N C++ gimp.idents +void simulate GC FUNCTION V N Java corenlp +float sin rot w DECLARATION NM NM N C++ ogre +SinglePeriodTimeline Single Period Timeline CLASS NM NM N Java Telegram +SINH SINH CLASS N Java elasticsearch +ImVec2 size contents PARAMETER NM NPL C++ bullet3 +long size Guess DECLARATION NM N Java jenkins +ImVec2 size on first use PARAMETER N P NM N C bullet3 +int size Per Span DECLARATION N P N Java Telegram +SKEditorConsumer SK Editor Consumer CLASS NM NM N C++ swift +boolean skip Vetoes ATTRIBUTE NM NPL Java jenkins +boolean skip Whitespace And Commas FUNCTION V N CJ NPL Java okhttp +SlackClient Slack Client CLASS NM N Java okhttp +DataType sort Field Data Type ATTRIBUTE NM NM NM N Java elasticsearch +int sorted Indices Buf PARAMETER NM NM N C++ opencv +uint32t source index DECLARATION NM N C 
openFrameworks +auto source Ty DECLARATION NM N C++ swift +Optional speaker FUNCTION N Java corenlp +int16t speech in PARAMETER NM N C Telegram +gint spline max len ATTRIBUTE NM NM N C gimp +int split Argument List FUNCTION V NM N Java antlr4 +array Sprites ATTRIBUTE NPL C++ irrlicht +int sqlite3 session config FUNCTION PRE N V C Telegram +int sqlite3 Walk Expr List FUNCTION PRE V NM N C Telegram +QFileInfo src File Info DECLARATION NM NM N C++ kdevelop +QGradient src Gradient PARAMETER NM N C++ calligra +int src start idx DECLARATION NM NM N C++ opencv +grpcchannelcredentials ssl creds DECLARATION NM NPL C++ grpc +STGroup st lib ATTRIBUTE NM N Java antlr4 +stack stack ATTRIBUTE N C++ grpc +QList start Dirs PARAMETER NM NPL C++ kdevelop +void start Matched Count Dec FUNCTION V NM N V Java corenlp +int Start Slot PARAMETER NM N C++ ogre +Real start Up Fix Cost PARAMETER NM NM NM N C++ QuantLib +Map state To Grammar Region Map ATTRIBUTE N P NM NM N Java antlr4 +Statement statement ATTRIBUTE N Java junit4 +rect static Rect DECLARATION NM N C++ irrlicht +File status File DECLARATION NM N Java antlr4 +File status File PARAMETER NM N Java antlr4 +QIcon status Icon FUNCTION NM N C++ kdevelop +int Step No ATTRIBUTE NM N C bullet3 +Real step Size DECLARATION NM N C++ QuantLib +int step x DECLARATION NM N C++ opencv +int stmt Close FUNCTION N V C Telegram +StochasticProcess Stochastic Process CLASS NM N C++ QuantLib +Store Store CLASS N Java elasticsearch +QStringList str args DECLARATION NM NPL C++ kdevelop +sizet str array len FUNCTION NM NM N C++ bullet3 +Headers stream Headers DECLARATION NM NPL Java okhttp +float strength PARAMETER N C blender +StrictnessSelector Strictness Selector CLASS NM N Java mockito +GrammarAST strip Left Recursion FUNCTION V NM N Java antlr4 +PyObject Stroke Attribute alpha get FUNCTION NM NM N V C++ blender +StructType Struct Ty ATTRIBUTE NM N C++ swift +StubbingComparator Stubbing Comparator CLASS NM N Java mockito +List stubbingLookupListeners stubbing Lookup Listeners ATTRIBUTE NM NM NPL Java mockito +SUTime SU Time CLASS NM N Java corenlp +vector Sub Module Name Visibility Pairs ATTRIBUTE NM NM NM NM NPL C++ swift +constiterator sub start DECLARATION NM N C++ openFrameworks +String subroutine Slot Name PARAMETER NM NM N C++ ogre +SuiteMethod Suite Method CLASS NM N Java junit4 +SuiteMethodBuilder Suite Method Builder CLASS NM NM N Java junit4 +RealMethod super Method PARAMETER NM N Java mockito +Class supplier Class PARAMETER NM N Java junit4 +XIMStyle supported Style DECLARATION NM N C++ irrlicht +bool suppress File PARAMETER V N C++ ogre +int sz Joint Ranges DECLARATION NM NM NPL C bullet3 +QStyleOptionTab tab Overlap DECLARATION NM N C++ kdevelop +gint table 2 id PARAMETER N D N C gimp +QSet tagged Resources DECLARATION NM NPL C++ calligra +ATNState target PARAMETER N C++ antlr4 +char target chars DECLARATION NM NPL C grpc +void target Started FUNCTION N V Java jenkins +TaskImpl Task Impl CLASS NM N Java jenkins +uint32t tbl index PARAMETER NM N C++ grpc +Object tcp Slave Agent Listener Lock ATTRIBUTE NM NM NM NM N Java jenkins +int TEGRA MORPH INIT FUNCTION PRE N V C++ opencv +DeclAttributes temp Attrs DECLARATION NM NPL C++ swift +TempCompMask Temp Comp Mask CLASS NM NM N C++ gimp.idents +TemperatureCauchy1D Temperature Cauchy 1 D CLASS NM N D NM C++ QuantLib +TemplatePreviewIconData Template Preview Icon Data CLASS NM NM NM NPL C++ kdevelop +TemporaryFolder Temporary Folder CLASS NM N Java junit4 +void tessellate To Mesh FUNCTION V P N C++ openFrameworks +int 
tex ID PARAMETER NM N Java openFrameworks +stringt text Chopped 2 DECLARATION N NM D C swift +TextPaintView Text Paint View CLASS NM NM N Java Telegram +InvocationOnMock the Invocation PARAMETER DT N Java mockito +AssignExpr Then ATTRIBUTE N C++ swift +int thread Array Size DECLARATION NM NM N Java junit4 +void throw Provision Exception If Errors Exist FUNCTION V NM N CJ NPL V Java elasticsearch +double tick Freq PARAMETER NM N C++ opencv +uint32 TIFF Current Tile FUNCTION NM NM N C opencv +TIFFSizeProc TIFF Get Size Proc FUNCTION NM V NM N C opencv +TileParameterDefaultTypeInternal Tile Parameter Default Type Internal CLASS NM NM NM N NM C++ opencv +float time range PARAMETER NM N C blender +TimeSignalCommand Time Signal Command CLASS NM NM N Java Telegram +Timespec time spec ATTRIBUTE NM N C# grpc +Timelapser Timelapser CLASS N C++ opencv +Cancellable timeout Task PARAMETER NM N Java elasticsearch +int times To Append Last Matcher PARAMETER NPL P V NM N Java mockito +vector tlv Symbols ATTRIBUTE NM NPL C++ swift +float[] tmp vec DECLARATION NM N C blender +ToStringWalker To String Walker CLASS P N N C++ grpc +double[] to XYZ DECLARATION P N C bullet3 +JSONObject token Data DECLARATION NM N Java jenkins +TokenPropertyRef Token Property Ref CLASS NM NM N Java antlr4 +TokenPropertyRef_channel Token Property Ref channel CLASS NM NM NM N Java antlr4 +Map token Store Typed Data DECLARATION NM NM NM N Java jenkins +TokenTypeDecl Token Type Decl CLASS NM NM N Java antlr4 +String token Type S DECLARATION NM NM N Java antlr4 +Token token Within Action PARAMETER N P N Java antlr4 +Position tool View Position FUNCTION NM NM N C++ kdevelop +bool Top Dir PARAMETER NM N C++ grpc +TopNGramRecord Top NGram Record CLASS NM NM N Java corenlp +int totchannel tot channel ATTRIBUTE NM N C blender +sizet tot elem ATTRIBUTE NM N C blender +CounterMetric total Merge Throttled Time ATTRIBUTE NM NM NM N Java elasticsearch +CommodityUnitCost trade Price ATTRIBUTE NM N C++ QuantLib +Builder training Examples FUNCTION NM NPL Java corenlp +Real tranched Loss After DECLARATION NM N P C++ QuantLib +PathInfo transform Path FUNCTION V N C++ QuantLib +Affine3 transform Unique Id DECLARATION NM NM N C++ ogre +TransportShardRefreshAction Transport Shard Refresh Action CLASS NM NM NM N Java elasticsearch +TreeElement Tree Element CLASS NM N C++ blender +TreePostScriptGenerator Tree Post Script Generator CLASS NM NM NM N Java antlr4 +bool trim Parse Trees DECLARATION V NM NPL C# antlr4 +TsurgeonParseException Tsurgeon Parse Exception CLASS NM NM N Java corenlp +Treebank tune Treebank DECLARATION NM N Java corenlp +TupleLValueEmitter Tuple LValue Emitter CLASS NM NM N C++ swift +void two Factor Response FUNCTION NM NM N C++ kdevelop +char txt alias DECLARATION NM N C grpc +int TYPE TEST RULE ATTRIBUTE NM NM N Java junit4 +byte TYPE WINDOW UPDATE ATTRIBUTE N NM NM Java okhttp +T[] typed Array PARAMETER NM N Java mockito +int ui Index DECLARATION NM N C++ ogre +BytesReference uncompress If Needed FUNCTION V CJ V Java elasticsearch +Handle underlying Fx Correlation PARAMETER NM NM N C++ QuantLib +Money undiscounted Amount PARAMETER NM N C++ QuantLib +String UNKNOWN USERNAME ATTRIBUTE NM N Java jenkins +void unload Textures FUNCTION V NPL C++ openFrameworks +void unpack texture Blend Func FUNCTION V NM NM N C irrlicht +requestmatcher unregistered request matcher ATTRIBUTE NM NM N C++ grpc +SoloFilePathFilter UNRESTRICTED ATTRIBUTE NM Java jenkins +GeglRectangle update area DECLARATION NM N C gimp +void update Mouse Pos FUNCTION V 
NM N C++ calligra +HttpUrl url From Json FUNCTION N P N Java okhttp +int usage PARAMETER N C++ openFrameworks +bool use Atm Spread ATTRIBUTE V NM N C++ QuantLib +bool use mat dirs DECLARATION V NM NPL C++ irrlicht +bool use Shadows 1 PARAMETER V NPL D C++ bullet3 +bool use Tabs PARAMETER V NPL C++ kdevelop +User2InternalIndex User 2 Internal Index CLASS N P NM N C++ bullet3 +void user hook 3 ATTRIBUTE NM N D C blender +long utc Timestamp Ms DECLARATION NM NM NPL Java Telegram +sizet utf8 size PARAMETER NM N C++ grpc +Guid uuid ATTRIBUTE N C# antlr4 +float v proj axis DECLARATION NM NM N C blender +StringTokenizer v Tok DECLARATION NM N Java elasticsearch +void V URL Encode FUNCTION PRE N V C++ bullet3 +uint val 32 ATTRIBUTE N D C++ opencv +sizet Val Size DECLARATION NM N C++ grpc +void validate Class Rules FUNCTION V NM NPL Java junit4 +u32 validate On PARAMETER V P C++ irrlicht +void validate Public Static Void Methods FUNCTION V NM NM NM NPL Java junit4 +List validator Strategies ATTRIBUTE NM NPL Java junit4 +Value Value CLASS N C++ Telegram +String value Count String DECLARATION NM NM N Java okhttp +QHash value Hash ATTRIBUTE NM N C++ calligra +ValueLabel Value Label CLASS NM N Java corenlp +Real value X DECLARATION NM N C++ QuantLib +VanillaForwardPayoff Vanilla Forward Payoff CLASS NM NM N C++ QuantLib +Object[] var Args DECLARATION NM NPL Java mockito +vector variables ATTRIBUTE NPL C++ QuantLib +Set vary Fields DECLARATION NM NPL Java okhttp +VerificationModeFactory Verification Mode Factory CLASS NM NM N Java mockito +VerificationOverTimeImpl verification Over Time DECLARATION N P N Java mockito +void verification Started FUNCTION N V Java mockito +VerificationStrategy verification Strategy ATTRIBUTE NM N Java mockito +emailkeymapping verifier get mapping FUNCTION N V N C++ grpc +VertexPosition Vertex Position CLASS NM N C++ bullet3 +VideoCapture_DShow Video Capture DShow CLASS NM NM N C++ opencv +freenectchunkcb video chunk cb ATTRIBUTE NM NM N C openFrameworks +void visit SIL Argument FUNCTION V NM N C++ swift +VoronoiFractureDemo Voronoi Fracture Demo CLASS NM NM N C++ bullet3 +int w Width ATTRIBUTE NM N C++ ogre +long wait Until DECLARATION V P Java jenkins +Set waiting List PARAMETER NM N Java jenkins +int want x PARAMETER V N C openFrameworks +WarningFailureException Warning Failure Exception CLASS NM NM N C++ openFrameworks +s32 wat id DECLARATION NM N C irrlicht +AtomicInteger weak Ref Lost ATTRIBUTE NM NM N Java jenkins +WebSocketListener Web Socket Listener CLASS NM NM N Java okhttp +vector weights Multipliers ATTRIBUTE NM NPL C++ opencv +StringPiece whole regexp ATTRIBUTE NM N C++ grpc +void widget Destroyed FUNCTION N V C++ kdevelop +boolean will Return Last Parameter PARAMETER V V NM N Java mockito +int win Error DECLARATION NM N C++ ogre +Challenge with Charset FUNCTION P N Java okhttp +MakeCms with Cms Leg Rule FUNCTION P NM NM N C++ QuantLib +ModelSettings with Market Rate Accuracy FUNCTION P NM NM N C++ QuantLib +Settings with Rate Bound FUNCTION P NM N C++ QuantLib +MockResponse with Web Socket Upgrade FUNCTION P NM NM N Java okhttp +gboolean within vertically DECLARATION P VM C gimp +WordLemmaTag word Lemma Tag PARAMETER NM NM N Java corenlp +void worker Thread Wait FUNCTION NM N V C++ bullet3 +WorkspaceFileMask Workspace File Mask CLASS NM NM N Java jenkins +vec3 world To Screen FUNCTION N P N C++ openFrameworks +int worst score DECLARATION NM N C opencv +RunListener wrap If Not Thread Safe FUNCTION V CJ VM NM N Java junit4 +WrapperType wrapped Verification 
PARAMETER NM N Java mockito +Class wrapper Class DECLARATION NM N Java junit4 +WriteContext Write Context CLASS NM N C++ grpc +void write Node Materials FUNCTION V NM NPL C++ irrlicht +int write Root PARAMETER V N C++ irrlicht +void write tcp data FUNCTION V NM N C grpc +WSDLSSolver WSDLS Solver CLASS NM N C++ blender +btScalar x 2 DECLARATION N D C++ bullet3 +float x Distance DECLARATION NM N Java openFrameworks +Atom X dnd Type List ATTRIBUTE NM NM NM N C blender +short x origin ATTRIBUTE NM N C bullet3 +f32 X scale ATTRIBUTE NM N C irrlicht +int x Tilt FUNCTION NM N C++ calligra +Real y In PARAMETER NM N C++ ogre +S32 ya Bottom ATTRIBUTE NM N C calligra +SmallVectorImpl yield MVs PARAMETER NM NPL C++ swift +freenectzeroplaneinfo z p i PARAMETER NM NM N C openFrameworks +double[] z probs PARAMETER NM NPL Java corenlp +Real z weight ATTRIBUTE NM N C++ QuantLib +int zero plane res ATTRIBUTE NM NM N C openFrameworks +int[] zoom x y PARAMETER NM N N C blender +T a PARAMETER N C++ drill +char a 0 PARAMETER N D C rigraph +int a 3 DECLARATION N D C toggldesktop +int a cap PARAMETER NM N C rigraph +int a Change PARAMETER PRE N C ccv +asn1_ctx_t a ctx PARAMETER NM N C wireshark +int a len PARAMETER NM N C naemon-core +u8 a light PARAMETER DT N C++ freeminer +int a low PARAMETER NM N C rigraph +u8 a Old Record 1 PARAMETER PRE NM N D C ccv +sqlite3_value** a Replace PARAMETER PRE V C ccv +Throwable a Throwable PARAMETER DT NM Java Spark +long a time PARAMETER NM N Java drill +ovsdb_type a type PARAMETER NM N C ovs +Void a Void PARAMETER DT N Java immutables +bool above Base PARAMETER P N C++ proxygen +class Abstract SV 2 Copier CLASS NM N D N Java drill +void add before forward FUNCTION V P N C++ caffe +customvariablesmember* add custom variable to service FUNCTION V NM N P N C naemon-core +void add Menu For List Nodes FUNCTION V N P NM NPL Java Spark +bool add no exist PARAMETER V DT V C++ s3fs-fuse +bool add no truncate cache PARAMETER V DT NM N C++ s3fs-fuse +int add parent to host FUNCTION V N P N C naemon-core +int add temp to args DECLARATION V N P NPL C weechat +void adjust to camera FUNCTION V P N C++ panda3d +uLong adler 1 DECLARATION N D C mgba +long after DECLARATION P Java drill +auto after 1 DECLARATION P D C++ meta +auto after 2 DECLARATION P D C++ meta +boolean after Equals DECLARATION P N Java Openfire +void after Filters Closed FUNCTION P NPL V Java Smack +boolean after First Batch ATTRIBUTE P NM N Java drill +request after handle DECLARATION P N C crow +void after Join Send History FUNCTION P V V N Java Openfire +void after Last FUNCTION P DT Java drill +boolean after Last Row ATTRIBUTE P NM N Java drill +ClassToInstanceMap after Processing ATTRIBUTE P V Java immutables +gboolean after release ATTRIBUTE P N C wireshark +ClassToInstanceMap after Round ATTRIBUTE P N Java immutables +ebb_after_write_cb after write cb ATTRIBUTE P V N C ccv +AfterXStanzas after X Stanzas ATTRIBUTE P D NPL Java Smack +off_t alias off ATTRIBUTE NM N C ccv +Set all Annotated Elements FUNCTION DT NM NPL Java immutables +EnumSet all Casts DECLARATION DT NPL Java drill +boolean all Cols Indexed PARAMETER DT NPL V Java drill +Map all Drill bits DECLARATION DT NM NPL Java drill +vector all Errors DECLARATION DT NPL C++ toggldesktop +List all Exprs DECLARATION DT NPL Java drill +Set all Fields ATTRIBUTE DT NPL Java drill +boolean all Final PARAMETER DT N Java immutables +List all Labels ATTRIBUTE DT NPL Java deeplearning4j +List all Methods PARAMETER DT NPL Java cglib +Set all Metrics DECLARATION DT NPL Java 
drill +ImmutableList all Mirrors FUNCTION DT NPL Java immutables +String all Names PARAMETER DT NPL Java cglib +Set all New Schema Paths FUNCTION DT NM NM NPL Java drill +List all Open Workspaces FUNCTION DT NM NPL Java deeplearning4j +Iterable all Options DECLARATION DT NPL Java drill +List all Pools PARAMETER DT NPL Java drill +boolean all Procedures Are Callable FUNCTION DT NPL V NM Java drill +String all Projects ATTRIBUTE DT NPL Java deeplearning4j +List all Room Names DECLARATION DT NM NPL Java Openfire +Map all Service Response PARAMETER DT NM N Java drill +gboolean all set DECLARATION DT N C wireshark +bool all space after DECLARATION DT N P C crow +bool all space before DECLARATION DT N P C crow +KeyStore all Store ATTRIBUTE DT N Java Spark +void all Streams Finished FUNCTION DT NPL V Java drill +int* all synced FUNCTION DT V C ovs +bool all Users PARAMETER DT NPL C++ facebook-repo-ds2 +ELoginRegister allow login or register ATTRIBUTE V V CJ V C freeminer +int among PARAMETER P C++ rigraph +i64 an Size PARAMETER NM N C ccv +checkout_conflictdata ancestor out PARAMETER NM N C git2r +Object and DECLARATION CJ Java immutables +DruidFilter and Filter At Index FUNCTION NM N P N Java drill +AndNode and Node PARAMETER NM N Java drill +byte another ID PARAMETER DT N Java Openfire +bool any diffuse ATTRIBUTE DT N C panda3d +primitive any hidden PARAMETER DT NM C++ panda3d +String APPLICATION INFO 2 ATTRIBUTE NM N D Java Spark +class are Unities in Shape CLASS V NPL P N Java deeplearning4j +Control arg 1 PARAMETER N D Java Openfire +DoublePointer arg 18 PARAMETER N D Java deeplearning4j +long arg 26 PARAMETER N D Java deeplearning4j +long arg 29 PARAMETER N D Java deeplearning4j +long arg 31 PARAMETER N D Java deeplearning4j +boolean as Array ATTRIBUTE P N Java drill +DCArrayParameter* as array parameter FUNCTION P NM N C++ panda3d +int as binary ATTRIBUTE P N C rigraph +BoundingBox* as bounding box FUNCTION P NM N C++ panda3d +char* as C String FUNCTION P NM N C++ freeminer +AnnotationMirror as Caching FUNCTION P V Java immutables +Object as Diamond DECLARATION P N Java immutables +double[] as Double FUNCTION P N Java deeplearning4j +DCField* as field FUNCTION P N C++ panda3d +git_diff_file as file PARAMETER P N C git2r +Function as Function ATTRIBUTE P N Java guava +CPPFunctionType* as function type FUNCTION P NM N C++ panda3d +ASIdentifiers as id PARAMETER NM N C toggldesktop +int as in ATTRIBUTE P P C rigraph +Int64 as Int64 FUNCTION P N C++ toggldesktop +long as Long DECLARATION P N Java guava +MapWriter as Map FUNCTION P N Java drill +DCMolecularField* as molecular field FUNCTION P NM N C++ panda3d +double as of PARAMETER P P C panda3d +void* as pointer FUNCTION P N C++ panda3d +ByteBuf as Read Only FUNCTION P NM VM Java drill +DCSimpleParameter* as simple parameter FUNCTION P NM N C++ panda3d +List as Sorted Entry List FUNCTION P NM NM N Java Singularity +String as String DECLARATION P N Java immutables +List as Stripes DECLARATION P NPL Java guava +Expression as Transform Generator Transform FUNCTION P NM NM N Java immutables +V as V DECLARATION P N Java guava +Var as Var FUNCTION P N C++ toggldesktop +int as warning PARAMETER P N C libxo +int as within ATTRIBUTE P P C rigraph +Writer as Writer FUNCTION P N Java guava +boolean at Least One Write ATTRIBUTE P DT D V Java drill +R at Most FUNCTION P DT Java immutables +int atalk len FUNCTION N NM C wireshark +int b 1 Index PARAMETER N D N Java drill +int B 1110 ATTRIBUTE N D Java deeplearning4j +byte b 3 PARAMETER N D Java guava 
+Indexer b float 16 Indexer DECLARATION PRE N D N Java deeplearning4j +double b float 16 To Double FUNCTION PRE N D P N Java deeplearning4j +bool b Force 16 bpp PARAMETER PRE V D N C++ panda3d +int b next DECLARATION N DT C git2r +svec b only PARAMETER N VM C ovs +int b Stat 1 ATTRIBUTE PRE N D C ccv +uint64_t bad only DECLARATION NM VM C toxcore +class Base Level 1 CLASS NM N D Java deeplearning4j +class Base Level 3 CLASS NM N D Java deeplearning4j +uint8_t be32 ofs PARAMETER NM N C ovs +long before DECLARATION P Java drill +void before Execute FUNCTION P V Java cglib +boolean before First ATTRIBUTE P DT Java drill +int before major ATTRIBUTE P N C++ panda3d +int before minor ATTRIBUTE P N C++ panda3d +off_t behind rem start DECLARATION P NM N C++ s3fs-fuse +off_t behind size PARAMETER P N C++ s3fs-fuse +off_t behind start PARAMETER P N C++ s3fs-fuse +LVecBase4 bi 0 DECLARATION N D C++ panda3d +class Bind 2 Module CLASS V D N Java Smack +int bit Field 0 ATTRIBUTE NM N D Java drill +ssize_t bit off DECLARATION NM N C++ facebook-repo-ds2 +int bits per pixel ATTRIBUTE NPL P N C panda3d +map blob name to last top idx DECLARATION NM N P DT NM N C++ caffe +String BLOCK CONTACT 16 x 16 ATTRIBUTE NM N D P D Java Spark +u32 block count all DECLARATION NM N DT C++ freeminer +void body 0 PARAMETER N D C panda3d +boolean both Empty Selection DECLARATION DT NM N Java drill +boolean both NonEmpty Selection DECLARATION DT NM N Java drill +string both Or All FUNCTION DT CJ DT C++ freeminer +UInt32 Bt3Zip Match Finder Get Matches FUNCTION NM NM N V NPL C mgba +bool btn down for dig ATTRIBUTE NM VM P N C++ freeminer +JButton btn save DECLARATION NM N Java Spark +char buf 3 DECLARATION N D C naemon-core +char buf out DECLARATION NM N C naemon-core +X buff Ptr 2 ATTRIBUTE NM N D C++ deeplearning4j +char buffer as string ATTRIBUTE N P N C weechat +char buffer out PARAMETER NM N C git2r +void build Schema For 2Dimensional Dataset FUNCTION V N P NM N Java drill +String by ATTRIBUTE P Java Smack +class Bypass Comparison 8192 x 8192 CLASS NM N D P D Java deeplearning4j +byte byte I Plus 1 DECLARATION NM N P D Java drill +long bytes in PARAMETER NPL NM Java drill +long bytes out PARAMETER NPL NM Java drill +int64 bytes to read DECLARATION NPL P V C++ deeplearning4j +class C Matrix 33 CLASS PRE N D C++ freeminer +SColor c outside DECLARATION PRE P C++ freeminer +igraph_vector_t c partition 2 DECLARATION NM N D C rigraph +int c receive only ATTRIBUTE PRE V VM C ovs +class C Vector 3 CLASS PRE N D C++ freeminer +char c where DECLARATION PRE VM C weechat +Dtype caffe next after FUNCTION PRE DT P C++ caffe +int caps 2 ATTRIBUTE N D C++ panda3d +gboolean cb service in host group each host FUNCTION NM N P NM N DT N C naemon-core +void* ccv atan 2 FUNCTION PRE N D C ccv +void* ccv cnnp batch norm add to output FUNCTION PRE PRE PRE PRE V P N C ccv +ccv_numeric_data_t* ccv get sparse matrix cell from vector FUNCTION PRE V NM NM N P N C ccv +void ccv nnc insert if prior to any FUNCTION PRE PRE V CJ NM P DT C ccv +ccv_nnc_tensor_t* ccv nnc tensor for while count FUNCTION PRE PRE N P NM N C ccv +ovs_list change set for tables ATTRIBUTE NM N P NPL C ovs +char chars 1 PARAMETER NPL D C weechat +guint chars per unit ATTRIBUTE NPL P N C wireshark +String CHAT COBROWSE IMAGE 24 x 24 ATTRIBUTE NM NM N D P D Java Spark +int check against known hosts FUNCTION V P NM NPL C git2r +internal_function* check arrival add next nodes FUNCTION V N V DT NPL C git2r +bool check content only PARAMETER V N VM C++ s3fs-fuse +void check for host 
flapping FUNCTION V P N V C naemon-core +bool check last arg FUNCTION V DT N C++ panda3d +int* checkout action wd only FUNCTION V NM N VM C git2r +int* checkout create the new FUNCTION PRE V DT NM C git2r +void clear all markers FUNCTION V DT NPL C++ rigraph +Builder clear Part 1 FUNCTION V N D Java drill +bool close fd when done PARAMETER V N VM V C git2r +char cmd 1 PARAMETER N D C weechat +int col 1 PARAMETER N D C++ deeplearning4j +LongType col Stride 1 DECLARATION NM N D C++ deeplearning4j +List column Statistics V 1s DECLARATION NM NPL NM D Java drill +ColumnTypeMetadata_v4 column Type Metadata v 4 DECLARATION NM NM N NM D Java drill +char command 2 DECLARATION N D C weechat +int commit on success PARAMETER V P N C git2r +int conditional match on branch FUNCTION NM N P N C git2r +List conjuncts 1 DECLARATION NPL D Java drill +conn conn in PARAMETER NM N C ovs +DrillConnectionImpl connection 1 DECLARATION N D Java drill +Contents contents 2 PARAMETER NPL D C++ panda3d +ccv_cnnp_model_t conv 0 DECLARATION N D C ccv +INDArray conv 2D FUNCTION N NM Java deeplearning4j +int conv in channels ATTRIBUTE NM NM NPL C++ caffe +int conv out channels ATTRIBUTE NM NM NPL C++ caffe +int conv out spatial dim ATTRIBUTE NM NM NM N C++ caffe +void conv rgba4444 FUNCTION V N C++ panda3d +int convert to 8 bit PARAMETER V P D N C mgba +int CONVERT TO UINT4 LENGTH ATTRIBUTE V P NM N Java drill +bool* copy primitives from FUNCTION V NPL P C++ panda3d +bool copy this file FUNCTION V DT N C++ panda3d +DrillCostBase cost 1 DECLARATION N D Java drill +int count 1 PARAMETER N D C git2r +uint32_t count 32 DECLARATION N D C weechat +class Cout Stream CLASS NM N C++ freeminer +float cp 0 DECLARATION N D C ccv +ContentParamType2 cpt 2 DECLARATION N D C++ freeminer +class Cropping 1D CLASS N NM Java deeplearning4j +ConvertSupport cs 2 PARAMETER N D C++ drill +LdapContext ctx 2 DECLARATION N D Java Openfire +class CuDNN Deconvolution Layer CLASS PRE NM N C++ caffe +curandGenerator_t curand generator FUNCTION NM N C++ caffe +WhichMemory data or diff PARAMETER N CJ N C++ caffe +SelectionVector4 data Sv 4 PARAMETER NM N D Java drill +Decimal38DenseWriter decimal 38 Dense FUNCTION N D NM Java drill +int DECIMAL 38 DENSE VALUE ATTRIBUTE NM D NM N Java drill +Class declared Type 1 PARAMETER NM N D Java guava +DrillBuf decompress Page V 1 FUNCTION V N NM D Java drill +class Deconvolution 2D CLASS N NM Java deeplearning4j +class Deconvolution 3D CLASS N NM Java deeplearning4j +class Deconvolution 3D Param Initializer CLASS NM NM NM N Java deeplearning4j +class Deconvolution Layer CLASS NM N C++ caffe +class Deconvolution Param Initializer CLASS NM NM N Java deeplearning4j +String default S3 Bucket ATTRIBUTE NM NM N Java Singularity +class Depthwise Convolution 2D CLASS NM N NM Java deeplearning4j +int description 2 ATTRIBUTE N D C weechat +int DESTINATION OPTIONS V6 ATTRIBUTE NM NPL NM Java drill +bool Destroy Usr 1 Handler FUNCTION V N D N C++ s3fs-fuse +int* dissect acdr ip or other FUNCTION V NM N CJ N C wireshark +PN_stdfloat dist 2 ATTRIBUTE N D C++ panda3d +igraph_integer_t distance 12 PARAMETER N D C rigraph +double dl now PARAMETER V VM C++ s3fs-fuse +class DL4J Invalid Input Exception CLASS PRE NM NM N Java deeplearning4j +float32x4_t dn 1 x 2 DECLARATION N D P D C ccv +bool do adjust this size FUNCTION V V DT N C++ panda3d +void do all sorted fn FUNCTION V DT V N C toggldesktop +void dof reg handoff dpp 0 FUNCTION PRE V NM N D C wireshark +uint32_t dot3 ad Agg Port Attached Agg ID ATTRIBUTE PRE PRE PRE PRE NM NM N C 
ovs +Icon down Icon ATTRIBUTE NM N Java Spark +int down time ATTRIBUTE P N C freeminer +String downsample ATTRIBUTE V Java drill +int dp if create and open FUNCTION PRE CJ V CJ V C ovs +int dp if index ATTRIBUTE PRE NM N C ovs +int* dps for each FUNCTION N P DT C ovs +OpenFlags ds 2 Flags PARAMETER N D NPL C++ facebook-repo-ds2 +int ds last FUNCTION PRE DT C ovs +PandaNode* dupe for flatten FUNCTION V P V C++ panda3d +GUID DX7 Device GUID ATTRIBUTE NM NM N C panda3d +E e 3 PARAMETER N D Java guava +E e 8 PARAMETER N D Java guava +void each seen event FUNCTION DT NM N C++ meta +igraph_vector_int_t edge color 2 PARAMETER NM N D C rigraph +igraph_vector_t edge map 2 PARAMETER NM N D C rigraph +igraph_inclist_t edges per node PARAMETER NPL P N C rigraph +ASTNodeInfo else Info DECLARATION NM N C++ cling +Stmt else Replacement DECLARATION NM N C++ cling +MinorType else Type DECLARATION NM N Java drill +boolean enable Push down ATTRIBUTE V V P Java drill +int ENCAPSULATING SECURITY V6 ATTRIBUTE NM N NM Java drill +unsigned encode only PARAMETER V VM C libxo +int end for DECLARATION V N C freeminer +char* end of record FUNCTION N P N C git2r +DrillbitEndpoint endpoint 1 PARAMETER N D Java drill +DrillbitEndpoint endpoint 2 PARAMETER N D Java drill +boolean enough Memory FUNCTION DT N Java drill +Object entry 1 PARAMETER N D Java guava +Client_data entry 2 DECLARATION N D C toxcore +git_tree_entry entry out PARAMETER NM N C git2r +char* Err no FUNCTION NM N C++ facebook-repo-ds2 +void* error 2 FUNCTION N D C rigraph +boolean error On 400 ATTRIBUTE N P D Java drill +auto even DECLARATION NM C++ meta +int even dist PARAMETER NM N C ccv +bool even split PARAMETER NM N C++ meta +JsonParseException ex 1 PARAMETER N D Java drill +Except except PARAMETER N Java drill +__int64 exit 64 DECLARATION N D C++ panda3d +long expand in ATTRIBUTE V N C toggldesktop +CODE* expression 7 FUNCTION N D C rigraph +CODE* expression 8 FUNCTION N D C rigraph +bool extend by hexahedron FUNCTION V P N C++ panda3d +PackOutFunc f out ATTRIBUTE N NM C++ freeminer +double f1 score FUNCTION NM N C++ meta +int fan in DECLARATION N P C++ caffe +uint64_t features per class DECLARATION NPL P N C meta +_NXMapTable field 37 ATTRIBUTE N D C toggldesktop +int field 4 ATTRIBUTE N D C toggldesktop +int field 63 ATTRIBUTE N D C toggldesktop +class File 2 Page App CLASS N P NM N C++ toggldesktop +int file no PARAMETER NM N C++ facebook-repo-ds2 +NodePathCollection find all matches FUNCTION V DT NPL C++ panda3d +TextureCollection* find all textures FUNCTION V DT NPL C++ panda3d +int flag true if should convert PARAMETER N NM CJ V V C panda3d +unsigned flag within ATTRIBUTE N P C git2r +int FLOAT4 VALUE ATTRIBUTE NM N Java drill +Reporter for Annotation FUNCTION NM N Java immutables +boolean for Attribute ATTRIBUTE NM N Java immutables +linear_model for avg DECLARATION P N C++ meta +boolean for Backprop PARAMETER P N Java deeplearning4j +TypeDescriptor for Class FUNCTION P N Java drill +DrillConfig for Client FUNCTION P N Java drill +Visibility for Implementation FUNCTION P N Java immutables +void* for num FUNCTION NM N C freeminer +MinorType for Number FUNCTION P N Java drill +DeclaringPackage for Package FUNCTION NM N Java immutables +Set for Resource FUNCTION P N Java drill +ProxyInfo for Socks4 Proxy FUNCTION P NM N Java Smack +boolean for Unknown Schema PARAMETER P NM N Java drill +bool force fog off ATTRIBUTE V N VM C++ freeminer +bool force nd im 2 col ATTRIBUTE V NM N P N C++ caffe +vector forward time per layer DECLARATION NM N P N C++ 
caffe +double fp irand 224 FUNCTION NM N D C rigraph +bool fp on ATTRIBUTE N P C++ rigraph +void fprint all protocols for layer types FUNCTION V DT NPL P NM NPL C wireshark +int* friend in close FUNCTION N P N C toxcore +state_id from PARAMETER P C s3fs-fuse +char from PARAMETER P C++ meta +T from Bytes FUNCTION P NPL Java Singularity +ThreadContext from context PARAMETER P N C panda3d +boolean from Docker Config ATTRIBUTE P NM N Java Singularity +String from Email PARAMETER P N Java Openfire +FromHeader from Header ATTRIBUTE P N Java Spark +uint8_t from id PARAMETER P N C toxcore +boolean from Inclusive PARAMETER P N Java guava +class from Iterator CLASS P N Java guava +JID from JID PARAMETER P N Java Openfire +K from Key PARAMETER P N Java guava +StanzaFilter from Room Filter ATTRIBUTE P NM N Java Smack +boolean from Server PARAMETER Java Openfire +SelectionVector4 from SV 4 PARAMETER P N D Java drill +int from Y PARAMETER P N Java Spark +int* fts5 MultiIter Do Compare FUNCTION PRE PRE V V C toggldesktop +void* fts5 Seg Iter Clear FUNCTION PRE NM N V C ccv +fts5yyParser fts5 yyp Parser PARAMETER PRE NM N C ccv +char* function and data DECLARATION N CJ N C weechat +bool g curand availability logged DECLARATION PRE NM N V C++ caffe +objectlist g next DECLARATION PRE DT C naemon-core +int* generate key or iv FUNCTION V N CJ N C wireshark +void Generate Prolog 1 FUNCTION V N D C++ facebook-repo-ds2 +string get a 1 FUNCTION V N D C++ panda3d +String get B64 Data FUNCTION V NM N Java Smack +LVecBase2d get data 2d FUNCTION V N NM C++ panda3d +LVecBase4d get data 4d FUNCTION V N NM C++ panda3d +LVecBase4i get data 4i FUNCTION V N NM C++ panda3d +int get Decimal 9 From Big Decimal FUNCTION V N D P NM N Java drill +time_t get last modified FUNCTION V DT N C++ s3fs-fuse +JLabel get Look and feel Label FUNCTION V NM CJ NM N Java Spark +long get next comment id FUNCTION V DT NM N C naemon-core +xmlChar* get next marker FUNCTION V DT N C++ s3fs-fuse +long get Part 2 FUNCTION V N D Java drill +LPoint3 get position world on a FUNCTION V NM N P N C++ panda3d +size_t get start r 1 FUNCTION V NM N D C++ meta +bool get U 16 No Ex FUNCTION V N D DT N C++ freeminer +int* git delta read header from stream FUNCTION PRE PRE V N P N C git2r +int git diff commit as email FUNCTION PRE PRE N P N C git2r +int git fs path to dir FUNCTION PRE NM N P N C git2r +int git index update all FUNCTION PRE PRE V DT C git2r +size_t git off map size FUNCTION PRE NM NM N C git2r +int* git repository head detached for work tree FUNCTION PRE PRE N V P NM N C git2r +void gl M 3 Inv FUNCTION PRE N D N C++ panda3d +void group with FUNCTION V P C++ panda3d +GsonBuilder gson Builder DECLARATION NM N Java immutables +void* gui buffer local var remove all FUNCTION PRE PRE NM N V DT C weechat +void gui buffer set time for each line FUNCTION PRE PRE V N P DT N C weechat +void* gui input search next FUNCTION PRE PRE V DT C weechat +int* gui line has tag no filter FUNCTION PRE PRE V N DT N C weechat +void* gui line mixed free all FUNCTION PRE PRE NM V DT C weechat +char* gui mouse event code 2 key FUNCTION PRE PRE NM N P N C weechat +void gui nick hash sum 64 FUNCTION PRE PRE NM N D C weechat +in6_addr gw 6 PARAMETER N D C ovs +auto h 2 server DECLARATION N D N C++ proxygen +int h if index ATTRIBUTE PRE NM N C ovs +int* handle send 2 FUNCTION V N D C toxcore +bool has after destruct ATTRIBUTE V NM N C freeminer +bool has each DECLARATION V DT C++ panda3d +bool has in band PARAMETER V NM N C ovs +bool has on activate ATTRIBUTE V P V C freeminer 
+bool has run at least once ATTRIBUTE V V VM VM VM C++ caffe +bool has Upgrade Token in Connection DECLARATION V NM N P N C++ proxygen +class Hash 32 Functions CLASS N D NPL Java drill +class Hash 64 Functions With Seed CLASS N D NPL P N Java drill +int hash and save FUNCTION V CJ V C git2r +float hbp 1 DECLARATION N D C ccv +gboolean header only PARAMETER N VM C wireshark +void hide all switches FUNCTION V DT NPL C++ panda3d +ObjectMapper hocon Mapper ATTRIBUTE NM N Java drill +void host 1 PARAMETER N D C naemon-core +int hosts down DECLARATION NPL VM C naemon-core +gint how PARAMETER VM C wireshark +float hp 1 DECLARATION N D C ccv +int httperf 2 FUNCTION N D C++ proxygen +int i 1 PARAMETER N D Java Openfire +igraph_error_t* i graph all st min cuts FUNCTION PRE PRE DT NM NM NPL C rigraph +class Iax2 Analysis Tree Widget Item CLASS PRE NM NM NM N C++ wireshark +IDAT idat ATTRIBUTE N C mgba +IDAT idat var PARAMETER NM N C mgba +char idb 1 if description DECLARATION N D NM N C wireshark +LogicalExpression if Condition DECLARATION NM N Java drill +IfElseWidthExpr if Else Width Expr PARAMETER NM NM NM N Java drill +SqlNode if Exists ATTRIBUTE NM N Java drill +IfExpression if Expr PARAMETER NM N Java drill +JBlock if Found DECLARATION CJ V Java drill +int if index ATTRIBUTE NM N C ovs +ifinfomsg if info DECLARATION NM N C ovs +char if name ATTRIBUTE NM N C ovs +JBlock if No Val DECLARATION CJ DT N Java drill +void if notifier wait FUNCTION NM N V C ovs +boolean if Present PARAMETER CJ NM Java immutables +OutputWidthExpression if Reduced Expr DECLARATION CJ NM N Java drill +PMIB_IF_TABLE2 if Table PARAMETER NM N C ovs +bool if Unique PARAMETER CJ NM C++ cling +int if Width DECLARATION NM N Java drill +int igraph 2 w heap init FUNCTION PRE D VM N V C rigraph +void igraph err no PARAMETER NM NM N C rigraph +int igraph i get subisomorphisms vf 2 inner FUNCTION PRE PRE V NPL N D NM C rigraph +bool im 2 col PARAMETER N P N C++ caffe +INDArray im 2 col 2d DECLARATION N P N NM Java deeplearning4j +class Im 2 col Layer CLASS N P N N C++ caffe +Image image 1 PARAMETER N D Java Spark +String in Action Code PARAMETER PRE NM N Java Smack +InetAddress in addr PARAMETER NM N Java Openfire +bool in best path ATTRIBUTE P NM N C++ rigraph +Channel in channel PARAMETER NM N C++ drill +verify_context in ctx PARAMETER NM N C++ drill +boolean in Eclipse Compiler ATTRIBUTE P NM N Java immutables +Map in Edges PARAMETER NM NPL Java guava +Set in Eq More Than Once DECLARATION P N DT CJ VM Java drill +int in expected RPC Type PARAMETER PRE NM NM N C++ drill +TypedFieldId in Field Id DECLARATION NM NM N Java drill +List in Fields ATTRIBUTE NM NPL Java drill +double in Flow New M DECLARATION NM NM NM N C++ rigraph +Context in for FUNCTION P N Java immutables +HANDLE in hand DECLARATION NM N C toggldesktop +void in how ATTRIBUTE V VM C mgba +Integer in Index PARAMETER NM N Java drill +ovs_be64 in key ATTRIBUTE NM N C ovs +igraph_adjlist_t in list DECLARATION NM N C rigraph +boolean in Literal DECLARATION P N Java Openfire +int in Mem Count PARAMETER P NM N Java drill +string in name PARAMETER NM N C++ meta +stbi_uc in near PARAMETER N P C panda3d +bool in neighbour heap ATTRIBUTE P NM N C++ rigraph +boolean in Outer List ATTRIBUTE P NM N Java drill +Pipe in Pipe PARAMETER NM N C++ toggldesktop +string in prop Name PARAMETER NM NM N C++ drill +SizeT in Size DECLARATION NM N C mgba +RelTrait in Trait ATTRIBUTE NM N Java drill +BOOLEAN in Transaction DECLARATION P N C ovs +int in Vector DECLARATION NM N Java drill +bool 
include all fetch heads DECLARATION V DT NM NPL C git2r +boolean incoming Has Sv 2 DECLARATION V V N D Java drill +int index as child ATTRIBUTE V P N C++ deeplearning4j +int index as parent ATTRIBUTE V P N C++ deeplearning4j +xo_info_t info p PARAMETER NM N C libxo +void init 2 FUNCTION N D C++ toggldesktop +BIO inkey bio DECLARATION NM N C toggldesktop +Map inlinables ATTRIBUTE NPL Java immutables +int32_t inp 0 DECLARATION N D C mgba +int insert at PARAMETER V P C++ panda3d +void* insert V 4 Headers FUNCTION V N D NPL C++ s3fs-fuse +uint32_t insn 0 PARAMETER N D C++ facebook-repo-ds2 +uint32_t insn 1 PARAMETER N D C++ facebook-repo-ds2 +void instr max ATTRIBUTE NM N C++ toggldesktop +String interval SubString 1 DECLARATION NM N D Java drill +int INVERSE COMPUTE FOR WORD OF ALL 1S ATTRIBUTE V N P N P DT NPL Java guava +u_int8_t ip6 h nxt ATTRIBUTE PRE N NM C ovs +guint8 ip6r0 slmap ATTRIBUTE PRE N C wireshark +void irc batch free all FUNCTION PRE PRE V DT C weechat +int* irc color convert rgb 2 irc FUNCTION PRE PRE V N P N C weechat +int* irc message split 005 FUNCTION PRE N V D C weechat +void* irc notify new for all servers FUNCTION PRE PRE NM P DT NPL C weechat +bool is 1 x 1 ATTRIBUTE V D P D C++ caffe +bool is 32 ATTRIBUTE V D C facebook-repo-ds2 +bool* is a ge zero and a lt b FUNCTION V N NM D CJ N NM N C++ caffe +bool is all PARAMETER V DT C++ s3fs-fuse +boolean is Base32 FUNCTION V N Java Openfire +bool* is convertible to FUNCTION V NM P C++ panda3d +char is dir 2 PARAMETER V N D C git2r +bool is even PARAMETER V NM C++ toggldesktop +bool is HTTP11 FUNCTION V N C++ proxygen +bool is step up ATTRIBUTE V N P C++ freeminer +boolean is Supports Limit Push down FUNCTION V V N V P Java drill +bool is this FUNCTION V N C++ panda3d +int* is valid escalation for service notification FUNCTION V NM N P NM N C naemon-core +bool is valid position 2 DECLARATION V NM N D C++ freeminer +int iterations per sample ATTRIBUTE NPL P N C++ freeminer +class Java11 Web Socket CLASS PRE NM N Java Smack +class Java11 Web Socket Factory CLASS PRE NM NM N Java Smack +Map join Mj Id 2 Scan Mj Id DECLARATION V NM N P V NM N Java drill +K k 6 PARAMETER N D Java guava +K k 7 PARAMETER N D Java guava +K k 8 PARAMETER N D Java guava +int k Diy Significand Size ATTRIBUTE NM NM NM N C++ panda3d +uint64_t k Dp Significand Mask ATTRIBUTE NM NM NM N C++ panda3d +int k Dp Significand Size ATTRIBUTE NM NM NM N C++ panda3d +bool k Im 2 Col DECLARATION NM N P N C++ caffe +uint32_t keep when false PARAMETER V VM NM C rigraph +uint32_t keep when true PARAMETER V VM NM C rigraph +class Keras 2D Embedding CLASS PRE NM N Java deeplearning4j +class Keras Convolution 2D CLASS PRE N NM Java deeplearning4j +class Keras Deconvolution 2D CLASS PRE N NM Java deeplearning4j +class Keras Depthwise Convolution 2D CLASS PRE NM N NM Java deeplearning4j +class Keras Upsampling 1D CLASS PRE N NM Java deeplearning4j +class Keras Zero Padding 1D CLASS PRE NM N NM Java deeplearning4j +void* kill nonused tcp FUNCTION V NM N C toxcore +uint16_t l 4 ofs ATTRIBUTE N D N C ovs +int l get node or nil FUNCTION PRE V N CJ NM C++ freeminer +int* l place schematic on vmanip FUNCTION PRE V N P N C++ freeminer +int l set last run mod FUNCTION PRE V DT V N C++ freeminer +integer l wk 1 DECLARATION PRE N D C rigraph +double l1 regularizer ATTRIBUTE NM N C meta +void l2 norm transform FUNCTION NM NM V C meta +int last Access Time PARAMETER DT NM N Java drill +guint32 last ack seq ATTRIBUTE DT NM N C wireshark +long last Active ATTRIBUTE DT N Java Openfire 
+Instant last Activity DECLARATION DT N Java Openfire +Date last Activity Date Range Max ATTRIBUTE DT NM NM NM N Java Openfire +long last Answered Request ID ATTRIBUTE DT NM NM N Java Openfire +String last Argument DECLARATION DT N Java immutables +boolean last Batch Read ATTRIBUTE DT NM N Java drill +int last bucket DECLARATION DT N C naemon-core +unique_ptr last builder ATTRIBUTE DT N C++ meta +size_t last bytes ATTRIBUTE DT NPL C git2r +AtomicInteger last Coordination Id ATTRIBUTE DT NM N Java drill +time_t last data purge ATTRIBUTE DT N V C weechat +Document last Document ATTRIBUTE DT N Java drill +BatchHolder last Entry Batch PARAMETER DT NM N Java drill +DWORD last err PARAMETER DT N C ovs +SQLException last Exception DECLARATION DT N Java Openfire +QualType last Expr Ty DECLARATION DT NM N C++ cling +AtomicLong last Heartbeat Time PARAMETER DT NM N Java Singularity +uint64_t last id DECLARATION DT N C++ meta +int last Idx ATTRIBUTE DT N Java deeplearning4j +size_t last in target DECLARATION DT P N C git2r +int last Index Of Dot DECLARATION DT N P N Java immutables +K last Key FUNCTION DT N Java guava +int last layer index ATTRIBUTE DT NM N C++ caffe +label_id last lbl DECLARATION DT N C++ meta +INDArray last Mem Cell ATTRIBUTE DT NM N Java deeplearning4j +HashMap last Message ATTRIBUTE DT N Java Spark +Date last Message Date PARAMETER DT NM N Java Smack +Mode last Mode ATTRIBUTE DT N C++ freeminer +guint last n ATTRIBUTE DT N C wireshark +bool last part PARAMETER DT N C++ s3fs-fuse +long last Pending Task Cache ATTRIBUTE DT NM NM N Java Singularity +u16 last percent ATTRIBUTE DT N C++ freeminer +AtomicLong last Persister Success PARAMETER DT NM N Java Singularity +int last pos PARAMETER DT N C toggldesktop +ScanRange last Range DECLARATION DT N Java drill +RelNode last Rel Node DECLARATION DT NM N Java drill +long last Request Utilization Cache ATTRIBUTE DT NM NM N Java Singularity +IterOutcome last Right Status DECLARATION DT NM N Java drill +int last Row ATTRIBUTE DT N Java drill +t_plugin_script last script ATTRIBUTE DT N C weechat +int last Segment Index DECLARATION DT NM N Java drill +int last Set DECLARATION DT N Java drill +int last Slash Index DECLARATION DT NM N Java drill +StreamID last Stream PARAMETER DT N C++ proxygen +auto last Stream Id Size DECLARATION DT NM NM N C++ proxygen +Optional last Task Status ATTRIBUTE DT NM N Java Singularity +uint64_t last term ATTRIBUTE DT N C ovs +u64 last time ATTRIBUTE DT N C++ freeminer +time_t last time critical ATTRIBUTE DT N NM C naemon-core +u64 last time ms ATTRIBUTE DT N NM C++ freeminer +char last Transition ATTRIBUTE DT N Java drill +long last Update Time ATTRIBUTE DT NM N Java deeplearning4j +SingularityTaskUsage last Usage DECLARATION DT N Java Singularity +long last used ATTRIBUTE DT V C ovs +int last used Ypos ATTRIBUTE DT V N Java Spark +boolean last Value PARAMETER DT N Java immutables +rusage last wakeup PARAMETER DT V C ovs +wint_t last wc PARAMETER DT N C git2r +int last Write PARAMETER DT N Java drill +int last Write Index FUNCTION DT NM N Java drill +JVar last Writer Idx ATTRIBUTE DT NM N Java drill +int last Y ATTRIBUTE DT N C mgba +String LAYER FIELD POOL 1D SIZE ATTRIBUTE NM NM N NM NM Java deeplearning4j +char least addr ATTRIBUTE DT N C++ panda3d +Intratype left Intra type DECLARATION NM NM N Java immutables +uint32_t left to parse DECLARATION V P V C++ proxygen +size_t len a PARAMETER NM N C freeminer +String less Terminal Path ATTRIBUTE NM NM N Java Singularity +LikeFilter like Filter PARAMETER NM N C++ 
drill +RunQuery limit 0 Query DECLARATION V D N Java drill +int lines after ATTRIBUTE NPL P C weechat +size_t lines in hunk DECLARATION NPL P N C git2r +int lines per option DECLARATION NPL P N C weechat +long lk 1 DECLARATION N D Java drill +int load from lib dir PARAMETER V P NM N C weechat +git_str local path out PARAMETER NM NM N C git2r +string log on wait PARAMETER V P V C++ caffe +JComboBox look and feel ATTRIBUTE NM CJ NM Java Spark +JLabel look and feel Label ATTRIBUTE NM CJ NM N Java Spark +LPoint3 look at ATTRIBUTE V P C++ panda3d +double m 11 ATTRIBUTE N D C++ freeminer +double m 13 ATTRIBUTE N D C++ freeminer +double m 21 ATTRIBUTE N D C++ freeminer +double m 22 ATTRIBUTE N D C++ freeminer +double m 23 ATTRIBUTE N D C++ freeminer +double m 31 ATTRIBUTE N D C++ freeminer +double m 33 ATTRIBUTE N D C++ freeminer +bool m all Tables Selectable ATTRIBUTE PRE DT NPL NM C++ drill +QString m endpoint a ATTRIBUTE PRE NM N C++ wireshark +ForWhat m for What ATTRIBUTE PRE P DT C++ freeminer +double m in Nanoseconds ATTRIBUTE PRE P NPL C++ freeminer +AssertionInfo m last Assertion Info ATTRIBUTE PRE DT NM N C++ freeminer +bool m last Assertion Passed ATTRIBUTE PRE DT N V C++ freeminer +size_t m last Connection ATTRIBUTE PRE DT N C++ drill +string m last Query ATTRIBUTE PRE DT N C++ drill +Option m last Result ATTRIBUTE PRE DT N C++ freeminer +u16 m last used id ATTRIBUTE PRE DT NM N C++ freeminer +bool m like Escape Clause Supported ATTRIBUTE PRE NM NM N V C++ drill +map m name to id ATTRIBUTE PRE N P N C++ freeminer +CachedVertexShaderSetting m perspective bias 1 vertex ATTRIBUTE PRE NM NM D N C++ freeminer +void m PSP 2 Load ROM FUNCTION PRE N D V N C mgba +string m redirected Cout ATTRIBUTE PRE V N C++ freeminer +int m sbox 4 ATTRIBUTE PRE N D Java Openfire +T m to ATTRIBUTE PRE P C++ toggldesktop +Vector m to copy ATTRIBUTE PRE P N Java freeminer +bool m used up ATTRIBUTE PRE V VM C++ freeminer +string m what ATTRIBUTE PRE DT C++ toggldesktop +String MAIL FORWARD 16 x 16 ATTRIBUTE V N D P D Java Spark +int* make no n indexed FUNCTION V DT N NM C++ panda3d +int max v 4 frag list size ATTRIBUTE NM N D NM NM N C ovs +int md5 out len DECLARATION NM NM N C++ s3fs-fuse +void merge chunks by bucket size FUNCTION V NPL P NM N C++ meta +int* merge driver name for path FUNCTION V NM N P N C git2r +auto merged 1 ATTRIBUTE N D C++ deeplearning4j +auto merged 3 ATTRIBUTE N D C++ deeplearning4j +char message after mod ATTRIBUTE N P N C weechat +char message before mod ATTRIBUTE V P N C weechat +char message no color DECLARATION N DT N C weechat +float mid 1 DECLARATION N D C++ freeminer +LogicalMinus minus PARAMETER P Java drill +bool monitor everything by default PARAMETER V DT P N C ovs +ccv_cnnp_dataframe_data_item_t more data DECLARATION DT N C ccv +ClosingFuture more Futures PARAMETER DT NPL Java guava +Gs more Generators PARAMETER DT NPL C++ freeminer +u8 more To Follow PARAMETER DT P V C ccv +int morecore properties ATTRIBUTE PRE NPL C++ panda3d +long murmur 3 64 FUNCTION N D D Java drill +class Murmur Hash 3 CLASS NM N D Java drill +class Murmur3 128 Hash Function CLASS PRE D NM N Java guava +MapNode n 3 DECLARATION N D C++ freeminer +Int n col 2 PARAMETER NM N D C rigraph +MapNode n dirt with grass DECLARATION PRE N P N C++ freeminer +uint64_t n frag too small ATTRIBUTE NM N VM NM C ovs +MapNode n from PARAMETER N P C++ freeminer +MapNode n water or ice DECLARATION N N CJ N C++ freeminer +string name 1 PARAMETER N D C++ meta +char name what ATTRIBUTE N DT C freeminer +void Net after forward 
FUNCTION PRE P N C++ caffe +void Net before backward FUNCTION PRE P N C++ caffe +void Net before forward FUNCTION PRE P N C++ caffe +int* netdev dummy queue dump next FUNCTION PRE PRE N V DT C ovs +int* network pass socks5 proxy FUNCTION NM NM NM N C weechat +int new Param 1 ATTRIBUTE NM N D C++ freeminer +int new Param 2 ATTRIBUTE NM N D C++ freeminer +int new Shape 2 DECLARATION NM N D Java deeplearning4j +BatchInfo next ATTRIBUTE DT C++ toggldesktop +t_gui_bar next bar ATTRIBUTE DT N C weechat +float next Cast Cost DECLARATION DT NM N Java drill +char next Char DECLARATION DT N Java drill +t_config_file next config ATTRIBUTE DT N C weechat +xodtemplate_daterange next date range DECLARATION DT NM N C naemon-core +double next Double FUNCTION DT N Java immutables +void next Egress FUNCTION DT N C++ proxygen +Group next Element ATTRIBUTE DT N Java Openfire +int32_t next event ATTRIBUTE DT N C mgba +String next Field ATTRIBUTE DT N Java drill +String next Field Name FUNCTION DT NM N Java immutables +NextFilter Next Filter PARAMETER DT N Java Openfire +ClassNames next Generated DECLARATION DT V Java drill +t_gui_nick_group next group ATTRIBUTE DT N C weechat +xodtemplate_hostescalation next he DECLARATION DT N C naemon-core +int next head FUNCTION DT N C git2r +int next Id ATTRIBUTE DT N Java drill +t_irc_ignore next ignore ATTRIBUTE DT V C weechat +ImmutableEntry next In Bucket FUNCTION DT P N Java guava +TaskStatus next In Memory FUNCTION DT P N Java Singularity +Object next Instance FUNCTION DT N Java cglib +boolean next Integer If Not EOF FUNCTION DT NM CJ DT N Java drill +char next Key DECLARATION DT N C++ cling +Object next Label DECLARATION DT N Java guava +int next Local ATTRIBUTE DT N Java cglib +string next marker DECLARATION DT N C++ s3fs-fuse +int next max ATTRIBUTE DT N C ovs +vector next metadata FUNCTION DT N C++ meta +List next Names DECLARATION DT NPL Java drill +Cell next nonsingleton ATTRIBUTE DT N C++ rigraph +objectlist next object list DECLARATION DT NM N C naemon-core +gint next offset DECLARATION DT N C wireshark +ReadStatus next Page From Queue FUNCTION DT N P N Java drill +Object next Page Value DECLARATION DT NM N Java drill +int next Partition To Return ATTRIBUTE DT N P N Java drill +uint32_t next PC DECLARATION DT N C++ facebook-repo-ds2 +ClassNames next Precompiled DECLARATION DT NM Java drill +Token* next Preprocessed FUNCTION DT V C++ toggldesktop +string next Protocol PARAMETER DT N C++ proxygen +string next Protos PARAMETER DT NPL C++ proxygen +query_handler next qh ATTRIBUTE DT N C naemon-core +u8 next real face DECLARATION DT NM N C++ freeminer +Pair next Row Key Batch FUNCTION DT NM NM N Java drill +xodtemplate_serviceextinfo next se DECLARATION DT N C naemon-core +servicesmember next services member DECLARATION DT NM N C naemon-core +ClassSet next Set DECLARATION DT N Java drill +Node* next Sibling FUNCTION DT N C++ toggldesktop +FileSplit next Split FUNCTION DT N Java drill +long next tag DECLARATION DT N C++ s3fs-fuse +Runnable next Task ATTRIBUTE DT N Java guava +int next tex ATTRIBUTE DT N C++ panda3d +char next Tok Ptr PARAMETER DT NM N C toggldesktop +ASN1_GENERALIZEDTIME next upd PARAMETER DT N C toggldesktop +int32_t next update ATTRIBUTE DT N C mgba +long next Update Time ATTRIBUTE DT NM N Java drill +long next Value ATTRIBUTE DT N Java metrics +ImmutableList no Attributes ATTRIBUTE DT NPL Java immutables +bool no cache PARAMETER DT N C freeminer +int no callback DECLARATION DT N C git2r +fdpage_list_t no data pages DECLARATION DT NM NPL C++ 
s3fs-fuse +no_delay no Delay DECLARATION DT N C++ drill +float no dig delay timer ATTRIBUTE DT N NM N C++ freeminer +bool no emerge PARAMETER VM V C++ freeminer +bool no Error FUNCTION DT N C++ proxygen +int no in DECLARATION NM N C rigraph +boolean no Interfaces ATTRIBUTE DT NPL Java deeplearning4j +int no lock Lock FUNCTION DT NM N C ccv +char no log DECLARATION DT N C weechat +bool no logo PARAMETER DT N C cling +igraph_integer_t no of edges 2 DECLARATION N P NPL D C rigraph +int no of groups PARAMETER N P NPL C rigraph +int no of nodes DECLARATION N P NPL C rigraph +igraph_integer_t no out types PARAMETER NM NM NPL C rigraph +bool no output ATTRIBUTE DT N C++ freeminer +bool no random PARAMETER DT N C++ freeminer +uint64_t no replay DECLARATION DT N C toxcore +boolean no Reply PARAMETER DT N Java Smack +bool no Runtime PARAMETER DT N C++ cling +SourceLocation no Src Loc DECLARATION DT NM N C++ cling +int no text PARAMETER DT N C toggldesktop +bool no truncate PARAMETER DT V C++ s3fs-fuse +Writable no Val ATTRIBUTE DT N Java deeplearning4j +PandaNode node 2 PARAMETER N D C++ panda3d +float noise 2 PARAMETER N D C++ freeminer +auto none DECLARATION DT C++ meta +void normalize by rebuilding FUNCTION V P V C++ panda3d +int* not a local branch FUNCTION VM DT NM N C git2r +guint16 noti flags number PARAMETER NM NM N C wireshark +int notified on ATTRIBUTE V P C naemon-core +int notify contact of host FUNCTION V N P N C naemon-core +Instant now DECLARATION VM Java Openfire +string now cache DECLARATION VM V C++ s3fs-fuse +long now Micros PARAMETER VM NPL Java guava +string now path DECLARATION VM N C++ s3fs-fuse +time_t now time DECLARATION VM N C++ s3fs-fuse +TextureCollection* ns find all textures FUNCTION PRE V DT NPL C++ panda3d +bool null or empty FUNCTION NM CJ NM C++ deeplearning4j +int num ascnt ATTRIBUTE NM N C rigraph +Blob num by chans ATTRIBUTE N P NPL C++ caffe +int num faces to draw DECLARATION NM NPL P V C++ freeminer +int num kernels col 2 im ATTRIBUTE NM NPL N P N C++ caffe +int num kernels im 2 col ATTRIBUTE NM NPL N P N C++ caffe +int num of after release pdus ATTRIBUTE N P P V NPL C wireshark +int num of gops ATTRIBUTE N P NPL C wireshark +size_t num to check DECLARATION N P V C mgba +int num vertices per primitive DECLARATION NM NPL P N C++ panda3d +long number of nodes DECLARATION N P NPL C++ rigraph +NumericType numeric type 3 PARAMETER NM N D C++ panda3d +int nxt in ATTRIBUTE P P C rigraph +MultiUserChatService o 2 PARAMETER N D Java Openfire +LongType o Stride 0 DECLARATION NM N D C++ deeplearning4j +btCollisionObject obj 0 DECLARATION N D C++ panda3d +int obsess over host PARAMETER V P N C naemon-core +odp_key_fitness* odp nsh key from attr FUNCTION PRE PRE N P N C ovs +float odx 1 DECLARATION N D C rigraph +class OF 1515 CLASS N D Java Openfire +ofbundle of bundle PARAMETER PRE N C ovs +ofservice of service DECLARATION PRE N C ovs +off_t off PARAMETER N C meta +auto off Arg DECLARATION NM N C++ facebook-repo-ds2 +long off Bits PARAMETER NM NPL C++ panda3d +ImageIcon off Icon ATTRIBUTE NM N Java Openfire +int offset p PARAMETER NM N C wireshark +igraph_real_t offset to left contour ATTRIBUTE V P NM N C rigraph +igraph_real_t offset to right contour ATTRIBUTE V P NM N C rigraph +ofpbuf* ofp buf clone with headroom FUNCTION PRE PRE N P N C ovs +git_oid oid a PARAMETER NM N C git2r +int old Param 2 ATTRIBUTE NM N D C++ freeminer +ObjectName on DECLARATION N Java metrics +void on Connection Close FUNCTION P N V Java Openfire +int on disk ATTRIBUTE P N C git2r +void on 
Egress Buffered FUNCTION P N V C++ proxygen +ErrorCode on Execute Program FUNCTION P V N C++ facebook-repo-ds2 +ObjectNameFactory on Factory PARAMETER NM N Java metrics +bp::object on gradients ready ATTRIBUTE P NPL NM C++ caffe +size_t on Header Bytes Generated DECLARATION P NM NPL V C proxygen +ImageIcon on Icon ATTRIBUTE NM N Java Openfire +void on Ingress Error FUNCTION P NM N C++ proxygen +void on Metric Removed FUNCTION P N V Java metrics +ebb_header_cb on multipart header field ATTRIBUTE P NM NM N C ccv +void on Ping Reply Latency FUNCTION P NM NM N C++ proxygen +void on Post Execute FUNCTION P N V Java freeminer +ErrorCode on Query Current Thread FUNCTION P NM NM N C++ facebook-repo-ds2 +ErrorCode on Query Hardware Watchpoint Count FUNCTION P NM NM NM N C++ facebook-repo-ds2 +ebb_element_cb on query string ATTRIBUTE P NM N C ccv +http_data_cb on reason ATTRIBUTE P N C proxygen +void* on request response FUNCTION P NM N C ccv +void on Response Content FUNCTION P NM N Java metrics +void on Server Egress Paused FUNCTION P NM N V C++ proxygen +bool on Server Side PARAMETER P NM N C++ toggldesktop +void on Shutdown Request FUNCTION P NM N Java drill +void on start FUNCTION P N C++ caffe +object on start ATTRIBUTE P N C++ caffe +http_data_cb on status ATTRIBUTE P N C crow +ErrorCode* on Thread Is Alive FUNCTION P N V NM C++ facebook-repo-ds2 +void on Tick FUNCTION P N Java drill +bool on Transport Ready Common FUNCTION P NM NM N C++ proxygen +ovsthread_once once DECLARATION VM C ovs +bool only amz PARAMETER VM N C++ s3fs-fuse +boolean only Done ATTRIBUTE VM NM Java Smack +Entry only Entry DECLARATION VM N Java guava +bool only if existing PARAMETER VM CJ V C git2r +boolean only Impersonation Enabled ATTRIBUTE VM N V Java drill +bool only in ground ATTRIBUTE VM P N C freeminer +igraph_bool_t only indices PARAMETER VM NPL C rigraph +boolean only Local PARAMETER VM N Java Openfire +bool only pool PARAMETER VM N C++ s3fs-fuse +int only user ATTRIBUTE VM N C toggldesktop +char open if empty DECLARATION V CJ NM C libxo +int opposite Major Fragment Id ATTRIBUTE NM NM NM N Java drill +u8 or conf PARAMETER NM N C ccv +DruidFilter or Filter At Index FUNCTION NM N P N Java drill +RexNode or Pred PARAMETER NM N Java drill +double or Sel DECLARATION NM N Java drill +igraph_vector_int_t order out PARAMETER N NM C rigraph +int out audio samples DECLARATION NM NM NPL C toxcore +T out Buff DECLARATION NM N C++ deeplearning4j +int out Buff Posn DECLARATION NM NM N Java Openfire +ErrorCode out Code PARAMETER NM N C++ proxygen +char out dev DECLARATION NM N C ovs +ExAttributes out Ex Attributes PARAMETER NM NM NPL C++ proxygen +RelFieldCollation out Field Collation DECLARATION NM NM N Java drill +ofstream out file DECLARATION NM N C++ meta +unique_ptr out Header Data DECLARATION NM NM N C++ proxygen +void out how PARAMETER VM VM C mgba +TypedFieldId out Key Field Ids PARAMETER NM NM NM NPL Java drill +uint32_t out list PARAMETER NM N C toxcore +igraph_inclist_t out list DECLARATION NM N C rigraph +bool out max val ATTRIBUTE V NM N C++ caffe +AtomicLong out Messages ATTRIBUTE NM NPL Java Openfire +int out Name Index DECLARATION NM NM N Java drill +void* out of domain FUNCTION P P N C rigraph +boolean out Of Memory PARAMETER P P N Java drill +uint64_t out Opaque Data PARAMETER NM NM N C++ proxygen +Prel out Prel DECLARATION NM N Java drill +PushId out Push Id PARAMETER NM NM N C++ proxygen +igraph_vector_t out seq PARAMETER NM N C rigraph +String out Stat Name PARAMETER NM NM N Java drill +lm_state out state 
PARAMETER NM N C++ meta +int out Types Offset DECLARATION NM NM N Java deeplearning4j +int out vlan ATTRIBUTE NM N C ovs +size_t over size DECLARATION P N C++ s3fs-fuse +OVS_WARN_UNUSED_RESULT* ovs db transient datum from json FUNCTION PRE PRE NM N P N C ovs +json* ovsdb atom string create no copy FUNCTION PRE PRE N V DT V C ovs +int ovsdb datum compare 3way FUNCTION PRE N V VM C ovs +int p 2 DECLARATION N D Java Openfire +v2s16 p 2 d PARAMETER N D N C++ freeminer +Fts5Table p Fts5 ATTRIBUTE PRE PRE C ccv +Mem p Mem 1 PARAMETER PRE N D C ccv +Token p Name 2 PARAMETER PRE N D C mgba +double p save DECLARATION PRE V C rigraph +u8 p5 Err msg PARAMETER NM NM N C toggldesktop +long pad 0 ATTRIBUTE N D C++ panda3d +uint8_t pad 2 ATTRIBUTE N D C ovs +int parallel for FUNCTION NM N C++ deeplearning4j +vector param propagate down ATTRIBUTE N V P C++ caffe +NetParameter param upgraded pad PARAMETER N V N C++ caffe +long part 3 DECLARATION N D Java guava +boolean partition Filter Push down ATTRIBUTE NM N V P Java drill +boolean partitionby DECLARATION N Java drill +char pass in DECLARATION NM N C toggldesktop +bool pass through DECLARATION V P C++ freeminer +List past Threshold DECLARATION P N Java Singularity +String path Name for Logs PARAMETER NM N P NPL Java Singularity +ArrayList paths 2 DECLARATION NPL D Java deeplearning4j +long per Ex Train DECLARATION P N V Java deeplearning4j +String period no Period ATTRIBUTE N DT N Java Spark +float* perlin Map 3D FUNCTION NM N NM C++ freeminer +int pfcp up function features o9 flags DECLARATION PRE NM N V NM NPL C wireshark +void place all FUNCTION V DT C++ panda3d +bool playing Ch 2 ATTRIBUTE V N D C mgba +int plfit errno PARAMETER NM N C rigraph +char* plugin api info color rgb 2 term cb FUNCTION PRE PRE PRE NM N P NM N C weechat +void* plugin if mainwindow update toolbars FUNCTION PRE PRE PRE V NPL C++ wireshark +char plugin name for upgrade ATTRIBUTE NM N P N C weechat +void* plugin script str 2 ptr FUNCTION PRE PRE N P D C weechat +SchemaPlus plus Of This PARAMETER P P DT Java drill +void pointer 1 PARAMETER N D C weechat +int prefix x 1 ATTRIBUTE NM N D C weechat +ShaderContext* prepare now FUNCTION V VM C++ panda3d +Cell prev nonsingleton ATTRIBUTE DT N C++ rigraph +UInt32 price 2 DECLARATION N D C mgba +void print and free json FUNCTION V CJ V N C ovs +Multimap probe Side Scan 2 hj ATTRIBUTE NM NM N P N Java drill +int process IpV6 Packet FUNCTION V NM N Java drill +gboolean* propagate when not up FUNCTION V CJ VM P C naemon-core +CLzmaEncProps props 2 PARAMETER NPL D C toggldesktop +ImmutableList proto classes DECLARATION NM NPL Java immutables +set provided DECLARATION V C++ deeplearning4j +ColumnMetadata provided Column DECLARATION V N Java drill +String provided Password PARAMETER V N Java Openfire +TupleMetadata provided Schema ATTRIBUTE V N Java drill +void ptr 1 PARAMETER N D C++ proxygen +void ptr 2 PARAMETER N D C++ proxygen +t_irc_channel ptr channel 2 DECLARATION NM N D C weechat +char ptr in buf DECLARATION NM NM N C weechat +char ptr next DECLARATION N DT C weechat +t_config_option ptr option 1 DECLARATION NM N D C weechat +mxArray* ptr to handle FUNCTION N P N C++ caffe +node_info q 0 h DECLARATION N D N C++ meta +QueryId q 1 PARAMETER N D C++ drill +QueryId q 2 PARAMETER N D C++ drill +uint64_t r 14 ATTRIBUTE N D C facebook-repo-ds2 +uint32_t r 8 ATTRIBUTE N D C facebook-repo-ds2 +SEXP* R igraph 0 or vector bool to SEXP FUNCTION PRE PRE D CJ NM N P N C rigraph +SEXP R igraph add myid to env FUNCTION PRE PRE V N P N C rigraph +SEXP 
R igraph matrix to SEXP FUNCTION PRE PRE N P N C rigraph +shared_ptr rank 0 PARAMETER N D C++ caffe +bool read only ATTRIBUTE V VM C ovs +bool Read S3fs Passwd File FUNCTION V NM NM N C++ s3fs-fuse +bool refine equal to first ATTRIBUTE V N P DT C++ rigraph +class Regression 2D Adapter CLASS NM NM N Java deeplearning4j +v3f rel cam up DECLARATION NM N P C++ freeminer +void* relay auth parse pbkdf2 FUNCTION PRE PRE V N C weechat +void* remove from menu bar FUNCTION V P NM N C++ panda3d +int* rename object no copy FUNCTION V N DT V C++ s3fs-fuse +class Replace ES419 Language Filter CLASS V NM NM N Java Singularity +bool replace if exists FUNCTION V CJ V C++ meta +void restore degs only FUNCTION V NPL VM C++ rigraph +ProjResult result 2 DECLARATION N D Java drill +int* revwalk next toposort FUNCTION PRE DT N C git2r +char RFC3526 PRIME 4096 DECLARATION NM N D C toggldesktop +int rgb 2 DECLARATION N D Java Smack +double rmu 0 ATTRIBUTE N D C rigraph +int root 1 ATTRIBUTE N D C panda3d +Point3 row 2 DECLARATION N D C++ panda3d +RelDataType row Type 1 PARAMETER NM N D Java drill +uint8_t rtc Free Page 1 ATTRIBUTE PRE V N D C mgba +MediaType RTF UTF8 ATTRIBUTE N N Java guava +bool run once DECLARATION V VM C++ s3fs-fuse +ovs_be64 rx1024 to 1522 packets ATTRIBUTE N P D NPL C ovs +ovs_be64 rx128 to 255 packets ATTRIBUTE N P D NPL C ovs +auto s 1 DECLARATION N D C++ meta +ImmutableBitSet s Gby DECLARATION NM N Java drill +sockaddr_in6 s in 6 DECLARATION NM N D C++ freeminer +ObjectFile s o File PARAMETER NM NM N C++ cling +int s o Timeout DECLARATION NM NM N Java Openfire +bool s set find and delete FUNCTION NM N V CJ V C ovs +S3Artifact s3 Artifact ATTRIBUTE NM N Java Singularity +Set s3 Buckets DECLARATION PRE NPL Java Singularity +List s3 Services ATTRIBUTE PRE NPL Java Singularity +String s3 Uploader Key Pattern DECLARATION PRE NM NM N Java Singularity +boolean s3 Use V2 Signing ATTRIBUTE NM V NM N Java Singularity +S3fsCurl s3fs curl DECLARATION PRE N C++ s3fs-fuse +void* s3fs exit fuse loop FUNCTION PRE NM NM N C++ s3fs-fuse +void* s3fs init FUNCTION PRE V C++ s3fs-fuse +fuse_operations s3fs oper DECLARATION PRE N C++ s3fs-fuse +int* s3fs read link FUNCTION PRE NM N C++ s3fs-fuse +bool s3fs str to offt FUNCTION PRE N P N C++ s3fs-fuse +int* s3fs truncate FUNCTION PRE V C++ s3fs-fuse +int* s3fs utimens FUNCTION PRE N C++ s3fs-fuse +int samples per buffer ATTRIBUTE NPL P N C++ panda3d +void save ATTRIBUTE V C meta +String SAVE AS 16 x 16 ATTRIBUTE V P D P D Java Spark +bool save before unloading DECLARATION V P V C++ freeminer +png_size_t save buffer max ATTRIBUTE V NM N C toggldesktop +size_t save data length ATTRIBUTE NM NM N C toxcore +int save err no DECLARATION V NM N C ovs +Long save Every Ms ATTRIBUTE V DT N Java deeplearning4j +boolean save Every Since Last ATTRIBUTE V DT CJ DT Java deeplearning4j +SingularityCreateResult save Expiring Object FUNCTION V NM N Java Singularity +void save Mail Record FUNCTION V NM N Java Singularity +boolean save Output ATTRIBUTE V N Java deeplearning4j +String save Password DECLARATION V N Java Spark +boolean save Samples FUNCTION V NPL Java drill +SingularityCreateResult save Task History Update FUNCTION V NM NM N Java Singularity +int sbox 4 DECLARATION N D Java Openfire +void* scan for sliders FUNCTION V P NPL C++ panda3d +uint64_t select in word FUNCTION V P N C meta +void send everything FUNCTION V DT C++ panda3d +int send to buffer PARAMETER V P N C weechat +class Separable Convolution 2D CLASS NM N NM Java deeplearning4j +guint32 Server 1 ATTRIBUTE N 
D C wireshark +string server and port DECLARATION N CJ N C++ panda3d +ccv_cnnp_cmd_exec_io_set_by_t set by DECLARATION V P C ccv +void set dampen on bodies FUNCTION V V P NPL C++ panda3d +void set data 4d FUNCTION V N NM C++ panda3d +String SET GROUP NAME 1 ATTRIBUTE V NM N D Java Openfire +void set has blob included in max row size FUNCTION V V N V P NM NM N C++ drill +bool Set IAM v2 API Token FUNCTION V NM NM NM N C++ s3fs-fuse +Builder set Part 1 FUNCTION V N D Java drill +void set server and port FUNCTION V N CJ N C++ panda3d +String SHA256 ALGORITHM ATTRIBUTE NM N Java metrics +int shift by PARAMETER V P C git2r +int short too PARAMETER NM VM C mgba +String show String for Add DECLARATION NM N P N Java Spark +uint64_t significand ATTRIBUTE N C++ toggldesktop +Date since ATTRIBUTE VM Java Smack +class Singularity S3 Service CLASS PRE PRE N Java Singularity +class Singularity S3 Services CLASS PRE PRE NPL Java Singularity +class Singularity S3 Uploader CLASS PRE PRE N Java Singularity +class Singularity S3 Uploader Configuration CLASS PRE PRE NM N Java Singularity +class Singularity S3 Uploader Content Headers CLASS PRE PRE NM NM NPL Java Singularity +class Singularity S3 Uploader File CLASS PRE PRE NM N Java Singularity +uint32_t size from ipv4 DECLARATION N P N C ovs +int size in datum DECLARATION N P N C++ caffe +bool skip im 2 col PARAMETER V N P N C++ caffe +class SLF4J Debugger Factory CLASS PRE NM N Java Smack +void SM83 Instruction IRQ Stall FUNCTION PRE PRE NM N C mgba +bool SM83 Tick Internal FUNCTION PRE N NM C mgba +sockaddr_in sock 4 DECLARATION N D C toxcore +prpack_result* solve via gs FUNCTION V P N C++ rigraph +bool some air DECLARATION DT N C++ freeminer +boolean some Columns Indexed FUNCTION DT NPL V Java drill +int some data not displayed DECLARATION DT NM VM V C weechat +Class some Reference ATTRIBUTE DT N Java drill +class Sort Collector Labels 1 CLASS V NM NPL D C++ panda3d +class Sort Collector Labels 2 CLASS V NM NPL D C++ panda3d +bool sorts less FUNCTION V DT C++ panda3d +int spaces every PARAMETER NPL DT C++ panda3d +void* spell speller add dicts to hash FUNCTION PRE PRE V NPL P N C weechat +void* spx nt prod 1 FUNCTION PRE NM N D C rigraph +void sqlite3 Expr If True FUNCTION PRE N CJ NM C mgba +void sqlite3 Fts3 Hash Find DECLARATION PRE PRE N V C toggldesktop +void* sqlite3 Fts3 Hash Insert FUNCTION PRE PRE N V C ccv +void sqlite3 Fts3 Seg Reader Free DECLARATION PRE PRE NM N V C ccv +int* sqlite3 Fts5 Parser Fallback FUNCTION PRE PRE N V C ccv +void sqlite3 Put 4 byte FUNCTION PRE V D N C ccv +int* sqlite3 rtree geometry callback FUNCTION PRE PRE NM N C ccv +int sqlite3 Where Trace DECLARATION PRE NM N C mgba +With* sqlite3 With Dup FUNCTION PRE P V C ccv +flow_wildcards src 1 PARAMETER N D C ovs +re_node_set src 2 PARAMETER N D C git2r +ovs_be16 src as ATTRIBUTE NM N C ovs +ostringstream ss all DECLARATION N DT C++ s3fs-fuse +stnode_t st arg 2 PARAMETER NM N D C wireshark +size_t start R 1 DECLARATION NM N D C++ meta +int stash update index from diff FUNCTION PRE V N P N C git2r +lm_state state next DECLARATION N DT C++ meta +boolean still Waiting DECLARATION VM V Java Smack +void store Last Err no FUNCTION V DT NM N C ccv +int str 2 int FUNCTION N P N C rigraph +const_iterator str 2 it DECLARATION N D N C++ s3fs-fuse +string str all DECLARATION N DT C++ s3fs-fuse +string str iam v2 token DECLARATION NM NM NM N C++ s3fs-fuse +string str now DECLARATION N VM C++ s3fs-fuse +optional stream for FUNCTION V P C++ meta +git_odb_stream stream out PARAMETER V N 
C git2r +int* string base16 decode FUNCTION N NM V C weechat +int* string base16 encode FUNCTION N NM V C weechat +int string base32 decode FUNCTION N NM V C weechat +void* string conv base64 6 x 4 to 8 x 3 FUNCTION N V N D P D P D P D C weechat +void* string conv base64 8 x 3 to 6 x 4 FUNCTION N V N D P D P D P D C weechat +char string p PARAMETER NM N C naemon-core +int subexp to ATTRIBUTE N P C git2r +int* submodule load each FUNCTION PRE V DT C git2r +List subtypes 1 DECLARATION NPL D Java drill +int sum 2 DECLARATION N D C rigraph +boolean supports Join Push down ATTRIBUTE V V V P Java drill +boolean supports Sort Push down ATTRIBUTE V V V P Java drill +void T 1 Prepare Mun map Code FUNCTION N D V NM NM N C++ facebook-repo-ds2 +PN_stdfloat t 1 y DECLARATION N D N C++ panda3d +uint32_t T 2 MOV 16 Set Immediate FUNCTION N D N D V NM C++ facebook-repo-ds2 +T3 t 3 DECLARATION N D Java immutables +PushTransactionRAII T for Deser DECLARATION N P N C++ cling +timespec t s now DECLARATION NM N VM C++ s3fs-fuse +timeval t v now DECLARATION NM N VM C weechat +ParquetTableMetadata_v4 table Metadata V 4 DECLARATION NM N NM D Java drill +void* tag all FUNCTION V DT C++ panda3d +int target 1 ATTRIBUTE N D C mgba +double task Reconciliation Response P 999 PARAMETER NM NM NM N D Java Singularity +void temp Data 1 ATTRIBUTE NM N D C++ caffe +void temp Data 2 ATTRIBUTE NM N D C++ caffe +fsblkcnt_t ten 00 DECLARATION D D C++ s3fs-fuse +ccv_nnc_tensor_symbol_info_t tensor a PARAMETER NM N C ccv +double term and class FUNCTION N CJ N C++ meta +char text only PARAMETER N VM C libxo +int text search where ATTRIBUTE N V VM C weechat +Occupant that DECLARATION DT Java Openfire +DruidStoragePluginConfig that Config DECLARATION DT N Java drill +LogicalExpression that Expr PARAMETER DT N Java drill +Class the Class DECLARATION DT N Java Openfire +LittleEndianBytes the Getter ATTRIBUTE DT N Java guava +keyValuePair the max DECLARATION DT N C++ rigraph +Filter the New Filter DECLARATION DT NM N Java drill +expected_counts this DECLARATION DT C meta +int this bucket DECLARATION DT N C naemon-core +PyObject this class PARAMETER DT N C++ panda3d +contactgroup this contact group DECLARATION DT NM N C naemon-core +LogicalExpression this Expr PARAMETER DT N Java drill +xodtemplate_host this host DECLARATION DT N C naemon-core +hostescalation this host escalation PARAMETER DT NM N C naemon-core +hostgroup this host group DECLARATION DT NM N C naemon-core +int this index DECLARATION DT N C++ panda3d +void this Instance PARAMETER DT N C toggldesktop +bool this Iteration PARAMETER DT N C++ proxygen +string this Key DECLARATION DT N C++ proxygen +uchar32_t this Letter PARAMETER DT N C++ freeminer +N this Node PARAMETER DT N Java guava +T this Object PARAMETER DT N Java Smack +serviceescalation this service escalation PARAMETER DT NM N C naemon-core +xodtemplate_serviceextinfo this service ext info PARAMETER DT NM NM N C naemon-core +servicesmember this services member DECLARATION DT NM N C naemon-core +Thread thread B to A ATTRIBUTE NM N P N Java Openfire +void throw if error FUNCTION V CJ N C++ meta +NativeLong til or since ATTRIBUTE P CJ P Java Spark +int time for each line ATTRIBUTE N P DT N C weechat +float time from last punch ATTRIBUTE N P DT N C++ freeminer +time_t time now DECLARATION N VM C weechat +float time of day smooth ATTRIBUTE N P N NM C++ freeminer +ifstream titles in DECLARATION NM N C++ meta +uint32_t to Ack DECLARATION P N C++ proxygen +RuntimeFilterWritable to Aggregate DECLARATION P V Java drill 
+ValueVector to Alloc PARAMETER P V Java drill +String to Bare JID FUNCTION P NM N Java Openfire +X509CertSelector to Be Validated DECLARATION P V V Java Spark +size_t to Consume PARAMETER P V C++ proxygen +AttributeBuilderThirdPartyModel to Copy From PARAMETER P V P Java immutables +mp_part_list_t to copy list DECLARATION P NM N C++ s3fs-fuse +DataType to Count PARAMETER P N Java deeplearning4j +CoordinateSystem to cs PARAMETER P N C++ panda3d +Delete to Delete FUNCTION P V Java drill +double to Doubles FUNCTION P NPL Java deeplearning4j +C to Element PARAMETER P N Java guava +List to Field List FUNCTION P NM N Java drill +String to Filename FUNCTION P N Java immutables +Map to Filter Conditions FUNCTION P NM NPL Java drill +ValueVectorReadExpression to Hash Field Exp DECLARATION P NM NM N Java drill +ToHeader to Header DECLARATION P N Java Spark +double to height DECLARATION P N C++ panda3d +int to Index DECLARATION P N Java Openfire +List to Launch DECLARATION P V Java Singularity +vector to merge DECLARATION P V C++ meta +FileMetaData to Merge PARAMETER P V Java drill +int to Read Remaining DECLARATION P V NM Java drill +Prel to Register PARAMETER P V Java drill +char to repo PARAMETER P N C git2r +TupleMetadata to Schema FUNCTION P N Java drill +Long to Snapshot Id ATTRIBUTE P NM N Java drill +Function to Sort Fn DECLARATION P NM N Java immutables +Instant to Time PARAMETER P N Java metrics +MinorType to Types PARAMETER P NPL Java drill +string to upper FUNCTION P NM C++ meta +Deque to Visit ATTRIBUTE P V Java immutables +auto to write DECLARATION P V C++ meta +String to Yaml FUNCTION P N Java deeplearning4j +boolean too Many Shuffling Tasks DECLARATION VM NM V NPL Java Singularity +boolean too Old DECLARATION VM NM Java Singularity +double total oov only DECLARATION NM N VM C++ meta +void* tox self set no spam FUNCTION PRE PRE V DT N C toxcore +igraph_bool_t transpose a PARAMETER V N C rigraph +JPanel tree and Info ATTRIBUTE N CJ N Java Spark +timespec ts 1 PARAMETER N D C++ s3fs-fuse +timespec ts 2 PARAMETER N D C++ s3fs-fuse +ovs_be64 tx1523 to max packets ATTRIBUTE N P NM NPL C ovs +MajorType type 2 PARAMETER N D Java drill +uint32_t u 1 bit count PARAMETER N D NM N C ovs +guint32 u32 Pointer DECLARATION NM N C wireshark +udpif udpif ATTRIBUTE N C ovs +double ul now PARAMETER V VM C++ s3fs-fuse +class Un7zip App CLASS V N C++ toggldesktop +String UNBLOCK CONTACT 16 x 16 ATTRIBUTE V N D P D Java Spark +void unbox or zero FUNCTION V CJ N Java cglib +int under Provisioned Requests PARAMETER P V NPL Java Singularity +uint32_t unknown 20 DECLARATION N D C++ cling +uint32_t unknown 24 DECLARATION N D C++ cling +int until DECLARATION P Java drill +LVector3 up ATTRIBUTE P C++ panda3d +bool Update S3fs Credential FUNCTION V NM N C++ s3fs-fuse +bool Upgrade V 1 Layer Parameter FUNCTION V N D NM N C++ caffe +class Upsampling 2D CLASS N NM Java deeplearning4j +SEXP us PARAMETER N C rigraph +bool use default if empty PARAMETER V N CJ NM C git2r +string username only DECLARATION N VM C++ panda3d +int32_t utf8 next code point FUNCTION N DT NM N C++ meta +int utf8 only ATTRIBUTE N VM C weechat +V1LayerParameter v 0 layer connection PARAMETER N D NM N C++ caffe +float v 00 PARAMETER N D C++ freeminer +int V 1 ATTRIBUTE N D C++ rigraph +int v 1 index PARAMETER N D N Java drill +float v 110 PARAMETER N D C++ freeminer +int V 2 ATTRIBUTE N D C++ rigraph +V v 5 PARAMETER N D Java guava +FvalueFromLiteral val from literal ATTRIBUTE N P N C wireshark +int* validate tree and parents FUNCTION V N CJ NPL C 
git2r +Any value 6 PARAMETER N D C++ toggldesktop +guint8 value a and front reserved DECLARATION NM N CJ N V C wireshark +T value or FUNCTION N CJ C++ meta +NoiseParams value out PARAMETER NM N C++ freeminer +vector vector 1 PARAMETER N D C++ drill +vector vector 2 PARAMETER N D C++ drill +uint32_t* version bitmap from version FUNCTION NM N P N C ovs +bool vertex 0 2 connected ATTRIBUTE N D D V C++ freeminer +igraph_vector_t vertex to the left PARAMETER N P DT NM C rigraph +int vid 1 PARAMETER N D C rigraph +IntType vmax over base DECLARATION N P N C++ deeplearning4j +LongType vol Stride 1 DECLARATION NM N D C++ deeplearning4j +BigIntVector vv 0 DECLARATION N D Java drill +int* w clique 1 FUNCTION NM N D C rigraph +atomic_bool wait for reload ATTRIBUTE V P N C ovs +int wait For S3 Links Seconds ATTRIBUTE V P NM NM NPL Java Singularity +void wait for server start FUNCTION V P NM N C crow +int waiting for dht connection DECLARATION V P NM N C toxcore +Duration warmup time ATTRIBUTE NM N C++ freeminer +ALenum warn if error FUNCTION V CJ N C++ freeminer +uint32_t wc 10 DECLARATION N D C ovs +SCM weechat guile api string input for buffer FUNCTION PRE PRE PRE NM N P N C weechat +void* weechat js unload all FUNCTION PRE PRE V DT C++ weechat +SEXP when DECLARATION VM C git2r +CountDownLatch when Closed ATTRIBUTE VM V Java guava +SqlNode where ATTRIBUTE N Java drill +SqlNode where Clause ATTRIBUTE NM N Java drill +size_t where len PARAMETER NM N C toggldesktop +int which Button DECLARATION DT N C++ panda3d +int64_t while count ATTRIBUTE NM N C ccv +ccv_nnc_symbolic_graph_t while graph PARAMETER NM N C ccv +MediaType with Charset DECLARATION P N Java guava +Builder with Conf FUNCTION P N Java drill +Builder with Entries FUNCTION P NPL Java drill +void with Implicit Columns FUNCTION P NM NPL Java drill +boolean with Index ATTRIBUTE P N Java drill +MaterializedField with Path And Type FUNCTION P N CJ N Java drill +BatchSchemaBuilder with Schema Builder FUNCTION P NM N Java drill +bool with System PARAMETER P N C++ cling +Naming with Unary Operator ATTRIBUTE P NM N Java immutables +FieldReference within ATTRIBUTE P Java drill +char words to add PARAMETER NPL P V C weechat +void work 1 ATTRIBUTE N D C rigraph +CPPType wrapped around PARAMETER V P C++ panda3d +void write Data Page V 2 FUNCTION V NM N NM D Java drill +double x 3 mean ATTRIBUTE N D N C++ deeplearning4j +uint64_t x 4 ATTRIBUTE N D C facebook-repo-ds2 +uint64_t x 8 ATTRIBUTE N D C facebook-repo-ds2 +int x from PARAMETER N P C++ panda3d +png_uint_32 x pixels per unit ATTRIBUTE NM NPL P N C mgba +int x to PARAMETER N P C++ panda3d +int* X509 set1 not Before FUNCTION PRE PRE DT P C toggldesktop +xdfile_t xdf 2 PARAMETER N D C git2r +xdfenv_t xe 2 PARAMETER N D C git2r +objectlist* xodtemplate expand host groups and hosts FUNCTION PRE V NM NPL CJ NPL C naemon-core +class Xpp3 Xml Pull Parser CLASS PRE NM NM N Java Smack +class Xpp3 Xml Pull Parser Factory CLASS PRE NM NM NM N Java Smack +png_uint_32 y pixels per unit ATTRIBUTE D NPL P N C mgba +class YOLO 2 CLASS N D Java deeplearning4j +double z 2 X mag DECLARATION N P N N C++ freeminer +felem z in DECLARATION NM N C toggldesktop +void zero or null FUNCTION N CJ N Java cglib \ No newline at end of file diff --git a/main b/main index 8a68e5d..86ee7bf 100755 --- a/main +++ b/main @@ -1,139 +1,20 @@ #!/usr/bin/env python -import os, sqlite3, random, argparse +import os, argparse from datetime import datetime -from src.classifier_multiclass import perform_classification, TrainingAlgorithm -import 
pandas as pd -import numpy as np +from src.tree_based_tagger.classifier_multiclass import load_config_tree, train_tree +from src.lm_based_tagger.train_model import train_lm from src.tag_identifier import start_server -from src.download_code2vec_vectors import * -from src.feature_generator import custom_to_numeric, universal_to_custom, createFeatures -from src.create_models import createModel, stable_features, mutable_feature_list, columns_to_drop +from src.tree_based_tagger.download_code2vec_vectors import * from version import __version__ +from datasets import Dataset -# Get the directory of the current script SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) def get_version(): """Return the current version of SCANL Tagger.""" return f"SCANL Tagger version {__version__}" -def read_input(sql, features, conn): - """ - Read input data from an SQLite database and preprocess it. - - This function reads data from the specified SQL query and database connection, shuffles the rows, and then applies - a preprocessing function called 'createFeatures' to create additional features. - - Args: - sql (str): The SQL query to fetch data from the database. - conn (sqlite3.Connection): The SQLite database connection. - - Returns: - pandas.DataFrame: A DataFrame containing the preprocessed input data. - """ - input_data = pd.read_sql_query(sql, conn) - print(" -- -- -- -- Read " + str(len(input_data)) + " input rows -- -- -- -- ") - print(input_data.columns) - input_data_copy = input_data.copy() - rows = input_data_copy.values.tolist() - random.shuffle(rows) - shuffled_input_data = pd.DataFrame(rows, columns=input_data.columns) - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - input_data = createFeatures(shuffled_input_data, features, modelGensimEnglish=modelGensimEnglish, modelTokens=modelTokens, modelMethods=modelMethods) - return input_data - -def train(config): - """ - Train a part of speech tagger model using specified features and a training dataset. - This function reads data from an SQLite database, preprocesses it, and performs classification using a specified set - of features. The results are written to an output file, including information about the training process and the - distribution of labels in the training data. - Args: - config (dict): A dictionary containing configuration data. 
- Returns: - None - """ - - # Extract configuration values from the 'config' dictionary - input_file = config['input_file'] - sql_statement = config['sql_statement'] - identifier_column = config['identifier_column'] - dependent_variable = config['dependent_variable'] - pyrandom_seed = config['pyrandom_seed'] - trainingSeed = config['trainingSeed'] - classifierSeed = config['classifierSeed'] - - np.random.seed(config['npseed']) - random.seed(pyrandom_seed) - independent_variables = config['independent_variables'] - - # ############################################################### - print(" -- -- Started: Reading Database -- -- ") - connection = sqlite3.connect(input_file) - df_input = read_input(sql_statement, independent_variables, connection) - print(" -- -- Completed: Reading Input -- -- ") - # ############################################################### - - # Create an explicit copy to avoid SettingWithCopyWarning - #independent_variables.remove("EMB_FEATURES") - df_features = df_input[independent_variables].copy() - df_class = df_input[[dependent_variable]].copy() - - category_variables = [] - categorical_columns = ['NLTK_POS', 'PREV_POS', 'NEXT_POS'] - - # Safely handle categorical variables - for category_column in categorical_columns: - if category_column in df_features.columns: - category_variables.append(category_column) - df_features.loc[:, category_column] = df_features[category_column].astype(str) - - # Ensure output directories exist - output_dir = os.path.join(SCRIPT_DIR, 'output') - os.makedirs(output_dir, exist_ok=True) - - filename = os.path.join(output_dir, 'results.txt') - mode = 'a' if os.path.exists(filename) else 'w' - - with open(filename, mode) as results_text_file: - results_text_file.write(datetime.now().strftime("%H:%M:%S") + "\n") - - # Print config in a readable fashion - results_text_file.write("Configuration:\n") - for key, value in config.items(): - results_text_file.write(f"{key}: {value}\n") - results_text_file.write("\n") - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = df_features[category_column].unique() - category_map = {} - for value in unique_values: - print(value) - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - df_features.loc[:, category_column] = df_features[category_column].map(category_map) - - print(" -- -- Distribution of labels in corpus -- -- ") - print(df_class[dependent_variable].value_counts()) - results_text_file.write(f"SQL: {sql_statement}\n") - results_text_file.write(f"Features: {df_features}\n") - - algorithms = [TrainingAlgorithm.XGBOOST] - #pd.set_option('display.max_rows', None) # Show all rows - pd.set_option('display.max_columns', None) # Show all columns - pd.set_option('display.width', None) # Prevent line wrapping - pd.set_option('display.max_colwidth', None) # Show full content of each cell - - print(df_features) - perform_classification(df_features, df_class, results_text_file, - output_dir, algorithms, trainingSeed, - classifierSeed, columns_to_drop) - if __name__ == "__main__": """ Use argparse to allow the user to choose either running the tagger or training a new tagger @@ -155,47 +36,53 @@ if __name__ == "__main__": Note: If no arguments are provided or if there is an invalid argument, the script will display usage instructions. 
- - Author: Christian Newman + Version: 2.0.0 """ - parser = argparse.ArgumentParser() - + + parser = argparse.ArgumentParser(description="SCALAR identifier tagger") parser.add_argument("-v", "--version", action="store_true", help="print tagger application version") - parser.add_argument("-r", "--run", action="store_true", help="run server for part of speech tagging requests") - parser.add_argument("-t", "--train", action="store_true", help="run training set to retrain the model") - parser.add_argument("-a", "--address", nargs=1, action="store", help="configure server address", ) - parser.add_argument("--port", nargs=1, action="store", help="configure server port") - parser.add_argument("--protocol", nargs=1, action="store", help="configure whether the server uses http or https") - parser.add_argument("--words", nargs=1, action="store", help="provide path to a list of acceptable abbreviations") + # Core run/train model arguments + parser.add_argument("--mode", choices=["train", "run"], required=True, help="Choose to 'train' or 'run' the model") + parser.add_argument("--model_type", choices=["tree_based", "lm_based"], required=True, help="Specify which model type to use") + parser.add_argument("--input_path", type=str, help="Path to TSV file for training") + parser.add_argument("--model_dir", type=str, default="models", help="Directory to load/save model") + parser.add_argument("--config_path", type=str, default="serve.json", help="Path to config JSON (used in run mode)") + + # Run-specific options + parser.add_argument("--port", type=int, help="Port to bind server") + parser.add_argument("--protocol", type=str, help="Protocol (http/https)") + parser.add_argument("--word", type=str, help="Word used in config") + parser.add_argument("--address", type=str, help="Server address") args = parser.parse_args() - + if args.version: print(get_version()) - elif args.run: - download_files() - temp_config = {} - print(args) - if args.address != None: temp_config["address"] = args.address[0] - if args.port != None: temp_config["port"] = args.port[0] - if args.protocol != None: temp_config["protocol"] = args.protocol[0] - if args.words != None: temp_config["words"] = args.words[0] - start_server(temp_config) - elif args.train: - download_files() - # Define a configuration dictionary and pass it to the train function - config = { - 'input_file': os.path.join(SCRIPT_DIR, 'input', 'scanl_tagger_training_db_11_29_2024.db'), - 'sql_statement': 'select * from training_set', - 'identifier_column': "ID", - 'dependent_variable': 'CORRECT_TAG', - 'pyrandom_seed': random.randint(0, 2**32 - 1), - 'trainingSeed': random.randint(0, 2**32 - 1), - 'classifierSeed': random.randint(0, 2**32 - 1), - 'npseed': random.randint(0, 2**32 - 1), - 'independent_variables': stable_features + mutable_feature_list - } - train(config) + elif args.mode == "train": + if args.model_type == "tree_based": + config = load_config_tree(SCRIPT_DIR) + download_files() + train_tree(config) + elif args.model_type == "lm_based": + download_files() + train_lm(SCRIPT_DIR) + + elif args.mode == "run": + + # Inject overrides + config["model_type"] = args.model_type + config["model_dir"] = args.model_dir + + if args.port: + config["port"] = args.port + if args.protocol: + config["protocol"] = args.protocol + if args.word: + config["word"] = args.word + if args.address: + config["address"] = args.address + + start_server(temp_config=config) else: - parser.print_usage() + parser.print_usage() \ No newline at end of file diff --git a/requirements.txt 
b/requirements.txt index 450f86d..51e31b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ filelock==3.17.0 flair==0.15.0 Flask==3.1.0 fonttools==4.55.6 -fsspec==2023.5.0 +fsspec==2024.12.0 ftfy==6.3.1 gdown==5.2.0 gensim==4.3.3 @@ -42,6 +42,18 @@ mpmath==1.3.0 networkx==3.4.2 nltk==3.9.1 numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 packaging==24.2 pandas==2.2.3 pillow==11.1.0 @@ -81,6 +93,7 @@ torch==2.5.1 tqdm==4.67.1 transformer-smaller-training-vocab==0.4.0 transformers==4.48.1 +triton==3.1.0 typing_extensions==4.12.2 tzdata==2025.1 urllib3==2.3.0 @@ -88,4 +101,4 @@ waitress==3.0.2 wcwidth==0.2.13 Werkzeug==3.1.3 Wikipedia-API==0.8.1 -wrapt==1.17.2 +wrapt==1.17.2 \ No newline at end of file diff --git a/src/lm_based_tagger/__init__.py b/src/lm_based_tagger/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py new file mode 100644 index 0000000..874a88a --- /dev/null +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -0,0 +1,187 @@ +# distilbert_preprocessing.py + +import re +from nltk import pos_tag +import nltk +from difflib import SequenceMatcher +import pandas as pd +from datasets import Dataset + +# Download once (we’ll just do it quietly here) +nltk.download('averaged_perceptron_tagger_eng', quiet=True) +nltk.download('universal_tagset', quiet=True) + +# === Constants === +VOWELS = set("aeiou") +LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} + +# Map of context strings ➔ “feature tokens” +CONTEXT_MAP = { + "FUNCTION": "@func", + "PARAMETER": "@param", + "ATTRIBUTE": "@attr", + "DECLARATION": "@decl", + "CLASS": "@class" +} + + +def detect_hungarian_prefix(first_token): + """ + If the first token starts with 1–3 letters followed by an uppercase or underscore, + return "@hung_". Otherwise "@hung_none". + """ + m = re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) + if m: + return f"@hung_{m.group(1).lower()}" + return "@hung_none" + + +def detect_digit_feature(tokens): + """ + If any token has a digit, return "@has_digit", else "@no_digit". + """ + for token in tokens: + if any(char.isdigit() for char in token): + return "@has_digit" + return "@no_digit" + + +def consonant_vowel_ratio_bucket(tokens): + """ + Compute the average consonant/vowel ratio across all alphabetic tokens, + then bucket into low/mid/high. + """ + def ratio(tok): + tok_lower = tok.lower() + num_vowels = sum(1 for c in tok_lower if c in VOWELS) + num_consonants = sum(1 for c in tok_lower if c.isalpha() and c not in VOWELS) + return num_consonants / (num_vowels + 1e-5) + + ratios = [ratio(tok) for tok in tokens if tok.isalpha()] + if not ratios: + return "@cvr_none" + + avg_ratio = sum(ratios) / len(ratios) + if avg_ratio < 1.5: + return "@cvr_low" + elif avg_ratio < 3.0: + return "@cvr_mid" + else: + return "@cvr_high" + + +def system_prefix_similarity(first_token, system_name): + """ + Compute a SequenceMatcher ratio against the system name, then bucket: + >0.9 ➔ "@sim_high", >0.6 ➔ "@sim_mid", >0.3 ➔ "@sim_low", else "@sim_none". 
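+    As a worked illustration (example values, not drawn from the training data):
+    SequenceMatcher(None, "drill", "drill").ratio() is 1.0, so an identifier whose first
+    token matches its system name buckets to "@sim_high", while a first token that shares
+    no characters with the system name has ratio 0.0 and falls through to "@sim_none".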
+ """ + if not first_token or not system_name: + return "@sim_none" + sys_lower = system_name.strip().lower() + tok_lower = first_token.strip().lower() + r = SequenceMatcher(None, tok_lower, sys_lower).ratio() + if r > 0.9: + return "@sim_high" + elif r > 0.6: + return "@sim_mid" + elif r > 0.3: + return "@sim_low" + else: + return "@sim_none" + + +def prepare_dataset(df: pd.DataFrame, label2id: dict): + """ + Takes a DataFrame with columns: + - "tokens" : List[str] (split identifier) + - "tags" : List[str] (gold PoS tags, same length as tokens) + - "CONTEXT" : e.g. "FUNCTION", "PARAMETER", etc. + - "SYSTEM_NAME" : string + + Returns a HuggingFace `datasets.Dataset` with two fields: + - "tokens" : List[List[str]] (the FULL token sequence, including exactly 7 feature tokens + position tokens + identifier tokens) + - "ner_tags" : List[List[int]] (the aligned label IDs, with -100 in front for each feature token) + """ + rows = [] + for _, row in df.iterrows(): + tokens = row["tokens"] + tags = row["tags"] + + # 1. Build 7 feature tokens (context, system, hungarian, cvr, digit, sim, nltk) + context_token = CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown") + system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" + hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" + cvr_token = consonant_vowel_ratio_bucket(tokens) + digit_token = detect_digit_feature(tokens) + sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" + + # 2. NLTK POS tags (universal tagset) + nltk_tags = pos_tag(tokens, tagset="universal") + universal_tags = [tag.lower() for _, tag in nltk_tags] + nltk_feature = f"@nltk_{'-'.join(universal_tags)}" + + # 3. Position tags: interleave with identifier tokens + length = len(tokens) + if length == 1: + pos_tokens = ["@pos_2"] + else: + pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] + tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] + + # 4. Build the “full” token list (7 feature tokens + 2*len(tokens) position‐identifier tokens) + full_tokens = [ + context_token, + system_token, + hungarian_token, + cvr_token, + digit_token, + sim_token, + nltk_feature, + ] + tokens_with_pos + + # 5. Build the aligned labels array: + # - First 7 entries → -100 (because they are feature tokens) + # - Then for each identifier token, [-100, label2id[tag]] + ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] + full_labels = [-100] * 7 + ner_tags_with_pos + + rows.append({ + "tokens": full_tokens, + "ner_tags": full_labels + }) + + return Dataset.from_dict({ + "tokens": [r["tokens"] for r in rows], + "ner_tags": [r["ner_tags"] for r in rows] + }) + + +def tokenize_and_align_labels(example, tokenizer): + """ + example: a dict with + - "tokens" : List[str] (the full token sequence, including exactly 7 feature tokens) + - "ner_tags" : List[int] (same length as above) + + We run `tokenizer(example["tokens"], is_split_into_words=True, truncation=True)`, + then align `word_ids()` with `example["ner_tags"]` exactly as in test.py. 
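+    Illustrative example (assumed values): if the tail of example["tokens"] is
+    ["@pos_2", "counter"] and the tail of example["ner_tags"] is [-100, label2id["N"]],
+    then sub-tokens whose word_id points at a feature or position token keep -100,
+    every sub-token of "counter" receives label2id["N"], and the special [CLS]/[SEP]
+    tokens (word_id None) are likewise labeled -100.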
+ """ + tokenized = tokenizer( + example["tokens"], + truncation=True, + is_split_into_words=True + ) + + labels = [] + word_ids = tokenized.word_ids() + + for word_id in word_ids: + if word_id is None: + labels.append(-100) + elif word_id < len(example["ner_tags"]): + labels.append(example["ner_tags"][word_id]) + else: + # Just in case of truncation + labels.append(-100) + + tokenized["labels"] = labels + return tokenized diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py new file mode 100644 index 0000000..bf3c4b7 --- /dev/null +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -0,0 +1,178 @@ +import re +import torch +from nltk import pos_tag +import nltk +from difflib import SequenceMatcher +from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification + +# Make sure we have the same NLTK tagset +nltk.download('averaged_perceptron_tagger_eng', quiet=True) +nltk.download('universal_tagset', quiet=True) + +VOWELS = set("aeiou") +CONTEXT_MAP = { + "FUNCTION": "@func", + "PARAMETER": "@param", + "ATTRIBUTE": "@attr", + "DECLARATION": "@decl", + "CLASS": "@class" +} + + +def detect_hungarian_prefix(first_token): + m = re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) + if m: + return f"@hung_{m.group(1).lower()}" + return "@hung_none" + + +def detect_digit_feature(tokens): + for token in tokens: + if any(char.isdigit() for char in token): + return "@has_digit" + return "@no_digit" + + +def consonant_vowel_ratio_bucket(tokens): + def ratio(tok): + tok_lower = tok.lower() + num_vowels = sum(1 for c in tok_lower if c in VOWELS) + num_consonants = sum(1 for c in tok_lower if c.isalpha() and c not in VOWELS) + return num_consonants / (num_vowels + 1e-5) + + ratios = [ratio(tok) for tok in tokens if tok.isalpha()] + if not ratios: + return "@cvr_none" + avg_ratio = sum(ratios) / len(ratios) + if avg_ratio < 1.5: + return "@cvr_low" + elif avg_ratio < 3.0: + return "@cvr_mid" + else: + return "@cvr_high" + + +def normalize_type(type_str): + ts = type_str.strip().lower() + ts = ts.replace("*", "_ptr") + ts = ts.replace(" ", "_") + return f"@{ts}" + + +def normalize_language(lang_str): + return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") + + +def system_prefix_similarity(first_token, system_name): + if not first_token or not system_name: + return "@sim_none" + sys_lower = system_name.strip().lower() + tok_lower = first_token.strip().lower() + r = SequenceMatcher(None, tok_lower, sys_lower).ratio() + if r > 0.9: + return "@sim_high" + elif r > 0.6: + return "@sim_mid" + elif r > 0.3: + return "@sim_low" + else: + return "@sim_none" + + +class DistilBertTagger: + def __init__(self, model_path: str): + """ + Expects `model_path` to be a folder where the fine-tuned DistilBertForTokenClassification + (and its tokenizer) have been saved via `trainer.save_model(...)` and `tokenizer.save_pretrained(...)`. 
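+        A minimal usage sketch (the directory name and identifier below are hypothetical):
+
+            tagger = DistilBertTagger("output/best_model")
+            label_ids = tagger.tag_identifier(
+                ["get", "user", "name"],      # pre-split identifier tokens
+                "FUNCTION", "String", "Java", "Openfire"
+            )
+
+        In this version the call returns numeric label IDs, which can be mapped to tag
+        strings through the model's id2label as noted in tag_identifier's docstring.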
+ """ + self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path) + self.model = DistilBertForTokenClassification.from_pretrained(model_path) + self.model.eval() + + def tag_identifier(self, tokens, context, type_str, language, system_name): + """ + 1) Build the “feature tokens + position tokens + identifier tokens” sequence + 2) Tokenize with `is_split_into_words=True` + 3) Run the model, take argmax over token logits + 4) Align via `word_ids()`, skipping: + - Any word_id = None + - Any word_id < 9 (because first 9 tokens were “feature tokens” => labels = -100) + - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier‐word)” pair) + 5) Return a list of numeric labels. (If you want strings, you can map via id2label externally.) + """ + + # 1. Re–compute exactly the same feature tokens as in training: + context_token = CONTEXT_MAP.get(context.strip().upper(), "@unknown") + system_token = f"@system_{system_name.strip().lower().replace(' ', '_')}" + hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" + cvr_token = consonant_vowel_ratio_bucket(tokens) + digit_token = detect_digit_feature(tokens) + sim_token = system_prefix_similarity(tokens[0], system_name) if tokens else "@sim_none" + type_token = normalize_type(type_str) + lang_token = normalize_language(language) + + # Position tags for each identifier token + length = len(tokens) + if length == 1: + pos_tokens = ["@pos_2"] + else: + pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] + + # NLTK POS feature + nltk_tags = pos_tag(tokens, tagset="universal") + universal_tags = [tag.lower() for _, tag in nltk_tags] + nltk_feature = f"@nltk_{'-'.join(universal_tags)}" + + # Interleave pos_tokens + identifier tokens + tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] + + # Build the full input token sequence (exactly what training saw): + input_tokens = [ + context_token, + system_token, + hungarian_token, + cvr_token, + digit_token, + sim_token, + type_token, + lang_token, + nltk_feature + ] + tokens_with_pos + + # 2. Tokenize + encoded = self.tokenizer( + input_tokens, + is_split_into_words=True, + return_tensors="pt", + truncation=True, + padding=True + ) + + # 3. Inference + with torch.no_grad(): + logits = self.model( + input_ids=encoded["input_ids"], + attention_mask=encoded["attention_mask"] + )[0] + + # 4. Take argmax, then align via word_ids() + predictions = torch.argmax(logits, dim=-1).squeeze().tolist() + word_ids = encoded.word_ids() + + pred_labels = [] + previous_word_idx = None + + for idx, word_idx in enumerate(word_ids): + # Skip if special token (None), or if it's part of the first 9 “feature tokens” + if word_idx is None or word_idx < 9: + continue + # Skip if it’s the same word_idx as the previous (to avoid sub-token duplicates) + if word_idx == previous_word_idx: + continue + + pred_labels.append(predictions[idx]) + previous_word_idx = word_idx + + # Now, pred_labels is a list of numeric IDs (length == len(tokens)), + # in the same order as your original “tokens” list. 
+ return pred_labels diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py new file mode 100644 index 0000000..cb9358a --- /dev/null +++ b/src/lm_based_tagger/train_model.py @@ -0,0 +1,127 @@ +import os +import pandas as pd +from sklearn.model_selection import train_test_split +import torch +from transformers import ( + Trainer, + TrainingArguments, + DistilBertTokenizerFast, + DistilBertConfig, + DistilBertForTokenClassification, + DataCollatorForTokenClassification +) +from datasets import Dataset +from src.lm_based_tagger.distilbert_preprocessing import prepare_dataset, tokenize_and_align_labels + +# === Labels & Mappings === +LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] +LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} +ID2LABEL = {i: label for label, i in LABEL2ID.items()} + +def train_lm(script_dir: str): + input_path = os.path.join(script_dir, "input", "tagger_data.tsv") + output_dir = os.path.join(script_dir, "output") + os.makedirs(output_dir, exist_ok=True) + + # 1) Read TSV & build tokens/tags lists + df = pd.read_csv(input_path, sep="\t", dtype=str).dropna(subset=["SPLIT", "GRAMMAR_PATTERN"]) + df = df[df["SPLIT"].str.strip().astype(bool)] + df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split()) + df["tags"] = df["GRAMMAR_PATTERN"].apply(lambda x: x.strip().split()) + df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)] + + # 2) Train/Test split (stratify by CONTEXT) + train_df, test_df = train_test_split( + df, test_size=0.15, random_state=42, stratify=df["CONTEXT"] + ) + + # 3) Upsample low-frequency tags (in training set only) + low_freq_tags = {"CJ", "VM", "PRE", "V"} + low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in low_freq_tags for t in tags))] + train_df = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) + + # 4) Tokenizer + tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") + + # 5) Convert each split into a HF Dataset via the shared prepare_dataset(...) 
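+    #    Illustrative shape of one prepared row (example identifier "get user name" in a
+    #    FUNCTION context; the @system_*, @cvr_*, @sim_* and @nltk_* values are placeholders):
+    #      tokens   = ["@func", "@system_*", "@hung_none", "@cvr_*", "@no_digit", "@sim_*",
+    #                  "@nltk_*", "@pos_0", "get", "@pos_1", "user", "@pos_2", "name"]
+    #      ner_tags = [-100]*7 + [-100, id_v, -100, id_nm, -100, id_n]
+    #    i.e. label IDs (here e.g. V/NM/N) appear only on identifier words, never on
+    #    feature or position tokens.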
+ train_dataset = prepare_dataset(train_df, LABEL2ID) + test_dataset = prepare_dataset(test_df, LABEL2ID) + + # 6) Tokenize + align labels + tokenized_train = train_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False + ) + tokenized_test = test_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False + ) + + # 7) Build config & model using uncased vocab + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(LABEL_LIST), + id2label=ID2LABEL, + label2id=LABEL2ID + ) + model = DistilBertForTokenClassification.from_pretrained( + "distilbert-base-uncased", + config=config + ) + + # 8) Training arguments + training_args = TrainingArguments( + output_dir=output_dir, + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=10, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + logging_dir=os.path.join(output_dir, "logs"), + report_to="none", + seed=42 + ) + + # 9) Collate Data + data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) + + # 10) Macro‐F1 computation + def compute_metrics(eval_pred): + from sklearn.metrics import f1_score + logits, labels = eval_pred + preds = logits.argmax(axis=-1) + + true_preds = [] + true_labels = [] + for pred_row, label_row in zip(preds, labels): + for p, l in zip(pred_row, label_row): + if l != -100: + true_preds.append(p) + true_labels.append(l) + + macro_f1 = f1_score(true_labels, true_preds, average="macro") + return {"eval_macro_f1": macro_f1} + + # 11) Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train, + eval_dataset=tokenized_test, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics + ) + + # 12) Train & save + trainer.train() + trainer.save_model(output_dir) + tokenizer.save_pretrained(output_dir) diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 305e390..fdc42fc 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -3,16 +3,21 @@ import joblib import nltk import pandas as pd -from src.feature_generator import createFeatures, universal_to_custom, custom_to_numeric -from flask import Flask +from flask import Flask, request from waitress import serve from spiral import ronin import json import sqlite3 -from src.create_models import createModel, stable_features, mutable_feature_list +from src.tree_based_tagger.feature_generator import createFeatures, universal_to_custom, custom_to_numeric +from src.tree_based_tagger.create_models import createModel, stable_features, mutable_feature_list +from src.lm_based_tagger.distilbert_tagger import DistilBertTagger + app = Flask(__name__) SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +model_type = None +lm_model = None + class ModelData: def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> None: """ @@ -28,7 +33,6 @@ def __init__(self, modelTokens, modelMethods, modelGensimEnglish, wordCount) -> self.ModelMethods = modelMethods self.ModelGensimEnglish = modelGensimEnglish self.wordCount = wordCount - # self.ModelClassifier = joblib.load('output/model_RandomForestClassifier.pkl') class AppCache: def __init__(self, Path) -> None: @@ -127,7 +131,7 @@ def load(self): def find(self, item): return item in self.Words -def 
initialize_model(): +def initialize_model(selected_model_type): """ Initialize and load word vectors for the application, and load a word count DataFrame. @@ -137,23 +141,25 @@ def initialize_model(): Returns: tuple: (ModelData, WORD_COUNT DataFrame) """ - print("Loading word vectors!!") - modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) - print("Word vectors loaded!!") - - # Load the word count JSON file into a DataFrame - word_count_path = os.path.join("input", "word_count.json") - if os.path.exists(word_count_path): - print(f"Loading word count data from {word_count_path}...") - word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() - word_count_df.columns = ['word', 'log_frequency'] - print("Word count data loaded!") - else: - print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") - word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) - - # Create and store model data - app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + global model_type, lm_model + model_type = selected_model_type + if model_type == "tree_based": + print("Loading word vectors!!") + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) + print("Word vectors loaded!!") + word_count_path = os.path.join("input", "word_count.json") + if os.path.exists(word_count_path): + print(f"Loading word count data from {word_count_path}...") + word_count_df = pd.read_json(word_count_path, orient='index', typ='series').reset_index() + word_count_df.columns = ['word', 'log_frequency'] + else: + print(f"Word count file not found at {word_count_path}. Initializing empty DataFrame.") + word_count_df = pd.DataFrame(columns=['word', 'log_frequency']) + app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) + elif model_type == "lm_based": + print("Loading DistilBERT tagger...") + lm_model = DistilBertTagger(SCRIPT_DIR) + print("DistilBERT tagger loaded!") def start_server(temp_config = {}): """ @@ -169,12 +175,13 @@ def start_server(temp_config = {}): None """ print('initializing model...') - initialize_model() + selected_model = temp_config.get("model_type", "tree_based") + initialize_model(selected_model) print("loading cache...") if not os.path.isdir("cache"): os.mkdir("cache") - print("laoding dictionary") + print("loading dictionary") app.english_words = set(w.lower() for w in nltk.corpus.words.words()) #insert english words from words/en.txt @@ -248,6 +255,10 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) cache = AppCache("cache/"+cache_id+".db3") cache.load() + system_name = request.args.get("system_name", default="") + programming_language = request.args.get("language", default="") + data_type = request.args.get("type", default="") + #TODO: update this documentation """ Process a web request to analyze an identifier within a specific context. 
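(A minimal client-side sketch of how the three new query parameters can be supplied; the
route path below is hypothetical, since the @app.route pattern for listen() is outside
this hunk:

    import requests
    resp = requests.get(
        "http://127.0.0.1:5000/probe/getUserName/FUNCTION",   # hypothetical cache_id/identifier/context path
        params={"system_name": "Openfire", "language": "Java", "type": "String"},
    )
    print(resp.json())

The three parameters are read via request.args.get with empty-string defaults and passed
on to the lm-based tagger as feature inputs.)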
@@ -267,7 +278,20 @@ def listen(identifier_name: str, identifier_context: str, cache_id: str = None) # get the start time start_time = time.perf_counter() - + + if model_type == "lm_based": + result = { + "words": [] + } + tags = lm_model.predict(words, identifier_context, programming_language, data_type, system_name) + for word, tag in zip(words, tags): + dictionary = dictionary_lookup(word) + result["words"].append({word: {"tag": tag, "dictionary": dictionary}}) + tag_time = time.perf_counter() - start_time + if cache_id: + AppCache(f"cache/{cache_id}.db3").add(identifier_name, result, identifier_context, tag_time) + return result + # Split identifier_name into words words = ronin.split(identifier_name) diff --git a/src/tree_based_tagger/__init__.py b/src/tree_based_tagger/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/classifier_multiclass.py b/src/tree_based_tagger/classifier_multiclass.py similarity index 64% rename from src/classifier_multiclass.py rename to src/tree_based_tagger/classifier_multiclass.py index 104926f..66378d3 100644 --- a/src/classifier_multiclass.py +++ b/src/tree_based_tagger/classifier_multiclass.py @@ -7,14 +7,19 @@ from sklearn.metrics import f1_score from sklearn.metrics import matthews_corrcoef from sklearn.metrics import make_scorer -from sklearn.metrics import classification_report, precision_recall_fscore_support -from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold, cross_val_predict +from sklearn.metrics import classification_report +from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict from sklearn.model_selection import train_test_split from sklearn.inspection import permutation_importance +from src.tree_based_tagger.feature_generator import custom_to_numeric, universal_to_custom, createFeatures +from src.tree_based_tagger.create_models import createModel, stable_features, mutable_feature_list, columns_to_drop import pandas as pd from enum import Enum -import src.feature_generator import multiprocessing +import os, sqlite3, random +import pandas as pd +import numpy as np +from datetime import datetime class TrainingAlgorithm(Enum): RANDOM_FOREST = "RandomForest" @@ -62,6 +67,138 @@ def __init__(self, X_train, X_test, y_train, y_test, X_train_original, X_test_or self.X_test_original = X_test_original self.labels = labels +def load_config_tree(SCRIPT_DIR): + # Mimic Python-based config instead of JSON + config = { + 'script_dir': SCRIPT_DIR, + 'input_file': os.path.join(SCRIPT_DIR, 'input', 'scanl_tagger_training_db_11_29_2024.db'), + 'sql_statement': 'select * from training_set', + 'identifier_column': "ID", + 'dependent_variable': 'CORRECT_TAG', + 'pyrandom_seed': random.randint(0, 2**32 - 1), + 'trainingSeed': random.randint(0, 2**32 - 1), + 'classifierSeed': random.randint(0, 2**32 - 1), + 'npseed': random.randint(0, 2**32 - 1), + 'independent_variables': stable_features + mutable_feature_list + } + print(config) + return config + +def read_input(sql, features, conn, config): + """ + Read input data from an SQLite database and preprocess it. + + This function reads data from the specified SQL query and database connection, shuffles the rows, and then applies + a preprocessing function called 'createFeatures' to create additional features. + + Args: + sql (str): The SQL query to fetch data from the database. + conn (sqlite3.Connection): The SQLite database connection. + + Returns: + pandas.DataFrame: A DataFrame containing the preprocessed input data. 
+ """ + input_data = pd.read_sql_query(sql, conn) + print(" -- -- -- -- Read " + str(len(input_data)) + " input rows -- -- -- -- ") + print(input_data.columns) + input_data_copy = input_data.copy() + rows = input_data_copy.values.tolist() + random.shuffle(rows) + shuffled_input_data = pd.DataFrame(rows, columns=input_data.columns) + modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=config['script_dir']) + input_data = createFeatures(shuffled_input_data, features, modelGensimEnglish=modelGensimEnglish, modelTokens=modelTokens, modelMethods=modelMethods) + return input_data + +def train_tree(config): + """ + Train a part of speech tagger model using specified features and a training dataset. + This function reads data from an SQLite database, preprocesses it, and performs classification using a specified set + of features. The results are written to an output file, including information about the training process and the + distribution of labels in the training data. + Args: + config (dict): A dictionary containing configuration data. + Returns: + None + """ + + # Extract configuration values from the 'config' dictionary + input_file = config['input_file'] + sql_statement = config['sql_statement'] + identifier_column = config['identifier_column'] + dependent_variable = config['dependent_variable'] + pyrandom_seed = config['pyrandom_seed'] + trainingSeed = config['trainingSeed'] + classifierSeed = config['classifierSeed'] + + np.random.seed(config['npseed']) + random.seed(pyrandom_seed) + independent_variables = config['independent_variables'] + + # ############################################################### + print(" -- -- Started: Reading Database -- -- ") + connection = sqlite3.connect(input_file) + df_input = read_input(sql_statement, independent_variables, connection, config) + print(" -- -- Completed: Reading Input -- -- ") + # ############################################################### + + # Create an explicit copy to avoid SettingWithCopyWarning + #independent_variables.remove("EMB_FEATURES") + df_features = df_input[independent_variables].copy() + df_class = df_input[[dependent_variable]].copy() + + category_variables = [] + categorical_columns = ['NLTK_POS', 'PREV_POS', 'NEXT_POS'] + + # Safely handle categorical variables + for category_column in categorical_columns: + if category_column in df_features.columns: + category_variables.append(category_column) + df_features.loc[:, category_column] = df_features[category_column].astype(str) + + # Ensure output directories exist + output_dir = os.path.join(config['script_dir'], 'output') + os.makedirs(output_dir, exist_ok=True) + + filename = os.path.join(output_dir, 'results.txt') + mode = 'a' if os.path.exists(filename) else 'w' + + with open(filename, mode) as results_text_file: + results_text_file.write(datetime.now().strftime("%H:%M:%S") + "\n") + + # Print config in a readable fashion + results_text_file.write("Configuration:\n") + for key, value in config.items(): + results_text_file.write(f"{key}: {value}\n") + results_text_file.write("\n") + + for category_column in category_variables: + # Explicitly handle categorical conversion + unique_values = df_features[category_column].unique() + category_map = {} + for value in unique_values: + print(value) + if value in universal_to_custom: + category_map[value] = custom_to_numeric[universal_to_custom[value]] + else: + category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories + + df_features.loc[:, category_column] = 
df_features[category_column].map(category_map) + + print(" -- -- Distribution of labels in corpus -- -- ") + print(df_class[dependent_variable].value_counts()) + results_text_file.write(f"SQL: {sql_statement}\n") + results_text_file.write(f"Features: {df_features}\n") + + algorithms = [TrainingAlgorithm.XGBOOST] + #pd.set_option('display.max_rows', None) # Show all rows + pd.set_option('display.max_columns', None) # Show all columns + pd.set_option('display.width', None) # Prevent line wrapping + pd.set_option('display.max_colwidth', None) # Show full content of each cell + + print(df_features) + perform_classification(df_features, df_class, results_text_file, + output_dir, algorithms, trainingSeed, + classifierSeed, columns_to_drop) def build_datasets(X, y, output_directory, trainingSeed): # Ensure the output directory exists os.makedirs(output_directory, exist_ok=True) diff --git a/src/create_models.py b/src/tree_based_tagger/create_models.py similarity index 100% rename from src/create_models.py rename to src/tree_based_tagger/create_models.py diff --git a/src/download_code2vec_vectors.py b/src/tree_based_tagger/download_code2vec_vectors.py similarity index 100% rename from src/download_code2vec_vectors.py rename to src/tree_based_tagger/download_code2vec_vectors.py diff --git a/src/feature_generator.py b/src/tree_based_tagger/feature_generator.py similarity index 100% rename from src/feature_generator.py rename to src/tree_based_tagger/feature_generator.py From 347ef4edd46cfea2be70f255be7aaddd4102c48e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Mon, 2 Jun 2025 21:57:21 -0400 Subject: [PATCH 32/51] Prepare to re-add kfold --- src/lm_based_tagger/train_model.py | 67 ++++++++++++++++++-------- src/tree_based_tagger/create_models.py | 45 ----------------- 2 files changed, 47 insertions(+), 65 deletions(-) diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index cb9358a..dfa8b25 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -13,6 +13,9 @@ from datasets import Dataset from src.lm_based_tagger.distilbert_preprocessing import prepare_dataset, tokenize_and_align_labels +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print("Using device:", device) + # === Labels & Mappings === LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} @@ -69,26 +72,50 @@ def train_lm(script_dir: str): config=config ) - # 8) Training arguments - training_args = TrainingArguments( - output_dir=output_dir, - evaluation_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=10, - weight_decay=0.01, - warmup_ratio=0.1, - lr_scheduler_type="cosine", - load_best_model_at_end=True, - metric_for_best_model="eval_macro_f1", - greater_is_better=True, - save_total_limit=1, - logging_dir=os.path.join(output_dir, "logs"), - report_to="none", - seed=42 - ) + if device == "cpu": + # 8) Training arguments + training_args = TrainingArguments( + output_dir=output_dir, + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=10, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + 
logging_dir=os.path.join(output_dir, "logs"), + report_to="none", + seed=42 + ) + else: + training_args = TrainingArguments( + output_dir=output_dir, + eval_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=4, # ↓ reduce to fit in VRAM + per_device_eval_batch_size=4, + gradient_accumulation_steps=4, # simulates batch size of 16 + num_train_epochs=10, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + save_total_limit=1, + metric_for_best_model="eval_macro_f1", # or "eval_loss" if macro F1 isn't computed + greater_is_better=True, # set to False if using loss + logging_dir=os.path.join(output_dir, "logs"), + report_to="none", + seed=42, + fp16=False, + dataloader_pin_memory=False, # no benefit if no CUDA pinning + ) # 9) Collate Data data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) diff --git a/src/tree_based_tagger/create_models.py b/src/tree_based_tagger/create_models.py index 6a13b66..451d697 100644 --- a/src/tree_based_tagger/create_models.py +++ b/src/tree_based_tagger/create_models.py @@ -1,7 +1,5 @@ import gensim.downloader as api -from gensim.models import KeyedVectors as word2vec import json, os -from gensim.models import KeyedVectors import logging #'VERB_SCORE', 'DET_SCORE', 'ENGLISHV_SCORE', 'POSITION_RATIO','METHODV_SCORE', 'CONTAINSLISTVERB' stable_features = ['WORD', 'SPLIT_IDENTIFIER', 'CONTEXT_NUMBER'] #'LANGUAGE' 'PREP_SCORE' 'CONTAINSLISTVERB','CONTAINSCLOSEDSET' @@ -75,47 +73,4 @@ def createModel(pklFile="", rootDir=""): method_txt_path = os.path.join(rootDir, 'code2vec', 'target_vecs.txt') method_native_path = os.path.join(rootDir, 'code2vec', 'target_vecs.kv') - return modelGensimTokens, modelGensimMethods, modelGensimEnglish - - # Helper function to load models safely - def load_model(txt_path, native_path, model_name): - """ - Load a word vector model, converting from text format if necessary. - - Args: - txt_path (str): Path to the text-based word vectors. - native_path (str): Path to the native .kv format file. - model_name (str): Name of the model for logging. - - Returns: - KeyedVectors or None: The loaded model, or None if unavailable. - """ - try: - if os.path.exists(native_path): - logger.info(f"Loading {model_name} from native format...") - return KeyedVectors.load(native_path) - - elif os.path.exists(txt_path): - logger.info(f"Native format for {model_name} not found. Converting from text format...") - model = KeyedVectors.load_word2vec_format(txt_path, binary=False) - try: - model.save(native_path) - logger.info(f"{model_name} vectors converted and saved to {native_path}") - except PermissionError: - logger.warning(f"Permission denied when saving {model_name} to {native_path}. Using in-memory only.") - return model - - else: - logger.warning(f"{model_name} vector file not found at {txt_path} or {native_path}. 
Skipping.") - return None - - except Exception as e: - logger.warning(f"Failed to load {model_name}: {e}") - return None - - # Load models with the new safe function - modelGensimTokens = load_model(token_txt_path, token_native_path, "Token vectors") - modelGensimMethods = load_model(method_txt_path, method_native_path, "Method vectors") - - logger.info("Model loading complete.") return modelGensimTokens, modelGensimMethods, modelGensimEnglish \ No newline at end of file From 2417e49cc4f9758fd785bd282ce2a4683624d5c4 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Tue, 3 Jun 2025 10:42:17 -0400 Subject: [PATCH 33/51] Load model when server runs, listen for url --- main | 34 ++- src/lm_based_tagger/distilbert_tagger.py | 15 +- src/lm_based_tagger/train_model.py | 357 ++++++++++++++++------- src/tag_identifier.py | 145 ++++----- src/tree_based_tagger/create_models.py | 2 +- 5 files changed, 339 insertions(+), 214 deletions(-) diff --git a/main b/main index 86ee7bf..07d795b 100755 --- a/main +++ b/main @@ -4,6 +4,7 @@ import os, argparse from datetime import datetime from src.tree_based_tagger.classifier_multiclass import load_config_tree, train_tree from src.lm_based_tagger.train_model import train_lm +from src.lm_based_tagger.distilbert_tagger import DistilBertTagger from src.tag_identifier import start_server from src.tree_based_tagger.download_code2vec_vectors import * from version import __version__ @@ -69,20 +70,29 @@ if __name__ == "__main__": train_lm(SCRIPT_DIR) elif args.mode == "run": + if args.model_type == "tree_based": + config = load_config_tree() + # Inject overrides + download_files() + config["model_type"] = args.model_type + config["model_dir"] = args.model_dir - # Inject overrides - config["model_type"] = args.model_type - config["model_dir"] = args.model_dir + if args.port: + config["port"] = args.port + if args.protocol: + config["protocol"] = args.protocol + if args.word: + config["word"] = args.word + if args.address: + config["address"] = args.address - if args.port: - config["port"] = args.port - if args.protocol: - config["protocol"] = args.protocol - if args.word: - config["word"] = args.word - if args.address: - config["address"] = args.address + start_server(temp_config=config) + elif args.model_type == "lm_based": + start_server(temp_config={ + 'script_dir': SCRIPT_DIR, + 'model': os.path.join(SCRIPT_DIR, 'output', 'best_model'), + 'model_type':'lm_based' + }) - start_server(temp_config=config) else: parser.print_usage() \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index bf3c4b7..322d3e0 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -89,6 +89,10 @@ def __init__(self, model_path: str): self.model = DistilBertForTokenClassification.from_pretrained(model_path) self.model.eval() + # ── Extract id2label from the saved config.json ── + # model.config.id2label maps string keys ("0", "1", ...) to tag names (e.g. "N", "V", "PRE", ...) 
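+        # (Illustrative: the saved config.json serializes id2label with string keys, e.g.
+        #  {"id2label": {"0": "CJ", "1": "D", ...}}, hence the int() cast so the dict can be
+        #  indexed directly with argmax outputs.)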
+ self.id2label = { int(k): v for k, v in self.model.config.id2label.items() } + def tag_identifier(self, tokens, context, type_str, language, system_name): """ 1) Build the “feature tokens + position tokens + identifier tokens” sequence @@ -97,8 +101,8 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): 4) Align via `word_ids()`, skipping: - Any word_id = None - Any word_id < 9 (because first 9 tokens were “feature tokens” => labels = -100) - - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier‐word)” pair) - 5) Return a list of numeric labels. (If you want strings, you can map via id2label externally.) + - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier-word)” pair) + 5) Return a list of string labels by mapping numeric IDs through `self.id2label`. """ # 1. Re–compute exactly the same feature tokens as in training: @@ -173,6 +177,7 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): pred_labels.append(predictions[idx]) previous_word_idx = word_idx - # Now, pred_labels is a list of numeric IDs (length == len(tokens)), - # in the same order as your original “tokens” list. - return pred_labels + # 5. Map numeric IDs → string tags via self.id2label + pred_tag_strings = [ self.id2label[label_id] for label_id in pred_labels ] + + return pred_tag_strings diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index dfa8b25..5f2db21 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -1,154 +1,297 @@ +# train_model.py + import os +import time +import random + +import numpy as np import pandas as pd -from sklearn.model_selection import train_test_split import torch + +from sklearn.model_selection import train_test_split, KFold +from sklearn.metrics import f1_score, accuracy_score, classification_report + from transformers import ( Trainer, TrainingArguments, DistilBertTokenizerFast, DistilBertConfig, DistilBertForTokenClassification, - DataCollatorForTokenClassification + DataCollatorForTokenClassification, + EarlyStoppingCallback ) + from datasets import Dataset from src.lm_based_tagger.distilbert_preprocessing import prepare_dataset, tokenize_and_align_labels +# If CUDA is available, use it; otherwise fallback to CPU device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device) -# === Labels & Mappings === +# === Random Seeds === +# Match test.py’s seed settings for reproducibility :contentReference[oaicite:0]{index=0} +RAND_STATE = 209 +random.seed(RAND_STATE) +np.random.seed(RAND_STATE) +torch.manual_seed(RAND_STATE) +torch.cuda.manual_seed_all(RAND_STATE) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# === Hyperparameters / Config === +K = 2 # number of CV folds +HOLDOUT_RATIO = 0.15 # 15% held out for final evaluation +EPOCHS = 5 # number of epochs per fold +EARLY_STOP = 2 # patience for early stopping +LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} + +# === Label List & Mappings (unchanged from your original) :contentReference[oaicite:1]{index=1} === LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} ID2LABEL = {i: label for label, i in LABEL2ID.items()} + def train_lm(script_dir: str): + # 1) Paths input_path = os.path.join(script_dir, "input", "tagger_data.tsv") output_dir = os.path.join(script_dir, "output") os.makedirs(output_dir, exist_ok=True) - # 1) 
Read TSV & build tokens/tags lists + # 2) Read the TSV & build “tokens” / “tags” columns :contentReference[oaicite:2]{index=2} df = pd.read_csv(input_path, sep="\t", dtype=str).dropna(subset=["SPLIT", "GRAMMAR_PATTERN"]) df = df[df["SPLIT"].str.strip().astype(bool)] df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split()) df["tags"] = df["GRAMMAR_PATTERN"].apply(lambda x: x.strip().split()) + # Keep only rows where len(tokens) == len(tags) df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)] - # 2) Train/Test split (stratify by CONTEXT) - train_df, test_df = train_test_split( - df, test_size=0.15, random_state=42, stratify=df["CONTEXT"] + # 3) Initial Train/Val Split (15% hold-out) :contentReference[oaicite:3]{index=3} + train_df, val_df = train_test_split( + df, + test_size=HOLDOUT_RATIO, + random_state=RAND_STATE, + stratify=df["CONTEXT"] ) - # 3) Upsample low-frequency tags (in training set only) - low_freq_tags = {"CJ", "VM", "PRE", "V"} - low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in low_freq_tags for t in tags))] - train_df = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) + # 4) Upsample low-frequency tags **in the training set only** :contentReference[oaicite:4]{index=4} + low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in LOW_FREQ_TAGS for t in tags))] + train_df_upsampled = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) - # 4) Tokenizer + # 5) Tokenizer (uncased, matching test.py) tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") - # 5) Convert each split into a HF Dataset via the shared prepare_dataset(...) - train_dataset = prepare_dataset(train_df, LABEL2ID) - test_dataset = prepare_dataset(test_df, LABEL2ID) - - # 6) Tokenize + align labels - tokenized_train = train_dataset.map( - lambda ex: tokenize_and_align_labels(ex, tokenizer), - batched=False - ) - tokenized_test = test_dataset.map( + # 6) Prepare final hold-out “validation” Dataset :contentReference[oaicite:5]{index=5} + val_dataset = prepare_dataset(val_df, LABEL2ID) + tokenized_val = val_dataset.map( lambda ex: tokenize_and_align_labels(ex, tokenizer), batched=False ) - # 7) Build config & model using uncased vocab - config = DistilBertConfig.from_pretrained( - "distilbert-base-uncased", - num_labels=len(LABEL_LIST), - id2label=ID2LABEL, - label2id=LABEL2ID - ) - model = DistilBertForTokenClassification.from_pretrained( - "distilbert-base-uncased", - config=config - ) + # 7) Set up K-Fold + kf = KFold(n_splits=K, shuffle=True, random_state=RAND_STATE) + best_macro_f1 = -1.0 + best_model_dir = None + + fold = 1 + for train_idx, test_idx in kf.split(train_df_upsampled): + print(f"\n=== Fold {fold} ===") + + # 7a) Split the upsampled DataFrame into this fold’s train/test + fold_train_df = train_df_upsampled.iloc[train_idx].reset_index(drop=True) + fold_test_df = train_df_upsampled.iloc[test_idx].reset_index(drop=True) - if device == "cpu": - # 8) Training arguments - training_args = TrainingArguments( - output_dir=output_dir, - evaluation_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=10, - weight_decay=0.01, - warmup_ratio=0.1, - lr_scheduler_type="cosine", - load_best_model_at_end=True, - metric_for_best_model="eval_macro_f1", - greater_is_better=True, - save_total_limit=1, - logging_dir=os.path.join(output_dir, "logs"), - report_to="none", - seed=42 + # 7b) Build HuggingFace Datasets via 
prepare_dataset(...) :contentReference[oaicite:6]{index=6} + fold_train_dataset = prepare_dataset(fold_train_df, LABEL2ID) + fold_test_dataset = prepare_dataset(fold_test_df, LABEL2ID) + + # 7c) Tokenize + align labels (exactly as before) :contentReference[oaicite:7]{index=7} + tokenized_train = fold_train_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False + ) + tokenized_test = fold_test_dataset.map( + lambda ex: tokenize_and_align_labels(ex, tokenizer), + batched=False ) - else: - training_args = TrainingArguments( - output_dir=output_dir, - eval_strategy="epoch", - save_strategy="epoch", - learning_rate=5e-5, - per_device_train_batch_size=4, # ↓ reduce to fit in VRAM - per_device_eval_batch_size=4, - gradient_accumulation_steps=4, # simulates batch size of 16 - num_train_epochs=10, - weight_decay=0.01, - warmup_ratio=0.1, - lr_scheduler_type="cosine", - load_best_model_at_end=True, - save_total_limit=1, - metric_for_best_model="eval_macro_f1", # or "eval_loss" if macro F1 isn't computed - greater_is_better=True, # set to False if using loss - logging_dir=os.path.join(output_dir, "logs"), - report_to="none", - seed=42, - fp16=False, - dataloader_pin_memory=False, # no benefit if no CUDA pinning + + # 8) Build fresh model + config for this fold :contentReference[oaicite:8]{index=8} + config = DistilBertConfig.from_pretrained( + "distilbert-base-uncased", + num_labels=len(LABEL_LIST), + id2label=ID2LABEL, + label2id=LABEL2ID ) + model = DistilBertForTokenClassification.from_pretrained( + "distilbert-base-uncased", + config=config + ) + model.to(device) + + # 9) TrainingArguments (with early stopping) :contentReference[oaicite:9]{index=9} + if device.type == "cpu": + training_args = TrainingArguments( + output_dir=os.path.join(output_dir, f"fold_{fold}"), + eval_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=EPOCHS, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + logging_dir=os.path.join(output_dir, "logs", f"fold_{fold}"), + report_to="none", + seed=RAND_STATE + ) + else: + training_args = TrainingArguments( + output_dir=os.path.join(output_dir, f"fold_{fold}"), + eval_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=4, # smaller per-GPU batch size + per_device_eval_batch_size=4, + gradient_accumulation_steps=4, # to simulate batch size = 16 + num_train_epochs=EPOCHS, + weight_decay=0.01, + warmup_ratio=0.1, + lr_scheduler_type="cosine", + load_best_model_at_end=True, + metric_for_best_model="eval_macro_f1", + greater_is_better=True, + save_total_limit=1, + logging_dir=os.path.join(output_dir, "logs", f"fold_{fold}"), + report_to="none", + seed=RAND_STATE, + fp16=False, + dataloader_pin_memory=False + ) - # 9) Collate Data - data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) - - # 10) Macro‐F1 computation - def compute_metrics(eval_pred): - from sklearn.metrics import f1_score - logits, labels = eval_pred - preds = logits.argmax(axis=-1) - - true_preds = [] - true_labels = [] - for pred_row, label_row in zip(preds, labels): - for p, l in zip(pred_row, label_row): - if l != -100: - true_preds.append(p) - true_labels.append(l) - - macro_f1 = f1_score(true_labels, true_preds, average="macro") - return {"eval_macro_f1": macro_f1} - - # 11) Trainer - 
trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_train, - eval_dataset=tokenized_test, + # 10) Data collator (dynamic padding) :contentReference[oaicite:10]{index=10} + data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) + + # 11) compute_metrics function (macro-F1) :contentReference[oaicite:11]{index=11} + def compute_metrics(eval_pred): + logits, labels = eval_pred + preds = logits.argmax(axis=-1) + + true_preds = [] + true_labels = [] + for pred_row, label_row in zip(preds, labels): + for p, l in zip(pred_row, label_row): + if l != -100: + true_preds.append(p) + true_labels.append(l) + + macro_f1 = f1_score(true_labels, true_preds, average="macro") + return {"eval_macro_f1": macro_f1} + + # 12) Trainer for this fold (with EarlyStopping) :contentReference[oaicite:12]{index=12} + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_train, + eval_dataset=tokenized_test, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOP)], + compute_metrics=compute_metrics + ) + # Avoid deprecation warning (explicitly set tokenizer on trainer) + trainer.tokenizer = tokenizer + + # 13) Train this fold + trainer.train() + + # 14) Evaluate on this fold’s held-out split + preds_logits, labels, _ = trainer.predict(tokenized_test) + preds = np.argmax(preds_logits, axis=-1) + + # Convert to (flattened) label strings for F1 + true_labels_list = [ + ID2LABEL[l] + for sent_labels, sent_preds in zip(labels, preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + pred_labels_list = [ + ID2LABEL[p] + for sent_labels, sent_preds in zip(labels, preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + + fold_macro_f1 = f1_score(true_labels_list, pred_labels_list, average="macro") + print(f"Fold {fold} Macro F1: {fold_macro_f1:.4f}") + + # 15) If this fold’s model is the best so far, save it + if fold_macro_f1 > best_macro_f1: + best_macro_f1 = fold_macro_f1 + best_model_dir = os.path.join(output_dir, "best_model") + trainer.save_model(best_model_dir) + tokenizer.save_pretrained(best_model_dir) + + fold += 1 + + # 16) After all folds, report best fold‐score & load best model for final evaluation + print(f"\nBest fold model saved at: {best_model_dir}, Macro F1 = {best_macro_f1:.4f}") + + # 17) Final Evaluation on held-out val_df + best_model = DistilBertForTokenClassification.from_pretrained(best_model_dir) + best_model.to(device) + + # Build a fresh set of TrainingArguments that never runs evaluation epochs: + final_args = TrainingArguments( + output_dir=os.path.join(output_dir, "final_eval"), + per_device_eval_batch_size=16, + eval_strategy="no", + save_strategy="no", + logging_dir=os.path.join(output_dir, "logs", "final_eval"), + report_to="none", + seed=RAND_STATE + ) + val_trainer = Trainer( + model=best_model, + args=final_args, tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics + data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer) + # ← note: no eval_dataset here, because we’ll call .predict(...) 
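The macro-F1 computation above relies on dropping every position labelled -100 (padding, special tokens and feature tokens) before scoring; a small self-contained sketch with invented values:

```
import numpy as np
from sklearn.metrics import f1_score

labels = np.array([[-100, 3, 4, -100], [-100, 1, -100, -100]])   # gold ids, -100 = ignore
preds  = np.array([[   0, 3, 2,    0], [   0, 1,    0,    0]])   # argmax output

true_ids, pred_ids = [], []
for label_row, pred_row in zip(labels, preds):
    for l, p in zip(label_row, pred_row):
        if l != -100:
            true_ids.append(l)
            pred_ids.append(p)

print(f1_score(true_ids, pred_ids, average="macro"))
```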
manually ) - # 12) Train & save - trainer.train() - trainer.save_model(output_dir) - tokenizer.save_pretrained(output_dir) + start_time = time.perf_counter() + val_preds_logits, val_labels, _ = val_trainer.predict(tokenized_val) + end_time = time.perf_counter() + + val_preds = np.argmax(val_preds_logits, axis=-1) + + flat_true = [ + ID2LABEL[l] + for sent_labels, sent_preds in zip(val_labels, val_preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + flat_pred = [ + ID2LABEL[p] + for sent_labels, sent_preds in zip(val_labels, val_preds) + for (l, p) in zip(sent_labels, sent_preds) + if l != -100 + ] + + print("\nFinal Evaluation on Held-Out Set:") + print(classification_report(flat_true, flat_pred)) + + # Report inference speed + total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) + total_examples = len(val_dataset) + elapsed = end_time - start_time + print(f"\nInference Time: {elapsed:.2f}s for {total_examples} identifiers ({total_tokens} tokens)") + print(f"Tokens/sec: {total_tokens / elapsed:.2f}") + print(f"Identifiers/sec: {total_examples / elapsed:.2f}") + + final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") + print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}") diff --git a/src/tag_identifier.py b/src/tag_identifier.py index fdc42fc..0158bb1 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -131,7 +131,7 @@ def load(self): def find(self, item): return item in self.Words -def initialize_model(selected_model_type): +def initialize_model(temp_config = {}): """ Initialize and load word vectors for the application, and load a word count DataFrame. @@ -142,7 +142,7 @@ def initialize_model(selected_model_type): tuple: (ModelData, WORD_COUNT DataFrame) """ global model_type, lm_model - model_type = selected_model_type + model_type = temp_config.get("model_type", "tree_based") if model_type == "tree_based": print("Loading word vectors!!") modelTokens, modelMethods, modelGensimEnglish = createModel(rootDir=SCRIPT_DIR) @@ -158,7 +158,7 @@ def initialize_model(selected_model_type): app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) elif model_type == "lm_based": print("Loading DistilBERT tagger...") - lm_model = DistilBertTagger(SCRIPT_DIR) + lm_model = DistilBertTagger(temp_config['model']) print("DistilBERT tagger loaded!") def start_server(temp_config = {}): @@ -176,7 +176,7 @@ def start_server(temp_config = {}): """ print('initializing model...') selected_model = temp_config.get("model_type", "tree_based") - initialize_model(selected_model) + initialize_model(temp_config) print("loading cache...") if not os.path.isdir("cache"): os.mkdir("cache") @@ -239,132 +239,99 @@ def probe(cache_id: str): @app.route('//') @app.route('///') def listen(identifier_name: str, identifier_context: str, cache_id: str = None) -> list[dict]: - #check if identifier name has already been used + # --- Cache lookup (unchanged) --- cache = None - #find the existing cache in app.caches or create a new one if it doesn't exist - if cache_id != None: - if os.path.exists("cache/"+cache_id+".db3"): - #check if the identifier name is in this cache and return it if so - cache = AppCache("cache/"+cache_id+".db3") + if cache_id is not None: + if os.path.exists("cache/" + cache_id + ".db3"): + cache = AppCache("cache/" + cache_id + ".db3") data = cache.retrieve(identifier_name, identifier_context) - if data != False: + if data is not False: cache.encounter(identifier_name, identifier_context) return data else: - #create 
the cache - cache = AppCache("cache/"+cache_id+".db3") + cache = AppCache("cache/" + cache_id + ".db3") cache.load() - + + # Pull query‐string parameters system_name = request.args.get("system_name", default="") programming_language = request.args.get("language", default="") data_type = request.args.get("type", default="") - - #TODO: update this documentation - """ - Process a web request to analyze an identifier within a specific context. - - This route function takes two URL parameters (identifier_name, and identifier_context) from an - incoming HTTP request and performs data preprocessing and feature extraction on the identifier_name. - It then uses a trained classifier to annotate the identifier with part-of-speech tags and other linguistic features. - - Args: - identifier_name (str): The name of the identifier to be analyzed. - identifier_context (str): The context in which the identifier appears. - Returns: - List[dict]: A list of dictionaries containing words and their predicted POS tags. - """ print(f"INPUT: {identifier_name} {identifier_context}") - - # get the start time start_time = time.perf_counter() - + + # 1) Split the identifier into tokens for **both** modes + words = ronin.split(identifier_name) + + # 2) If we asked for the LM‐based (DistilBERT) tagger, use it if model_type == "lm_based": - result = { - "words": [] - } - tags = lm_model.predict(words, identifier_context, programming_language, data_type, system_name) + result = { "words": [] } + + tags = lm_model.tag_identifier( + tokens=words, + context=identifier_context, + type_str=data_type, + language=programming_language, + system_name=system_name + ) + for word, tag in zip(words, tags): dictionary = dictionary_lookup(word) - result["words"].append({word: {"tag": tag, "dictionary": dictionary}}) + result["words"].append({ + word: { "tag": tag, "dictionary": dictionary } + }) + tag_time = time.perf_counter() - start_time if cache_id: AppCache(f"cache/{cache_id}.db3").add(identifier_name, result, identifier_context, tag_time) return result - - # Split identifier_name into words - words = ronin.split(identifier_name) - - # # Create initial data frame + + # 3) Else: use the existing tree‐based tagger + # Create initial DataFrame data = pd.DataFrame({ 'WORD': words, 'SPLIT_IDENTIFIER': ' '.join(words), - 'CONTEXT_NUMBER': context_to_number(identifier_context), # Predefined context number + 'CONTEXT_NUMBER': context_to_number(identifier_context), }) - # create response JSON - # tags = list(annotate_identifier(app.model_data.ModelClassifier, data)) - result = { - "words" : [] - } - - # Add features to the data + # Build features data = createFeatures( data, mutable_feature_list, modelGensimEnglish=app.model_data.ModelGensimEnglish, ) - - categorical_features = ['NLTK_POS','PREV_POS', 'NEXT_POS'] - category_variables = [] + # Convert any categorical features to numeric + categorical_features = ['NLTK_POS', 'PREV_POS', 'NEXT_POS'] for category_column in categorical_features: if category_column in data.columns: - category_variables.append(category_column) - data.loc[:, category_column] = data[category_column].astype(str) - - for category_column in category_variables: - # Explicitly handle categorical conversion - unique_values = data[category_column].unique() - category_map = {} - for value in unique_values: - if value in universal_to_custom: - category_map[value] = custom_to_numeric[universal_to_custom[value]] - else: - category_map[value] = custom_to_numeric['NOUN'] # Assign 'NM' (8) for unknown categories - - data.loc[:, 
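A simplified sketch of the categorical mapping applied here; the two dictionaries are stand-ins (the real universal_to_custom and custom_to_numeric are defined elsewhere in the code base), so the exact values are assumptions:

```
# Unknown POS categories fall back to the noun bucket, matching the code above.
universal_to_custom = {"NOUN": "N", "VERB": "V", "ADJ": "NM"}   # stand-in values
custom_to_numeric   = {"N": 1, "V": 2, "NM": 8, "NOUN": 1}      # stand-in values

def map_pos(value: str) -> int:
    if value in universal_to_custom:
        return custom_to_numeric[universal_to_custom[value]]
    return custom_to_numeric["NOUN"]

print([map_pos(v) for v in ["NOUN", "VERB", "X"]])   # [1, 2, 1]
```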
category_column] = data[category_column].map(category_map) - - # Convert categorical variables to numeric - # Load and apply the classifier + data[category_column] = data[category_column].astype(str) + unique_vals = data[category_column].unique() + category_map = {} + for val in unique_vals: + if val in universal_to_custom: + category_map[val] = custom_to_numeric[universal_to_custom[val]] + else: + category_map[val] = custom_to_numeric['NOUN'] + data[category_column] = data[category_column].map(category_map) + + # Load classifier and annotate clf = joblib.load(os.path.join(SCRIPT_DIR, '..', 'models', 'model_GradientBoostingClassifier.pkl')) predicted_tags = annotate_identifier(clf, data) - # Combine words and their POS tags into a parseable format - #result = [{'word': word, 'pos_tag': tag} for word, tag in zip(words, predicted_tags)] - - for i in range(len(words)): - #check dictionary - dictionary = "UC" #uncategorized - word = words[i] + result = { "words": [] } + for i, word in enumerate(words): dictionary = dictionary_lookup(word) - result["words"].append( - { - words[i] : { - "tag" : predicted_tags[i], - "dictionary" : dictionary - } - } - ) + result["words"].append({ + word: { "tag": predicted_tags[i], "dictionary": dictionary } + }) - # get time it took to tag the identifier tag_time = time.perf_counter() - start_time - - # append result to cache - if cache_id != None: + if cache_id is not None: cache.add(identifier_name, result, identifier_context, tag_time) return result + def context_to_number(context): """ diff --git a/src/tree_based_tagger/create_models.py b/src/tree_based_tagger/create_models.py index 451d697..e147a7c 100644 --- a/src/tree_based_tagger/create_models.py +++ b/src/tree_based_tagger/create_models.py @@ -1,4 +1,3 @@ -import gensim.downloader as api import json, os import logging #'VERB_SCORE', 'DET_SCORE', 'ENGLISHV_SCORE', 'POSITION_RATIO','METHODV_SCORE', 'CONTAINSLISTVERB' @@ -39,6 +38,7 @@ def createModel(pklFile="", rootDir=""): (modelGensimTokens, modelGensimMethods, modelGensimEnglish). Models that fail to load are set to None. """ + import gensim.downloader as api # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') From 4dcd8d4e3f5a72962729a56e76fa30d9677828d7 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 01:52:33 -0400 Subject: [PATCH 34/51] A half vibe coded mess, but I think it works. Needs a ton of clean up. --- src/lm_based_tagger/distilbert_crf.py | 111 ++++++++++++ .../distilbert_preprocessing.py | 4 +- src/lm_based_tagger/distilbert_tagger.py | 74 +++++--- src/lm_based_tagger/train_model.py | 158 ++++++++++++++---- 4 files changed, 283 insertions(+), 64 deletions(-) create mode 100644 src/lm_based_tagger/distilbert_crf.py diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py new file mode 100644 index 0000000..4fc5486 --- /dev/null +++ b/src/lm_based_tagger/distilbert_crf.py @@ -0,0 +1,111 @@ +# distilbert_crf.py +import torch, os +import torch.nn as nn +from TorchCRF import CRF +from transformers import DistilBertModel, DistilBertConfig + +class DistilBertCRFForTokenClassification(nn.Module): + """ + DistilBERT ➜ dropout ➜ linear projection ➜ CRF. + The CRF layer models label‑to‑label transitions, so the model + is optimised at *sequence* level rather than *token* level. 
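A rough sketch of what the CRF head adds on top of the per-token emission scores, using the same pytorch-crf calls as this class; tensor shapes and values are invented:

```
import torch
from torchcrf import CRF

num_labels = 10
crf = CRF(num_labels, batch_first=True)

emissions = torch.randn(2, 5, num_labels)           # [batch, seq_len, num_labels]
tags      = torch.randint(0, num_labels, (2, 5))    # gold label ids
mask      = torch.ones(2, 5, dtype=torch.bool)

loss = -crf(emissions, tags, mask=mask, reduction="mean")   # sequence-level negative log-likelihood
best_paths = crf.decode(emissions, mask=mask)               # Viterbi paths, one list of ids per sequence
```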
+ """ + def __init__(self, + num_labels: int, + id2label: dict, + label2id: dict, + pretrained_name: str = "distilbert-base-uncased", + dropout_prob: float = 0.1): + super().__init__() + + self.config = DistilBertConfig.from_pretrained( + pretrained_name, + num_labels=num_labels, + id2label=id2label, + label2id=label2id, + ) + self.bert = DistilBertModel.from_pretrained(pretrained_name, config=self.config) + self.dropout = nn.Dropout(dropout_prob) + self.classifier = nn.Linear(self.config.hidden_size, num_labels) + self.crf = CRF(num_labels, batch_first=True) + + def forward(self, + input_ids=None, + attention_mask=None, + labels=None, + **kwargs): + + # Hugging Face occasionally injects helper fields (e.g. num_items_in_batch) + # Filter `kwargs` down to what DistilBertModel.forward actually accepts. + ALLOWED = { + "head_mask", "inputs_embeds", "position_ids", + "output_attentions", "output_hidden_states", "return_dict" + } + bert_kwargs = {k: v for k, v in kwargs.items() if k in ALLOWED} + + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + **bert_kwargs, + ) + # —— Build emissions once —————————————————————————————— + sequence_output = self.dropout(outputs[0]) # [B, T, H] + emission_scores = self.classifier(sequence_output) # [B, T, C] + + seq_len = emission_scores.size(1) # original token length + + # ============================== TRAINING ============================== + if labels is not None: + # 1. Drop [CLS] (idx 0) and [SEP] (idx –1) + emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] + tags = labels[:, 1:-1].clone() # [B, T‑2] + crf_mask = (tags != -100) # True = keep + + # 2. For any position that’s masked‑off ➜ set tag to a valid id (0) + tags[~crf_mask] = 0 + + # 3. Guarantee first timestep is ON for every sequence + first_off = (~crf_mask[:, 0]).nonzero(as_tuple=True)[0] + if len(first_off): + crf_mask[first_off, 0] = True # flip mask to ON + tags[first_off, 0] = 0 # give it tag 0 + + loss = -self.crf(emissions, tags, mask=crf_mask, reduction="mean") + return {"loss": loss, "logits": emission_scores} + + # ============================= INFERENCE ============================== + else: + crf_mask = attention_mask[:, 1:-1].bool() # [B, T‑2] + emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] + best_paths = self.crf.decode(emissions, mask=crf_mask) + return {"logits": emission_scores, + "predictions": best_paths} + + from transformers import DistilBertConfig + @classmethod + def from_pretrained(cls, ckpt_dir, **kw): + from safetensors import safe_open + cfg = DistilBertConfig.from_pretrained(ckpt_dir) + model = cls( + num_labels=cfg.num_labels, + id2label=cfg.id2label, + label2id=cfg.label2id, + pretrained_name=cfg._name_or_path or "distilbert-base-uncased", + **kw, + ) + + weight_path_pt = os.path.join(ckpt_dir, "pytorch_model.bin") + weight_path_safe = os.path.join(ckpt_dir, "model.safetensors") + + if os.path.exists(weight_path_pt): + state = torch.load(weight_path_pt, map_location="cpu") + elif os.path.exists(weight_path_safe): + state = {} + with safe_open(weight_path_safe, framework="pt", device="cpu") as f: + for k in f.keys(): + state[k] = f.get_tensor(k) + else: + raise FileNotFoundError("No weight file found in checkpoint directory.") + + model.load_state_dict(state) + return model \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 874a88a..1043463 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ 
b/src/lm_based_tagger/distilbert_preprocessing.py @@ -143,7 +143,7 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # - First 7 entries → -100 (because they are feature tokens) # - Then for each identifier token, [-100, label2id[tag]] ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [-100] * 7 + ner_tags_with_pos + full_labels = [0] * 7 + ner_tags_with_pos # ← use 0, not -100 rows.append({ "tokens": full_tokens, @@ -163,7 +163,7 @@ def tokenize_and_align_labels(example, tokenizer): - "ner_tags" : List[int] (same length as above) We run `tokenizer(example["tokens"], is_split_into_words=True, truncation=True)`, - then align `word_ids()` with `example["ner_tags"]` exactly as in test.py. + then align `word_ids()` with `example["ner_tags"]` """ tokenized = tokenizer( example["tokens"], diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 322d3e0..4d62bb9 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -4,6 +4,7 @@ import nltk from difflib import SequenceMatcher from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification +from .distilbert_crf import DistilBertCRFForTokenClassification # Make sure we have the same NLTK tagset nltk.download('averaged_perceptron_tagger_eng', quiet=True) @@ -82,16 +83,25 @@ def system_prefix_similarity(first_token, system_name): class DistilBertTagger: def __init__(self, model_path: str): """ - Expects `model_path` to be a folder where the fine-tuned DistilBertForTokenClassification - (and its tokenizer) have been saved via `trainer.save_model(...)` and `tokenizer.save_pretrained(...)`. + `model_path` must contain: + • config.json + • model.safetensors OR pytorch_model.bin + • tokenizer files (tokenizer.json, vocab.txt, …) """ self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path) - self.model = DistilBertForTokenClassification.from_pretrained(model_path) - self.model.eval() - # ── Extract id2label from the saved config.json ── - # model.config.id2label maps string keys ("0", "1", ...) to tag names (e.g. "N", "V", "PRE", ...) - self.id2label = { int(k): v for k, v in self.model.config.id2label.items() } + # Try CRF wrapper first (it can load .safetensors or .bin) + try: + self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path) + except Exception: + # Fallback: plain DistilBERT head (no CRF layer present) + from transformers import DistilBertForTokenClassification + self.model = DistilBertForTokenClassification.from_pretrained(model_path) + + self.model.eval() # inference mode + + # id2label keys can be strings → convert to int + self.id2label = {int(k): v for k, v in self.model.config.id2label.items()} def tag_identifier(self, tokens, context, type_str, language, system_name): """ @@ -138,9 +148,7 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): cvr_token, digit_token, sim_token, - type_token, - lang_token, - nltk_feature + nltk_feature, ] + tokens_with_pos # 2. Tokenize @@ -152,32 +160,44 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): padding=True ) - # 3. Inference + # ─── 3. Inference ─────────────────────────────────────────── with torch.no_grad(): - logits = self.model( + out = self.model( input_ids=encoded["input_ids"], - attention_mask=encoded["attention_mask"] - )[0] + attention_mask=encoded["attention_mask"], + ) - # 4. 
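The input sequence built just above (feature tokens followed by interleaved position and identifier tokens) roughly looks like the sketch below; apart from the @system_, @hung_, @sim_ and @pos_ prefixes taken from the patch, the placeholder spellings are assumptions:

```
words = ["get", "user", "name"]                     # hypothetical, already-split identifier
feature_tokens = ["@unknown", "@system_demo", "@hung_none", "@cvr_mid",
                  "@digit_none", "@sim_none", "@nltk_verb_noun_noun"]   # 7 feature tokens
tokens_with_pos = [tok for i, w in enumerate(words) for tok in (f"@pos_{i}", w)]
input_tokens = feature_tokens + tokens_with_pos
# -> [..., '@pos_0', 'get', '@pos_1', 'user', '@pos_2', 'name']
```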
Take argmax, then align via word_ids() - predictions = torch.argmax(logits, dim=-1).squeeze().tolist() - word_ids = encoded.word_ids() + # One label per *input* token + if isinstance(out, dict) and "predictions" in out: # CRF path + labels_per_token = out["predictions"][0] # list[int] + else: # logits + logits = out[0] if isinstance(out, (tuple, list)) else out + labels_per_token = torch.argmax(logits, dim=-1).squeeze().tolist() - pred_labels = [] - previous_word_idx = None + # ─── 4. Re‑align to identifier words ────────────────────── + pred_labels, previous_word_idx = [], None + word_ids = encoded.word_ids() # same length as labels_per_token for idx, word_idx in enumerate(word_ids): - # Skip if special token (None), or if it's part of the first 9 “feature tokens” - if word_idx is None or word_idx < 9: + # a) skip special tokens ([CLS]/[SEP]) + if word_idx is None: + continue + # b) skip the 7 leading feature tokens + if word_idx < 7: continue - # Skip if it’s the same word_idx as the previous (to avoid sub-token duplicates) + # c) skip every @pos_* placeholder (@pos tokens sit at even + # offsets after the 7 features: 7,9,11, … so (w‑7)%2 == 0) + if (word_idx - 7) % 2 == 0: + continue + # d) skip duplicate word‑pieces if word_idx == previous_word_idx: continue - pred_labels.append(predictions[idx]) + label_idx = idx - 1 # shift because [CLS] was removed + if label_idx < len(labels_per_token): + pred_labels.append(labels_per_token[label_idx]) previous_word_idx = word_idx - # 5. Map numeric IDs → string tags via self.id2label - pred_tag_strings = [ self.id2label[label_id] for label_id in pred_labels ] - - return pred_tag_strings + # Map numeric IDs → tag strings + pred_tag_strings = [self.id2label[i] for i in pred_labels] + return pred_tag_strings \ No newline at end of file diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 5f2db21..0139d25 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -1,5 +1,3 @@ -# train_model.py - import os import time import random @@ -7,6 +5,7 @@ import numpy as np import pandas as pd import torch +from .distilbert_crf import DistilBertCRFForTokenClassification from sklearn.model_selection import train_test_split, KFold from sklearn.metrics import f1_score, accuracy_score, classification_report @@ -29,7 +28,6 @@ print("Using device:", device) # === Random Seeds === -# Match test.py’s seed settings for reproducibility :contentReference[oaicite:0]{index=0} RAND_STATE = 209 random.seed(RAND_STATE) np.random.seed(RAND_STATE) @@ -39,13 +37,13 @@ torch.backends.cudnn.benchmark = False # === Hyperparameters / Config === -K = 2 # number of CV folds +K = 5 # number of CV folds HOLDOUT_RATIO = 0.15 # 15% held out for final evaluation -EPOCHS = 5 # number of epochs per fold +EPOCHS = 10 # number of epochs per fold EARLY_STOP = 2 # patience for early stopping LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} -# === Label List & Mappings (unchanged from your original) :contentReference[oaicite:1]{index=1} === +# === Label List & Mappings === LABEL_LIST = ["CJ", "D", "DT", "N", "NM", "NPL", "P", "PRE", "V", "VM"] LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} ID2LABEL = {i: label for label, i in LABEL2ID.items()} @@ -57,7 +55,7 @@ def train_lm(script_dir: str): output_dir = os.path.join(script_dir, "output") os.makedirs(output_dir, exist_ok=True) - # 2) Read the TSV & build “tokens” / “tags” columns :contentReference[oaicite:2]{index=2} + # 2) Read the TSV & build “tokens” / “tags” columns 
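The TSV columns are turned into parallel token/tag lists right below; a small sketch with an invented row (the column names match the patch):

```
import pandas as pd

df = pd.DataFrame({
    "SPLIT":           ["get user name"],
    "GRAMMAR_PATTERN": ["V NM N"],
    "CONTEXT":         ["FUNCTION"],
})
df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split())
df["tags"]   = df["GRAMMAR_PATTERN"].apply(lambda x: x.strip().split())
df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)]   # keep aligned rows only
print(df[["tokens", "tags"]].iloc[0].tolist())   # [['get', 'user', 'name'], ['V', 'NM', 'N']]
```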
df = pd.read_csv(input_path, sep="\t", dtype=str).dropna(subset=["SPLIT", "GRAMMAR_PATTERN"]) df = df[df["SPLIT"].str.strip().astype(bool)] df["tokens"] = df["SPLIT"].apply(lambda x: x.strip().split()) @@ -65,7 +63,7 @@ def train_lm(script_dir: str): # Keep only rows where len(tokens) == len(tags) df = df[df.apply(lambda r: len(r["tokens"]) == len(r["tags"]), axis=1)] - # 3) Initial Train/Val Split (15% hold-out) :contentReference[oaicite:3]{index=3} + # 3) Initial Train/Val Split (15% hold-out) train_df, val_df = train_test_split( df, test_size=HOLDOUT_RATIO, @@ -73,14 +71,14 @@ def train_lm(script_dir: str): stratify=df["CONTEXT"] ) - # 4) Upsample low-frequency tags **in the training set only** :contentReference[oaicite:4]{index=4} + # 4) Upsample low-frequency tags **in the training set only** low_freq_df = train_df[train_df["tags"].apply(lambda tags: any(t in LOW_FREQ_TAGS for t in tags))] train_df_upsampled = pd.concat([train_df] + [low_freq_df] * 2, ignore_index=True) - # 5) Tokenizer (uncased, matching test.py) + # 5) Tokenizer tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") - # 6) Prepare final hold-out “validation” Dataset :contentReference[oaicite:5]{index=5} + # 6) Prepare final hold-out “validation” Dataset val_dataset = prepare_dataset(val_df, LABEL2ID) tokenized_val = val_dataset.map( lambda ex: tokenize_and_align_labels(ex, tokenizer), @@ -100,11 +98,11 @@ def train_lm(script_dir: str): fold_train_df = train_df_upsampled.iloc[train_idx].reset_index(drop=True) fold_test_df = train_df_upsampled.iloc[test_idx].reset_index(drop=True) - # 7b) Build HuggingFace Datasets via prepare_dataset(...) :contentReference[oaicite:6]{index=6} + # 7b) Build HuggingFace Datasets via prepare_dataset(...) fold_train_dataset = prepare_dataset(fold_train_df, LABEL2ID) fold_test_dataset = prepare_dataset(fold_test_df, LABEL2ID) - # 7c) Tokenize + align labels (exactly as before) :contentReference[oaicite:7]{index=7} + # 7c) Tokenize + align labels (exactly as before) tokenized_train = fold_train_dataset.map( lambda ex: tokenize_and_align_labels(ex, tokenizer), batched=False @@ -114,20 +112,23 @@ def train_lm(script_dir: str): batched=False ) - # 8) Build fresh model + config for this fold :contentReference[oaicite:8]{index=8} + # 8) Build fresh model + config for this fold config = DistilBertConfig.from_pretrained( "distilbert-base-uncased", num_labels=len(LABEL_LIST), id2label=ID2LABEL, label2id=LABEL2ID ) - model = DistilBertForTokenClassification.from_pretrained( - "distilbert-base-uncased", - config=config - ) + model = DistilBertCRFForTokenClassification( + num_labels=len(LABEL_LIST), + id2label=ID2LABEL, + label2id=LABEL2ID, + pretrained_name="distilbert-base-uncased", + dropout_prob=0.1 + ).to(device) model.to(device) - # 9) TrainingArguments (with early stopping) :contentReference[oaicite:9]{index=9} + # 9) TrainingArguments (with early stopping) if device.type == "cpu": training_args = TrainingArguments( output_dir=os.path.join(output_dir, f"fold_{fold}"), @@ -172,26 +173,74 @@ def train_lm(script_dir: str): dataloader_pin_memory=False ) - # 10) Data collator (dynamic padding) :contentReference[oaicite:10]{index=10} + # 10) Data collator (dynamic padding) data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) - # 11) compute_metrics function (macro-F1) :contentReference[oaicite:11]{index=11} + # 11) compute_metrics function (macro-F1) + def compute_metrics(eval_pred): - logits, labels = eval_pred - preds = logits.argmax(axis=-1) + """ + 
Works for both: + • Plain classifier logits → argmax along last dim + • CRF Viterbi paths (list/2‑D ndarray) → use directly + Returns: + - eval_macro_f1 + - eval_token_accuracy + - eval_identifier_accuracy + """ + # ── 1. Unpack ──────────────────────────────────────────────────── + if isinstance(eval_pred, tuple): # older HF (<4.38) + preds, labels = eval_pred + else: # EvalPrediction obj + preds = eval_pred.predictions + labels = eval_pred.label_ids + + # ── 2. Convert logits → label IDs if needed ───────────────────── + # * 3‑D tensor : [B, T, C] → argmax(C) + # * 2‑D tensor : already IDs + # * list/obj‑nd : variable‑length decode paths + if isinstance(preds, np.ndarray) and preds.ndim == 3: + preds = np.argmax(preds, axis=-1) # [B, T] + elif isinstance(preds, list): + preds = np.array(preds, dtype=object) # each row is a list + + # ── 3. Accumulate token & identifier stats ────────────────────── + all_true, all_pred, id_correct_flags = [], [], [] - true_preds = [] - true_labels = [] for pred_row, label_row in zip(preds, labels): - for p, l in zip(pred_row, label_row): - if l != -100: - true_preds.append(p) - true_labels.append(l) - - macro_f1 = f1_score(true_labels, true_preds, average="macro") - return {"eval_macro_f1": macro_f1} - - # 12) Trainer for this fold (with EarlyStopping) :contentReference[oaicite:12]{index=12} + ptr = 0 + example_correct = True + + for lbl in label_row: # iterate gold labels + if lbl == -100: # skip padding / specials + continue + + # pick the corresponding prediction + if isinstance(pred_row, (list, np.ndarray)): + pred_lbl = pred_row[ptr] + else: # pred_row is scalar + pred_lbl = pred_row + ptr += 1 + + all_true.append(lbl) + all_pred.append(pred_lbl) + if pred_lbl != lbl: + example_correct = False + + id_correct_flags.append(example_correct) + + # ── 4. Metrics ────────────────────────────────────────────────── + macro_f1 = f1_score(all_true, all_pred, average="macro") + token_acc = accuracy_score(all_true, all_pred) + id_acc = float(sum(id_correct_flags)) / len(id_correct_flags) + + return { + "eval_macro_f1": macro_f1, + "eval_token_accuracy": token_acc, + "eval_identifier_accuracy": id_acc, + } + + # 12) Trainer for this fold (with EarlyStopping) trainer = Trainer( model=model, args=training_args, @@ -242,7 +291,7 @@ def compute_metrics(eval_pred): print(f"\nBest fold model saved at: {best_model_dir}, Macro F1 = {best_macro_f1:.4f}") # 17) Final Evaluation on held-out val_df - best_model = DistilBertForTokenClassification.from_pretrained(best_model_dir) + best_model = DistilBertCRFForTokenClassification.from_pretrained(best_model_dir) best_model.to(device) # Build a fresh set of TrainingArguments that never runs evaluation epochs: @@ -295,3 +344,42 @@ def compute_metrics(eval_pred): final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}") + final_accuracy = accuracy_score(flat_true, flat_pred) + print(f"Final Token-level Accuracy on Held-Out Set: {final_accuracy:.4f}") + + # 18) Write hold-out predictions to CSV so that each row contains + # (tokens, true_tags, pred_tags) for sanity checking. + from .distilbert_tagger import DistilBertTagger + + # Re-instantiate the exact same DistilBERT tagger we saved + tagger = DistilBertTagger(best_model_dir) + + rows = [] + for _, row in val_df.iterrows(): + tokens = row["tokens"] # e.g. ["my", "Identifier", "Name"] + true_tags = row["tags"] # e.g. ["NM", "DT", "DT"] + context = row.get("CONTEXT", "") # e.g. 
"FUNCTION" + type_str = row.get("TYPE", "") # if present; otherwise "" + language = row.get("LANGUAGE", "") # if present; otherwise "" + system_name= row.get("SYSTEM_NAME", "") # if present; otherwise "" + + # `tag_identifier` now returns a list of string labels, not IDs + pred_tags = tagger.tag_identifier(tokens, context, type_str, language, system_name) + + rows.append({ + "tokens": " ".join(tokens), + "true_tags": " ".join(true_tags), + "pred_tags": " ".join(pred_tags) + }) + + preds_df = pd.DataFrame(rows) + csv_path = os.path.join(output_dir, "holdout_predictions.csv") + preds_df.to_csv(csv_path, index=False) + print(f"\nWrote hold-out predictions to: {csv_path}") + + # Now also compute identifier-level accuracy from the “flat_true/flat_pred” folds: + # We need to compare per-example (not flattened) again, so re-run a grouping logic. + df = pd.read_csv(os.path.join(output_dir, "holdout_predictions.csv")) + df["row_correct"] = df["true_tags"] == df["pred_tags"] + id_level_acc = df["row_correct"].mean() + print(f"Final Identifier-level Accuracy on Held-Out Set: {id_level_acc:.4f}") \ No newline at end of file From b30e9286054289a6021e2c0d94953ebf8a76e03b Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 09:59:58 -0400 Subject: [PATCH 35/51] Fix bug with the masking --- src/lm_based_tagger/distilbert_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 1043463..5c905c6 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -143,7 +143,7 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # - First 7 entries → -100 (because they are feature tokens) # - Then for each identifier token, [-100, label2id[tag]] ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [0] * 7 + ner_tags_with_pos # ← use 0, not -100 + full_labels = [-100] * 7 + ner_tags_with_pos # ← use 0, not -100 rows.append({ "tokens": full_tokens, From a84f3adba6b7d3d46d9625e6432eb3481dd6c00e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 13:54:07 -0400 Subject: [PATCH 36/51] Remove system as a feature --- src/lm_based_tagger/distilbert_preprocessing.py | 10 +++++----- src/lm_based_tagger/distilbert_tagger.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 5c905c6..8ee9c4c 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -109,11 +109,11 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # 1. Build 7 feature tokens (context, system, hungarian, cvr, digit, sim, nltk) context_token = CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown") - system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" + # system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" cvr_token = consonant_vowel_ratio_bucket(tokens) digit_token = detect_digit_feature(tokens) - sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" + # sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" # 2. 
NLTK POS tags (universal tagset) nltk_tags = pos_tag(tokens, tagset="universal") @@ -131,11 +131,11 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # 4. Build the “full” token list (7 feature tokens + 2*len(tokens) position‐identifier tokens) full_tokens = [ context_token, - system_token, + # system_token, hungarian_token, cvr_token, digit_token, - sim_token, + # sim_token, nltk_feature, ] + tokens_with_pos @@ -143,7 +143,7 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): # - First 7 entries → -100 (because they are feature tokens) # - Then for each identifier token, [-100, label2id[tag]] ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [-100] * 7 + ner_tags_with_pos # ← use 0, not -100 + full_labels = [-100] * 5 + ner_tags_with_pos # ← use 0, not -100 rows.append({ "tokens": full_tokens, diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 4d62bb9..b3aa2fb 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -143,11 +143,11 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): # Build the full input token sequence (exactly what training saw): input_tokens = [ context_token, - system_token, + # system_token, hungarian_token, cvr_token, digit_token, - sim_token, + # sim_token, nltk_feature, ] + tokens_with_pos @@ -183,11 +183,11 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): if word_idx is None: continue # b) skip the 7 leading feature tokens - if word_idx < 7: + if word_idx < 5: continue # c) skip every @pos_* placeholder (@pos tokens sit at even # offsets after the 7 features: 7,9,11, … so (w‑7)%2 == 0) - if (word_idx - 7) % 2 == 0: + if (word_idx - 5) % 2 == 0: continue # d) skip duplicate word‑pieces if word_idx == previous_word_idx: From ca22c5954d60db1f848bcb486568e2cb463504f7 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 16:01:27 -0400 Subject: [PATCH 37/51] Update to pull from huggingface or local based on --local --- main | 19 +++++++---- src/lm_based_tagger/distilbert_crf.py | 43 ++++++++++++++---------- src/lm_based_tagger/distilbert_tagger.py | 21 +++--------- src/tag_identifier.py | 3 +- 4 files changed, 45 insertions(+), 41 deletions(-) diff --git a/main b/main index 07d795b..83cfb18 100755 --- a/main +++ b/main @@ -43,6 +43,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="SCALAR identifier tagger") parser.add_argument("-v", "--version", action="store_true", help="print tagger application version") + parser.add_argument("--local", action="store_true", help="Use local model/tokenizer instead of HuggingFace repo.") # Core run/train model arguments parser.add_argument("--mode", choices=["train", "run"], required=True, help="Choose to 'train' or 'run' the model") parser.add_argument("--model_type", choices=["tree_based", "lm_based"], required=True, help="Specify which model type to use") @@ -88,11 +89,17 @@ if __name__ == "__main__": start_server(temp_config=config) elif args.model_type == "lm_based": - start_server(temp_config={ - 'script_dir': SCRIPT_DIR, - 'model': os.path.join(SCRIPT_DIR, 'output', 'best_model'), - 'model_type':'lm_based' - }) - + if not args.local: + start_server(temp_config={ + 'script_dir': SCRIPT_DIR, + 'model': 'sourceslicer/scalar_lm_best', + 'model_type':'lm_based', + }) + else: + start_server(temp_config={ + 'script_dir': SCRIPT_DIR, + 'model': os.path.join(SCRIPT_DIR, 'output', 
'best_model'), + 'model_type':'lm_based', + }) else: parser.print_usage() \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py index 4fc5486..daa5fd0 100644 --- a/src/lm_based_tagger/distilbert_crf.py +++ b/src/lm_based_tagger/distilbert_crf.py @@ -80,12 +80,13 @@ def forward(self, best_paths = self.crf.decode(emissions, mask=crf_mask) return {"logits": emission_scores, "predictions": best_paths} - - from transformers import DistilBertConfig @classmethod - def from_pretrained(cls, ckpt_dir, **kw): - from safetensors import safe_open - cfg = DistilBertConfig.from_pretrained(ckpt_dir) + def from_pretrained(cls, ckpt_dir, local=False, **kw): + from safetensors.torch import load_file as load_safe_file + from huggingface_hub import hf_hub_download + import os + cfg = DistilBertConfig.from_pretrained(ckpt_dir, local_files_only=local) + model = cls( num_labels=cfg.num_labels, id2label=cfg.id2label, @@ -94,18 +95,24 @@ def from_pretrained(cls, ckpt_dir, **kw): **kw, ) - weight_path_pt = os.path.join(ckpt_dir, "pytorch_model.bin") - weight_path_safe = os.path.join(ckpt_dir, "model.safetensors") + # Attempt to load model.safetensors only + try: + if os.path.isdir(ckpt_dir): + # Load from local directory + weight_path = os.path.join(ckpt_dir, "model.safetensors") + if not os.path.exists(weight_path): + raise FileNotFoundError(f"No model.safetensors found in local path: {weight_path}") + else: + # Load from Hugging Face Hub + weight_path = hf_hub_download( + repo_id=ckpt_dir, + filename="model.safetensors", + local_files_only=local + ) - if os.path.exists(weight_path_pt): - state = torch.load(weight_path_pt, map_location="cpu") - elif os.path.exists(weight_path_safe): - state = {} - with safe_open(weight_path_safe, framework="pt", device="cpu") as f: - for k in f.keys(): - state[k] = f.get_tensor(k) - else: - raise FileNotFoundError("No weight file found in checkpoint directory.") + state_dict = load_safe_file(weight_path, device="cpu") + model.load_state_dict(state_dict) + return model - model.load_state_dict(state) - return model \ No newline at end of file + except Exception as e: + raise RuntimeError(f"Failed to load model.safetensors from {ckpt_dir}: {e}") \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index b3aa2fb..8847487 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -81,26 +81,15 @@ def system_prefix_similarity(first_token, system_name): class DistilBertTagger: - def __init__(self, model_path: str): - """ - `model_path` must contain: - • config.json - • model.safetensors OR pytorch_model.bin - • tokenizer files (tokenizer.json, vocab.txt, …) - """ - self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path) + def __init__(self, model_path: str, local: bool = False): + self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, local_files_only=local) - # Try CRF wrapper first (it can load .safetensors or .bin) try: - self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path) + self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path, local=local) except Exception: - # Fallback: plain DistilBERT head (no CRF layer present) - from transformers import DistilBertForTokenClassification - self.model = DistilBertForTokenClassification.from_pretrained(model_path) - - self.model.eval() # inference mode + self.model = 
DistilBertForTokenClassification.from_pretrained(model_path, local_files_only=local) - # id2label keys can be strings → convert to int + self.model.eval() self.id2label = {int(k): v for k, v in self.model.config.id2label.items()} def tag_identifier(self, tokens, context, type_str, language, system_name): diff --git a/src/tag_identifier.py b/src/tag_identifier.py index 0158bb1..c847aa9 100644 --- a/src/tag_identifier.py +++ b/src/tag_identifier.py @@ -158,7 +158,8 @@ def initialize_model(temp_config = {}): app.model_data = ModelData(modelTokens, modelMethods, modelGensimEnglish, word_count_df) elif model_type == "lm_based": print("Loading DistilBERT tagger...") - lm_model = DistilBertTagger(temp_config['model']) + is_local = temp_config.get("local", False) + lm_model = DistilBertTagger(temp_config['model'], local=is_local) print("DistilBERT tagger loaded!") def start_server(temp_config = {}): From e083b390c18db9026b8f48d3cd653ef2eb404ae0 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 16:38:03 -0400 Subject: [PATCH 38/51] Fix requirements and I dunno how the crf imports are working --- requirements.txt | 134 +++++++++----------------- requirements_gpu.txt | 12 --- src/lm_based_tagger/distilbert_crf.py | 4 +- src/lm_based_tagger/train_model.py | 7 -- 4 files changed, 49 insertions(+), 108 deletions(-) delete mode 100644 requirements_gpu.txt diff --git a/requirements.txt b/requirements.txt index 51e31b1..74fc9d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,104 +1,64 @@ -accelerate==1.3.0 -attrs==25.1.0 -beautifulsoup4==4.12.3 -bioc==2.1 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.9 +aiosignal==1.3.2 +attrs==25.3.0 blinker==1.9.0 -boto3==1.36.6 -botocore==1.36.6 -certifi==2024.12.14 -charset-normalizer==3.4.1 -click==8.1.8 -conllu==4.5.3 -contourpy==1.3.1 -cycler==0.12.1 -Deprecated==1.2.17 -docopt==0.6.2 -filelock==3.17.0 -flair==0.15.0 -Flask==3.1.0 -fonttools==4.55.6 -fsspec==2024.12.0 -ftfy==6.3.1 -gdown==5.2.0 -gensim==4.3.3 -huggingface-hub==0.27.1 -humanize==4.11.0 +certifi==2025.4.26 +charset-normalizer==3.4.2 +click==8.2.1 +datasets==3.6.0 +dill==0.3.8 +filelock==3.18.0 +Flask==3.1.1 +frozenlist==1.6.2 +fsspec==2025.3.0 +hf-xet==1.1.3 +huggingface-hub==0.32.4 +humanize==4.12.3 idna==3.10 -iniconfig==2.0.0 -intervaltree==3.1.0 +iniconfig==2.1.0 itsdangerous==2.2.0 -Jinja2==3.1.5 -jmespath==1.0.1 -joblib==1.4.2 -jsonlines==4.0.0 -kiwisolver==1.4.8 -langdetect==1.0.9 -lxml==5.3.0 +Jinja2==3.1.6 +joblib==1.5.1 MarkupSafe==3.0.2 -matplotlib==3.10.0 -more-itertools==10.6.0 -mpld3==0.5.10 mpmath==1.3.0 -networkx==3.4.2 +multidict==6.4.4 +multiprocess==0.70.16 +networkx==3.5 nltk==3.9.1 -numpy==1.26.4 -nvidia-cublas-cu12==12.4.5.8 -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.0.70 -nvidia-cufft-cu12==11.2.1.3 -nvidia-curand-cu12==10.3.5.147 -nvidia-cusolver-cu12==11.6.1.9 -nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.21.5 -nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 -packaging==24.2 +numpy==2.2.6 +packaging==25.0 pandas==2.2.3 -pillow==11.1.0 -plac==1.4.3 -pluggy==1.5.0 -pptree==3.1 -protobuf==5.29.3 -psutil==6.1.1 -pyparsing==3.2.1 -PySocks==1.7.1 -pytest==8.3.4 +plac==1.4.5 +pluggy==1.6.0 +propcache==0.3.1 +pyarrow==20.0.0 +Pygments==2.19.1 +pytest==8.4.0 python-dateutil==2.9.0.post0 -pytorch_revgrad==0.2.0 -pytz==2024.2 +pytorch-crf==0.7.2 +pytz==2025.2 PyYAML==6.0.2 regex==2024.11.6 requests==2.32.3 -s3transfer==0.11.2 -safetensors==0.5.2 
+safetensors==0.5.3 scikit-learn==1.6.1 -scipy==1.13.1 -segtok==1.5.11 -sentencepiece==0.2.0 -setuptools==75.8.0 +scipy==1.15.3 +setuptools==80.9.0 six==1.17.0 -smart-open==7.1.0 -sortedcontainers==2.4.0 -soupsieve==2.6 spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee -sqlitedict==2.1.0 -sympy==1.13.1 -tabulate==0.9.0 -termcolor==2.5.0 -threadpoolctl==3.5.0 -tokenizers==0.21.0 -torch==2.5.1 +sympy==1.14.0 +termcolor==3.1.0 +threadpoolctl==3.6.0 +tokenizers==0.21.1 +torch==2.7.1 tqdm==4.67.1 -transformer-smaller-training-vocab==0.4.0 -transformers==4.48.1 -triton==3.1.0 -typing_extensions==4.12.2 -tzdata==2025.1 -urllib3==2.3.0 +transformers==4.52.4 +triton==3.3.1 +typing_extensions==4.14.0 +tzdata==2025.2 +urllib3==2.4.0 waitress==3.0.2 -wcwidth==0.2.13 Werkzeug==3.1.3 -Wikipedia-API==0.8.1 -wrapt==1.17.2 \ No newline at end of file +xxhash==3.5.0 +yarl==1.20.0 diff --git a/requirements_gpu.txt b/requirements_gpu.txt deleted file mode 100644 index c9a1ba1..0000000 --- a/requirements_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -nvidia-cublas-cu12==12.4.5.8 -nvidia-cuda-cupti-cu12==12.4.127 -nvidia-cuda-nvrtc-cu12==12.4.127 -nvidia-cuda-runtime-cu12==12.4.127 -nvidia-cudnn-cu12==9.1.1.17 -nvidia-cufft-cu12==11.2.1.3 -nvidia-curand-cu12==10.3.5.147 -nvidia-cusolver-cu12==11.6.1.9 -nvidia-cusparse-cu12==12.3.1.170 -nvidia-nccl-cu12==2.23.4 -nvidia-nvjitlink-cu12==12.4.127 -nvidia-nvtx-cu12==12.4.127 \ No newline at end of file diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py index daa5fd0..d359aa0 100644 --- a/src/lm_based_tagger/distilbert_crf.py +++ b/src/lm_based_tagger/distilbert_crf.py @@ -1,7 +1,7 @@ # distilbert_crf.py -import torch, os +import torch +from torchcrf import CRF import torch.nn as nn -from TorchCRF import CRF from transformers import DistilBertModel, DistilBertConfig class DistilBertCRFForTokenClassification(nn.Module): diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 0139d25..ea9c0f8 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -113,12 +113,6 @@ def train_lm(script_dir: str): ) # 8) Build fresh model + config for this fold - config = DistilBertConfig.from_pretrained( - "distilbert-base-uncased", - num_labels=len(LABEL_LIST), - id2label=ID2LABEL, - label2id=LABEL2ID - ) model = DistilBertCRFForTokenClassification( num_labels=len(LABEL_LIST), id2label=ID2LABEL, @@ -126,7 +120,6 @@ def train_lm(script_dir: str): pretrained_name="distilbert-base-uncased", dropout_prob=0.1 ).to(device) - model.to(device) # 9) TrainingArguments (with early stopping) if device.type == "cpu": From 27035665bdd14379a349dbee5ad620066617a592 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 16:57:22 -0400 Subject: [PATCH 39/51] Remove req that won't work on windows --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 74fc9d6..8a4e554 100644 --- a/requirements.txt +++ b/requirements.txt @@ -54,7 +54,7 @@ tokenizers==0.21.1 torch==2.7.1 tqdm==4.67.1 transformers==4.52.4 -triton==3.3.1 +# triton==3.3.1 - doesn't work on windows typing_extensions==4.14.0 tzdata==2025.2 urllib3==2.4.0 From e135cd6c8deeec976cb9039e3e53d74cddae1ded Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 17:06:56 -0400 Subject: [PATCH 40/51] Greatly reduce the requirements.txt to just the top level reqs --- requirements.txt | 56 
+----------------------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/requirements.txt b/requirements.txt index 74fc9d6..233014e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,64 +1,10 @@ -aiohappyeyeballs==2.6.1 aiohttp==3.12.9 -aiosignal==1.3.2 -attrs==25.3.0 -blinker==1.9.0 -certifi==2025.4.26 -charset-normalizer==3.4.2 -click==8.2.1 datasets==3.6.0 -dill==0.3.8 -filelock==3.18.0 Flask==3.1.1 -frozenlist==1.6.2 -fsspec==2025.3.0 -hf-xet==1.1.3 -huggingface-hub==0.32.4 -humanize==4.12.3 -idna==3.10 -iniconfig==2.1.0 -itsdangerous==2.2.0 -Jinja2==3.1.6 -joblib==1.5.1 -MarkupSafe==3.0.2 -mpmath==1.3.0 -multidict==6.4.4 -multiprocess==0.70.16 -networkx==3.5 -nltk==3.9.1 -numpy==2.2.6 -packaging==25.0 -pandas==2.2.3 -plac==1.4.5 -pluggy==1.6.0 -propcache==0.3.1 -pyarrow==20.0.0 -Pygments==2.19.1 -pytest==8.4.0 -python-dateutil==2.9.0.post0 +pipdeptree==2.26.1 pytorch-crf==0.7.2 -pytz==2025.2 -PyYAML==6.0.2 -regex==2024.11.6 -requests==2.32.3 -safetensors==0.5.3 scikit-learn==1.6.1 -scipy==1.15.3 -setuptools==80.9.0 -six==1.17.0 spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee -sympy==1.14.0 -termcolor==3.1.0 -threadpoolctl==3.6.0 -tokenizers==0.21.1 torch==2.7.1 -tqdm==4.67.1 transformers==4.52.4 -triton==3.3.1 -typing_extensions==4.14.0 -tzdata==2025.2 -urllib3==2.4.0 waitress==3.0.2 -Werkzeug==3.1.3 -xxhash==3.5.0 -yarl==1.20.0 From 059eeb01e936f45c0e85f0540a0dbc7667a83541 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 19:23:37 -0400 Subject: [PATCH 41/51] Make it so that classification report gets printed to a file --- src/lm_based_tagger/train_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index ea9c0f8..92f5c5e 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -326,6 +326,8 @@ def compute_metrics(eval_pred): print("\nFinal Evaluation on Held-Out Set:") print(classification_report(flat_true, flat_pred)) + with open('holdout_report.txt', 'w') as f: + print(classification_report(flat_true, flat_pred), file=f) # Report inference speed total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) From ecc88550f536ae1e390b73f7426aa17cf68fec07 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 19:30:54 -0400 Subject: [PATCH 42/51] Update readme --- README.md | 161 ++---------------------------------------------------- 1 file changed, 5 insertions(+), 156 deletions(-) diff --git a/README.md b/README.md index fdd99e9..3d9ac8d 100644 --- a/README.md +++ b/README.md @@ -1,162 +1,11 @@ # SCALAR Part-of-speech tagger -This the official release of the SCALAR Part-of-speech tagger -There are two ways to run the tagger. This document describes both ways. +THIS IS AN EXPERIMENTAL VERSION OF SCALAR -1. Using Docker compose (which runs the tagger's built-in server for you) -2. Running the tagger's built-in server without Docker +Install requirements via `pip install -r requirements.txt` -## Current Metrics (this will be updated every time we update/change the model!) 
-| | Accuracy | Balanced Accuracy | Weighted Recall | Weighted Precision | Weighted F1 | Performance (seconds) | -|------------|:--------:|:------------------:|:---------------:|:------------------:|:-----------:|:---------------------:| -| **SCALAR** | **0.8216** | **0.9160** | **0.8216** | **0.8245** | **0.8220** | **249.05** | -| Ensemble | 0.7124 | 0.8311 | 0.7124 | 0.7597 | 0.7235 | 1149.44 | -| Flair | 0.6087 | 0.7844 | 0.6087 | 0.7755 | 0.6497 | 807.03 | +Run via `python3 main --mode run --model_type lm_based` -## Getting Started with Docker +You can attempt to traint it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage -To run SCALAR in a Docker container you can clone the repository and pull the latest docker impage from `sourceslicer/scalar_tagger:latest` - -Make sure you have Docker and Docker Compose installed: - -https://docs.docker.com/engine/install/ - -https://docs.docker.com/compose/install/ - -``` -git clone git@github.com:SCANL/scanl_tagger.git -cd scanl_tagger -docker compose pull -docker compose up -``` - -## Getting Started without Docker -You will need `python3.12` installed. - -You'll need to install `pip` -- https://pip.pypa.io/en/stable/installation/ - -Set up a virtual environtment: `python -m venv /tmp/tagger` -- feel free to put it somewhere else (change /tmp/tagger) if you prefer - -Activate the virtual environment: `source /tmp/tagger/bin/activate` (you can find how to activate it here if `source` does not work for you -- https://docs.python.org/3/library/venv.html#how-venvs-work) - -After it's installed and your virtual environment is activated, in the root of the repo, run `pip install -r requirements.txt` - -Finally, we require the `token` and `target` vectors from [code2vec](https://github.com/tech-srl/code2vec). The tagger will attempt to automatically download them if it doesn't find them, but you could download them yourself if you like. It will place them in your local directory under `./code2vec/*` - -## Usage - -``` -usage: main [-h] [-v] [-r] [-t] [-a ADDRESS] [--port PORT] [--protocol PROTOCOL] - [--words WORDS] - -options: - -h, --help show this help message and exit - -v, --version print tagger application version - -r, --run run server for part of speech tagging requests - -t, --train run training set to retrain the model - -a ADDRESS, --address ADDRESS - configure server address - --port PORT configure server port - --protocol PROTOCOL configure whether the server uses http or https - --words WORDS provide path to a list of acceptable abbreviations -``` - -`./main -r` will start the server, which will listen for identifier names sent via HTTP over the route: - -http://127.0.0.1:8080/{identifier_name}/{code_context}/{database_name (optional)} - -"database name" specifies an sqlite database to be used for result caching and data collection. If the database specified does not exist, one will be created. 
- -You can check wehther or not a database exists by using the `/probe` route by sending an HTTP request like this: - -http://127.0.0.1:5000/probe/{database_name} - -"code context" is one of: -- FUNCTION -- ATTRIBUTE -- CLASS -- DECLARATION -- PARAMETER - -For example: - -Tag a declaration: ``http://127.0.0.1:8000/numberArray/DECLARATION/database`` - -Tag a function: ``http://127.0.0.1:8000/GetNumberArray/FUNCTION/database`` - -Tag an class: ``http://127.0.0.1:8000/PersonRecord/CLASS/database`` - -#### Note -Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun. - -You will need to have a way to parse code and filter out identifier names if you want to do some on-the-fly analysis of source code. We recommend [srcML](https://www.srcml.org/). Since the actual tagger is a web server, you don't have to use srcML. You could always use other AST-based code representations, or any other method of obtaining identifier information. - - -## Tagset - -**Supported Tagset** -| Abbreviation | Expanded Form | Examples | -|:------------:|:--------------------------------------------:|:--------------------------------------------:| -| N | noun | Disneyland, shoe, faucet, mother | -| DT | determiner | the, this, that, these, those, which | -| CJ | conjunction | and, for, nor, but, or, yet, so | -| P | preposition | behind, in front of, at, under, above | -| NPL | noun plural | Streets, cities, cars, people, lists | -| NM | noun modifier (**noun-adjunct**, adjective) | red, cold, hot, **bit**Set, **employee**Name | -| V | verb | Run, jump, spin, | -| VM | verb modifier (adverb) | Very, loudly, seriously, impatiently | -| D | digit | 1, 2, 10, 4.12, 0xAF | -| PRE | preamble | Gimp, GLEW, GL, G, p, m, b | - -**Penn Treebank to SCALAR tagset** - -| Penn Treebank Annotation | SCALAR Tagset | -|:---------------------------:|:------------------------:| -| Conjunction (CC) | Conjunction (CJ) | -| Digit (CD) | Digit (D) | -| Determiner (DT) | Determiner (DT) | -| Foreign Word (FW) | Noun (N) | -| Preposition (IN) | Preposition (P) | -| Adjective (JJ) | Noun Modifier (NM) | -| Comparative Adjective (JJR) | Noun Modifier (NM) | -| Superlative Adjective (JJS) | Noun Modifier (NM) | -| List Item (LS) | Noun (N) | -| Modal (MD) | Verb (V) | -| Noun Singular (NN) | Noun (N) | -| Proper Noun (NNP) | Noun (N) | -| Proper Noun Plural (NNPS) | Noun Plural (NPL) | -| Noun Plural (NNS) | Noun Plural (NPL) | -| Adverb (RB) | Verb Modifier (VM) | -| Comparative Adverb (RBR) | Verb Modifier (VM) | -| Particle (RP) | Verb Modifier (VM) | -| Symbol (SYM) | Noun (N) | -| To Preposition (TO) | Preposition (P) | -| Verb (VB) | Verb (V) | -| Verb (VBD) | Verb (V) | -| Verb (VBG) | Verb (V) | -| Verb (VBN) | Verb (V) | -| Verb (VBP) | Verb (V) | -| Verb (VBZ) | Verb (V) | - -## Training the tagger -You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This will potentially change in the future. - -## Errors? -Please make an issue if you run into errors - -# Please Cite the Paper(s)! 
- -Newman, Christian, Scholten , Brandon, Testa, Sophia, Behler, Joshua, Banabilah, Syreen, Collard, Michael L., Decker, Michael, Mkaouer, Mohamed Wiem, Zampieri, Marcos, Alomar, Eman Abdullah, Alsuhaibani, Reem, Peruma, Anthony, Maletic, Jonathan I., (2025), “SCALAR: A Part-of-speech Tagger for Identifiers”, in the Proceedings of the 33rd IEEE/ACM International Conference on Program Comprehension - Tool Demonstrations Track (ICPC), Ottawa, ON, Canada, April 27 -28, 5 pages TO APPEAR. - -Christian D. Newman, Michael J. Decker, Reem S. AlSuhaibani, Anthony Peruma, Satyajit Mohapatra, Tejal Vishnoi, Marcos Zampieri, Mohamed W. Mkaouer, Timothy J. Sheldon, and Emily Hill, "An Ensemble Approach for Annotating Source Code Identifiers with Part-of-speech Tags," in IEEE Transactions on Software Engineering, doi: 10.1109/TSE.2021.3098242. - -# Training set -The data used to train this tagger can be found in the most recent database update in the repo -- https://github.com/SCANL/scanl_tagger/blob/master/input/scanl_tagger_training_db_11_29_2024.db - -# Interested in our other work? -Find our other research [at our webpage](https://www.scanl.org/) and check out the [Identifier Name Structure Catalogue](https://github.com/SCANL/identifier_name_structure_catalogue) - -# WordNet -This project uses WordNet to perform a dictionary lookup on the individual words in each identifier: - -Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010 +It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. \ No newline at end of file From 6ea557f9216313ca0505877496507b434ac8b916 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 4 Jun 2025 22:36:47 -0400 Subject: [PATCH 43/51] DRY --- src/lm_based_tagger/distilbert_crf.py | 2 - .../distilbert_preprocessing.py | 110 ++++--------- src/lm_based_tagger/distilbert_tagger.py | 147 +++--------------- 3 files changed, 53 insertions(+), 206 deletions(-) diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py index d359aa0..729c74b 100644 --- a/src/lm_based_tagger/distilbert_crf.py +++ b/src/lm_based_tagger/distilbert_crf.py @@ -52,8 +52,6 @@ def forward(self, sequence_output = self.dropout(outputs[0]) # [B, T, H] emission_scores = self.classifier(sequence_output) # [B, T, C] - seq_len = emission_scores.size(1) # original token length - # ============================== TRAINING ============================== if labels is not None: # 1. 
Drop [CLS] (idx 0) and [SEP] (idx –1) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index 8ee9c4c..fee656d 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -1,5 +1,3 @@ -# distilbert_preprocessing.py - import re from nltk import pos_tag import nltk @@ -24,33 +22,40 @@ "CLASS": "@class" } +FEATURES = [ + "context", + "hungarian", + "cvr", + "digit", + "nltk" +] + +FEATURE_FUNCTIONS = { + "context": lambda row, tokens: CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown"), + "hungarian": lambda row, tokens: detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none", + "cvr": lambda row, tokens: consonant_vowel_ratio_bucket(tokens), + "digit": lambda row, tokens: detect_digit_feature(tokens), + "nltk": lambda row, tokens: "@nltk_" + '-'.join(tag.lower() for _, tag in pos_tag(tokens, tagset="universal")) +} + +def get_feature_tokens(row, tokens): + return [FEATURE_FUNCTIONS[feat](row, tokens) for feat in FEATURES] + +NUMBER_OF_FEATURES = len(FEATURES) def detect_hungarian_prefix(first_token): - """ - If the first token starts with 1–3 letters followed by an uppercase or underscore, - return "@hung_". Otherwise "@hung_none". - """ m = re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) if m: return f"@hung_{m.group(1).lower()}" return "@hung_none" - def detect_digit_feature(tokens): - """ - If any token has a digit, return "@has_digit", else "@no_digit". - """ for token in tokens: if any(char.isdigit() for char in token): return "@has_digit" return "@no_digit" - def consonant_vowel_ratio_bucket(tokens): - """ - Compute the average consonant/vowel ratio across all alphabetic tokens, - then bucket into low/mid/high. - """ def ratio(tok): tok_lower = tok.lower() num_vowels = sum(1 for c in tok_lower if c in VOWELS) @@ -69,12 +74,7 @@ def ratio(tok): else: return "@cvr_high" - def system_prefix_similarity(first_token, system_name): - """ - Compute a SequenceMatcher ratio against the system name, then bucket: - >0.9 ➔ "@sim_high", >0.6 ➔ "@sim_mid", >0.3 ➔ "@sim_low", else "@sim_none". - """ if not first_token or not system_name: return "@sim_none" sys_lower = system_name.strip().lower() @@ -89,82 +89,41 @@ def system_prefix_similarity(first_token, system_name): else: return "@sim_none" +def normalize_type(type_str): + ts = type_str.strip().lower() + ts = ts.replace("*", "_ptr") + ts = ts.replace(" ", "_") + return f"@{ts}" + +def normalize_language(lang_str): + return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") def prepare_dataset(df: pd.DataFrame, label2id: dict): - """ - Takes a DataFrame with columns: - - "tokens" : List[str] (split identifier) - - "tags" : List[str] (gold PoS tags, same length as tokens) - - "CONTEXT" : e.g. "FUNCTION", "PARAMETER", etc. - - "SYSTEM_NAME" : string - - Returns a HuggingFace `datasets.Dataset` with two fields: - - "tokens" : List[List[str]] (the FULL token sequence, including exactly 7 feature tokens + position tokens + identifier tokens) - - "ner_tags" : List[List[int]] (the aligned label IDs, with -100 in front for each feature token) - """ rows = [] for _, row in df.iterrows(): tokens = row["tokens"] tags = row["tags"] + feature_tokens = get_feature_tokens(row, tokens) - # 1. 
Build 7 feature tokens (context, system, hungarian, cvr, digit, sim, nltk) - context_token = CONTEXT_MAP.get(row["CONTEXT"].strip().upper(), "@unknown") - # system_token = f"@system_{row['SYSTEM_NAME'].strip().lower().replace(' ', '_')}" - hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" - cvr_token = consonant_vowel_ratio_bucket(tokens) - digit_token = detect_digit_feature(tokens) - # sim_token = system_prefix_similarity(tokens[0], row["SYSTEM_NAME"]) if tokens else "@sim_none" - - # 2. NLTK POS tags (universal tagset) - nltk_tags = pos_tag(tokens, tagset="universal") - universal_tags = [tag.lower() for _, tag in nltk_tags] - nltk_feature = f"@nltk_{'-'.join(universal_tags)}" - - # 3. Position tags: interleave with identifier tokens length = len(tokens) - if length == 1: - pos_tokens = ["@pos_2"] - else: - pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] + pos_tokens = ["@pos_2"] if length == 1 else ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] - # 4. Build the “full” token list (7 feature tokens + 2*len(tokens) position‐identifier tokens) - full_tokens = [ - context_token, - # system_token, - hungarian_token, - cvr_token, - digit_token, - # sim_token, - nltk_feature, - ] + tokens_with_pos - - # 5. Build the aligned labels array: - # - First 7 entries → -100 (because they are feature tokens) - # - Then for each identifier token, [-100, label2id[tag]] + full_tokens = feature_tokens + tokens_with_pos ner_tags_with_pos = [val for tag in tags for val in (-100, label2id[tag])] - full_labels = [-100] * 5 + ner_tags_with_pos # ← use 0, not -100 + full_labels = [-100] * NUMBER_OF_FEATURES + ner_tags_with_pos rows.append({ - "tokens": full_tokens, + "tokens": full_tokens, "ner_tags": full_labels }) return Dataset.from_dict({ - "tokens": [r["tokens"] for r in rows], + "tokens": [r["tokens"] for r in rows], "ner_tags": [r["ner_tags"] for r in rows] }) - def tokenize_and_align_labels(example, tokenizer): - """ - example: a dict with - - "tokens" : List[str] (the full token sequence, including exactly 7 feature tokens) - - "ner_tags" : List[int] (same length as above) - - We run `tokenizer(example["tokens"], is_split_into_words=True, truncation=True)`, - then align `word_ids()` with `example["ner_tags"]` - """ tokenized = tokenizer( example["tokens"], truncation=True, @@ -180,7 +139,6 @@ def tokenize_and_align_labels(example, tokenizer): elif word_id < len(example["ner_tags"]): labels.append(example["ner_tags"][word_id]) else: - # Just in case of truncation labels.append(-100) tokenized["labels"] = labels diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 8847487..394c340 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -1,84 +1,8 @@ -import re import torch from nltk import pos_tag -import nltk -from difflib import SequenceMatcher from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification from .distilbert_crf import DistilBertCRFForTokenClassification - -# Make sure we have the same NLTK tagset -nltk.download('averaged_perceptron_tagger_eng', quiet=True) -nltk.download('universal_tagset', quiet=True) - -VOWELS = set("aeiou") -CONTEXT_MAP = { - "FUNCTION": "@func", - "PARAMETER": "@param", - "ATTRIBUTE": "@attr", - "DECLARATION": "@decl", - "CLASS": "@class" -} - - -def detect_hungarian_prefix(first_token): - m = 
re.match(r'^([a-zA-Z]{1,3})[A-Z_]', first_token) - if m: - return f"@hung_{m.group(1).lower()}" - return "@hung_none" - - -def detect_digit_feature(tokens): - for token in tokens: - if any(char.isdigit() for char in token): - return "@has_digit" - return "@no_digit" - - -def consonant_vowel_ratio_bucket(tokens): - def ratio(tok): - tok_lower = tok.lower() - num_vowels = sum(1 for c in tok_lower if c in VOWELS) - num_consonants = sum(1 for c in tok_lower if c.isalpha() and c not in VOWELS) - return num_consonants / (num_vowels + 1e-5) - - ratios = [ratio(tok) for tok in tokens if tok.isalpha()] - if not ratios: - return "@cvr_none" - avg_ratio = sum(ratios) / len(ratios) - if avg_ratio < 1.5: - return "@cvr_low" - elif avg_ratio < 3.0: - return "@cvr_mid" - else: - return "@cvr_high" - - -def normalize_type(type_str): - ts = type_str.strip().lower() - ts = ts.replace("*", "_ptr") - ts = ts.replace(" ", "_") - return f"@{ts}" - - -def normalize_language(lang_str): - return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") - - -def system_prefix_similarity(first_token, system_name): - if not first_token or not system_name: - return "@sim_none" - sys_lower = system_name.strip().lower() - tok_lower = first_token.strip().lower() - r = SequenceMatcher(None, tok_lower, sys_lower).ratio() - if r > 0.9: - return "@sim_high" - elif r > 0.6: - return "@sim_mid" - elif r > 0.3: - return "@sim_low" - else: - return "@sim_none" - +from .distilbert_preprocessing import * class DistilBertTagger: def __init__(self, model_path: str, local: bool = False): @@ -99,48 +23,24 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): 3) Run the model, take argmax over token logits 4) Align via `word_ids()`, skipping: - Any word_id = None - - Any word_id < 9 (because first 9 tokens were “feature tokens” => labels = -100) + - Any word_id < N (number of feature tokens) => labels = -100 - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier-word)” pair) 5) Return a list of string labels by mapping numeric IDs through `self.id2label`. """ + row = { + "CONTEXT": context, + "SYSTEM_NAME": system_name, + "TYPE": type_str, + "LANGUAGE": language + } + feature_tokens = get_feature_tokens(row, tokens) - # 1. 
Re–compute exactly the same feature tokens as in training: - context_token = CONTEXT_MAP.get(context.strip().upper(), "@unknown") - system_token = f"@system_{system_name.strip().lower().replace(' ', '_')}" - hungarian_token = detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none" - cvr_token = consonant_vowel_ratio_bucket(tokens) - digit_token = detect_digit_feature(tokens) - sim_token = system_prefix_similarity(tokens[0], system_name) if tokens else "@sim_none" - type_token = normalize_type(type_str) - lang_token = normalize_language(language) - - # Position tags for each identifier token length = len(tokens) - if length == 1: - pos_tokens = ["@pos_2"] - else: - pos_tokens = ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] - - # NLTK POS feature - nltk_tags = pos_tag(tokens, tagset="universal") - universal_tags = [tag.lower() for _, tag in nltk_tags] - nltk_feature = f"@nltk_{'-'.join(universal_tags)}" - - # Interleave pos_tokens + identifier tokens + pos_tokens = ["@pos_2"] if length == 1 else ["@pos_0"] + ["@pos_1"] * (length - 2) + ["@pos_2"] tokens_with_pos = [val for pair in zip(pos_tokens, tokens) for val in pair] - # Build the full input token sequence (exactly what training saw): - input_tokens = [ - context_token, - # system_token, - hungarian_token, - cvr_token, - digit_token, - # sim_token, - nltk_feature, - ] + tokens_with_pos + input_tokens = feature_tokens + tokens_with_pos - # 2. Tokenize encoded = self.tokenizer( input_tokens, is_split_into_words=True, @@ -149,44 +49,35 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): padding=True ) - # ─── 3. Inference ─────────────────────────────────────────── with torch.no_grad(): out = self.model( input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"], ) - # One label per *input* token - if isinstance(out, dict) and "predictions" in out: # CRF path - labels_per_token = out["predictions"][0] # list[int] - else: # logits + if isinstance(out, dict) and "predictions" in out: + labels_per_token = out["predictions"][0] + else: logits = out[0] if isinstance(out, (tuple, list)) else out labels_per_token = torch.argmax(logits, dim=-1).squeeze().tolist() - # ─── 4. Re‑align to identifier words ────────────────────── pred_labels, previous_word_idx = [], None - word_ids = encoded.word_ids() # same length as labels_per_token + word_ids = encoded.word_ids() for idx, word_idx in enumerate(word_ids): - # a) skip special tokens ([CLS]/[SEP]) if word_idx is None: continue - # b) skip the 7 leading feature tokens - if word_idx < 5: + if word_idx < NUMBER_OF_FEATURES: continue - # c) skip every @pos_* placeholder (@pos tokens sit at even - # offsets after the 7 features: 7,9,11, … so (w‑7)%2 == 0) - if (word_idx - 5) % 2 == 0: + if (word_idx - NUMBER_OF_FEATURES) % 2 == 0: continue - # d) skip duplicate word‑pieces if word_idx == previous_word_idx: continue - label_idx = idx - 1 # shift because [CLS] was removed + label_idx = idx - 1 if label_idx < len(labels_per_token): pred_labels.append(labels_per_token[label_idx]) previous_word_idx = word_idx - # Map numeric IDs → tag strings pred_tag_strings = [self.id2label[i] for i in pred_labels] - return pred_tag_strings \ No newline at end of file + return pred_tag_strings From dc5c8a4dfece3a2b769971cff5ea445239e2ae21 Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 8 Jun 2025 02:08:07 -0400 Subject: [PATCH 44/51] Remove reliance on NLTK. 
Does not reduce effectiveness of the model, and actually makes it faster --- .../distilbert_preprocessing.py | 2 +- src/lm_based_tagger/train_model.py | 40 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index fee656d..bd10c2b 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -27,7 +27,7 @@ "hungarian", "cvr", "digit", - "nltk" + #"nltk" ] FEATURE_FUNCTIONS = { diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 92f5c5e..2e11a7a 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -48,6 +48,9 @@ LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)} ID2LABEL = {i: label for label, i in LABEL2ID.items()} +def dual_print(*args, file, **kwargs): + print(*args, **kwargs) # stdout + print(*args, file=file, **kwargs) # file def train_lm(script_dir: str): # 1) Paths @@ -276,6 +279,7 @@ def compute_metrics(eval_pred): best_macro_f1 = fold_macro_f1 best_model_dir = os.path.join(output_dir, "best_model") trainer.save_model(best_model_dir) + model.config.save_pretrained(best_model_dir) tokenizer.save_pretrained(best_model_dir) fold += 1 @@ -324,24 +328,6 @@ def compute_metrics(eval_pred): if l != -100 ] - print("\nFinal Evaluation on Held-Out Set:") - print(classification_report(flat_true, flat_pred)) - with open('holdout_report.txt', 'w') as f: - print(classification_report(flat_true, flat_pred), file=f) - - # Report inference speed - total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) - total_examples = len(val_dataset) - elapsed = end_time - start_time - print(f"\nInference Time: {elapsed:.2f}s for {total_examples} identifiers ({total_tokens} tokens)") - print(f"Tokens/sec: {total_tokens / elapsed:.2f}") - print(f"Identifiers/sec: {total_examples / elapsed:.2f}") - - final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") - print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}") - final_accuracy = accuracy_score(flat_true, flat_pred) - print(f"Final Token-level Accuracy on Held-Out Set: {final_accuracy:.4f}") - # 18) Write hold-out predictions to CSV so that each row contains # (tokens, true_tags, pred_tags) for sanity checking. 
from .distilbert_tagger import DistilBertTagger @@ -377,4 +363,20 @@ def compute_metrics(eval_pred): df = pd.read_csv(os.path.join(output_dir, "holdout_predictions.csv")) df["row_correct"] = df["true_tags"] == df["pred_tags"] id_level_acc = df["row_correct"].mean() - print(f"Final Identifier-level Accuracy on Held-Out Set: {id_level_acc:.4f}") \ No newline at end of file + + # Report inference speed + total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) + total_examples = len(val_dataset) + elapsed = end_time - start_time + final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") + final_accuracy = accuracy_score(flat_true, flat_pred) + print("\nFinal Evaluation on Held-Out Set:") + with open('holdout_report.txt', 'w') as f: + report = classification_report(flat_true, flat_pred) + dual_print(report, file=f) + dual_print(f"\nInference Time: {elapsed:.2f}s for {total_examples} identifiers ({total_tokens} tokens)", file=f) + dual_print(f"Tokens/sec: {total_tokens / elapsed:.2f}", file=f) + dual_print(f"Identifiers/sec: {total_examples / elapsed:.2f}", file=f) + dual_print(f"\nFinal Macro F1 on Held-Out Set: {final_macro_f1:.4f}", file=f) + dual_print(f"Final Token-level Accuracy on Held-Out Set: {final_accuracy:.4f}", file=f) + dual_print(f"Final Identifier-level Accuracy on Held-Out Set: {id_level_acc:.4f}", file=f) \ No newline at end of file From f0671866408a62d2626803232b93063363cdcf0e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Sun, 8 Jun 2025 02:19:12 -0400 Subject: [PATCH 45/51] Add current metrics --- README.md | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3d9ac8d..2bf4b7f 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,44 @@ Install requirements via `pip install -r requirements.txt` Run via `python3 main --mode run --model_type lm_based` -You can attempt to traint it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage +You can attempt to train it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage -It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. \ No newline at end of file +It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. 
+
+## Evaluation Results (Held-Out Set)
+
+### Per-Class Metrics
+
+| Label | Precision | Recall | F1-Score | Support |
+|-------|-----------|--------|----------|---------|
+| CJ    | 0.88      | 0.88   | 0.88     | 8       |
+| D     | 0.98      | 0.96   | 0.97     | 52      |
+| DT    | 0.95      | 0.93   | 0.94     | 45      |
+| N     | 0.94      | 0.94   | 0.94     | 418     |
+| NM    | 0.91      | 0.93   | 0.92     | 440     |
+| NPL   | 0.97      | 0.97   | 0.97     | 79      |
+| P     | 0.94      | 0.92   | 0.93     | 79      |
+| PRE   | 0.79      | 0.79   | 0.79     | 68      |
+| V     | 0.89      | 0.84   | 0.86     | 110     |
+| VM    | 0.79      | 0.85   | 0.81     | 13      |
+
+### Aggregate Metrics
+
+| Metric           | Score |
+|------------------|-------|
+| Accuracy         | 0.92  |
+| Macro Avg F1     | 0.90  |
+| Weighted Avg F1  | 0.92  |
+| Total Examples   | 1312  |
+
+### Inference Statistics
+
+- **Inference Time:** 1.74s for 392 identifiers (3746 tokens)
+- **Tokens/sec:** 2157.78
+- **Identifiers/sec:** 225.80
+
+### Final Scores
+
+- **Final Macro F1 on Held-Out Set:** 0.9032
+- **Final Token-level Accuracy:** 0.9223
+- **Final Identifier-level Accuracy:** 0.8291

From 26857a12c4336bb590d9f33ebbcf6082dc55c908 Mon Sep 17 00:00:00 2001
From: Christian Newman
Date: Mon, 9 Jun 2025 22:43:27 -0400
Subject: [PATCH 46/51] Tested tree and lm based run and train. Did some thorough documenting on how the nn code works.

---
 main                                      |   3 +-
 requirements.txt                          |   3 +-
 src/lm_based_tagger/distilbert_crf.py     | 121 ++++++---
 .../distilbert_preprocessing.py           |  73 +++++-
 src/lm_based_tagger/distilbert_tagger.py  |  60 +++--
 src/lm_based_tagger/train_model.py        | 230 ++++++++++++------
 6 files changed, 349 insertions(+), 141 deletions(-)

diff --git a/main b/main
index 83cfb18..536cb71 100755
--- a/main
+++ b/main
@@ -67,12 +67,11 @@ if __name__ == "__main__":
             download_files()
             train_tree(config)
         elif args.model_type == "lm_based":
-            download_files()
             train_lm(SCRIPT_DIR)
 
     elif args.mode == "run":
         if args.model_type == "tree_based":
-            config = load_config_tree()
+            config = load_config_tree(SCRIPT_DIR)
             # Inject overrides
             download_files()
             config["model_type"] = args.model_type
diff --git a/requirements.txt b/requirements.txt
index 233014e..7ed1a05 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,5 +6,6 @@ pytorch-crf==0.7.2
 scikit-learn==1.6.1
 spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036df2d966eddee
 torch==2.7.1
-transformers==4.52.4
 waitress==3.0.2
+gensim==4.3.3
+transformers[torch]
diff --git a/src/lm_based_tagger/distilbert_crf.py b/src/lm_based_tagger/distilbert_crf.py
index 729c74b..e1007c5 100644
--- a/src/lm_based_tagger/distilbert_crf.py
+++ b/src/lm_based_tagger/distilbert_crf.py
@@ -6,16 +6,39 @@ class DistilBertCRFForTokenClassification(nn.Module):
     """
-    DistilBERT ➜ dropout ➜ linear projection ➜ CRF.
-    The CRF layer models label‑to‑label transitions, so the model
-    is optimised at *sequence* level rather than *token* level.
+    Token-level classifier that combines DistilBERT with a CRF layer for structured prediction.
+
+    Architecture:
+        input_ids, attention_mask
+            ↓
+        DistilBERT (pretrained encoder)
+            ↓
+        Dropout
+            ↓
+        Linear layer (projects hidden size → num_labels)
+            ↓
+        CRF layer (models sequence-level transitions)
+
+    Training:
+        - Uses negative log-likelihood from CRF as loss.
+        - Learns both emission scores (token-level confidence) and
+          transition scores (label-to-label sequence consistency).
+
+    Inference:
+        - Uses Viterbi decoding to predict the most likely sequence of labels.
+ + Output: + During training: + {"loss": ..., "logits": ...} + During inference: + {"logits": ..., "predictions": List[List[int]]} + + Example input shape: + input_ids: [B, T] — e.g. [16, 128] + attention_mask: [B, T] — 1 for real tokens, 0 for padding + logits: [B, T, C] — C = number of label classes """ - def __init__(self, - num_labels: int, - id2label: dict, - label2id: dict, - pretrained_name: str = "distilbert-base-uncased", - dropout_prob: float = 0.1): + def __init__(self, num_labels: int, id2label: dict, label2id: dict, pretrained_name: str = "distilbert-base-uncased", dropout_prob: float = 0.1): super().__init__() self.config = DistilBertConfig.from_pretrained( @@ -29,11 +52,34 @@ def __init__(self, self.classifier = nn.Linear(self.config.hidden_size, num_labels) self.crf = CRF(num_labels, batch_first=True) - def forward(self, - input_ids=None, - attention_mask=None, - labels=None, - **kwargs): + def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs): + """ + Forward pass for training or inference. + + Args: + input_ids (Tensor): Token IDs of shape [B, T] + attention_mask (Tensor): Attention mask of shape [B, T] + labels (Tensor, optional): Ground-truth labels of shape [B, T]. Required during training. + kwargs: Any additional DistilBERT-compatible inputs (e.g., head_mask, position_ids, etc.) + + Returns: + If labels are provided (training mode): + dict with: + - loss (Tensor): scalar negative log-likelihood from CRF + - logits (Tensor): emission scores of shape [B, T, C] + + If labels are not provided (inference mode): + dict with: + - logits (Tensor): emission scores of shape [B, T, C] + - predictions (List[List[int]]): decoded label IDs from CRF, + one list per sequence, + each of length T-2 (excluding [CLS] and [SEP]) + + Notes: + - logits: [B, T, C], where B = batch size, T = sequence length, C = number of label classes + - predictions: List[List[int]], where each inner list has length T-2 + (i.e., excludes [CLS] and [SEP]) and contains Viterbi-decoded label IDs + """ # Hugging Face occasionally injects helper fields (e.g. num_items_in_batch) # Filter `kwargs` down to what DistilBertModel.forward actually accepts. @@ -48,36 +94,49 @@ def forward(self, attention_mask=attention_mask, **bert_kwargs, ) - # —— Build emissions once —————————————————————————————— - sequence_output = self.dropout(outputs[0]) # [B, T, H] - emission_scores = self.classifier(sequence_output) # [B, T, C] + # 1) Compute per-token emission scores + # Applies dropout to the BERT hidden states, then projects them to label logits. + # Shape: [B, T, C], where B=batch size, T=sequence length, C=number of classes + sequence_output = self.dropout(outputs[0]) + emission_scores = self.classifier(sequence_output) - # ============================== TRAINING ============================== if labels is not None: - # 1. Drop [CLS] (idx 0) and [SEP] (idx –1) - emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] - tags = labels[:, 1:-1].clone() # [B, T‑2] - crf_mask = (tags != -100) # True = keep + # 2) Remove [CLS] and [SEP] special tokens from emissions and labels + # These tokens were added by the tokenizer but are not part of the identifier + emissions = emission_scores[:, 1:-1, :] # [B, T-2, C] + tags = labels[:, 1:-1].clone() # [B, T-2] - # 2. 
For any position that’s masked‑off ➜ set tag to a valid id (0) + # 3) Create a mask: True where label is valid, False where label == -100 + # The CRF will use this to ignore special/padded tokens + crf_mask = (tags != -100) + + # 4) Replace invalid label positions (-100) with a dummy label (e.g., 0) + # This is required because CRF expects a label at every position, even if masked tags[~crf_mask] = 0 - # 3. Guarantee first timestep is ON for every sequence + # 5) Ensure the first token of every sequence is active in the CRF mask + # This avoids CRF errors when the first token is masked out (which breaks decoding) first_off = (~crf_mask[:, 0]).nonzero(as_tuple=True)[0] if len(first_off): - crf_mask[first_off, 0] = True # flip mask to ON - tags[first_off, 0] = 0 # give it tag 0 + crf_mask[first_off, 0] = True + tags[first_off, 0] = 0 # assign a dummy label + # 6) Compute CRF negative log-likelihood loss loss = -self.crf(emissions, tags, mask=crf_mask, reduction="mean") return {"loss": loss, "logits": emission_scores} - # ============================= INFERENCE ============================== else: - crf_mask = attention_mask[:, 1:-1].bool() # [B, T‑2] - emissions = emission_scores[:, 1:-1, :] # [B, T‑2, C] + # INFERENCE MODE + + # 2) Remove [CLS] and [SEP] from emissions and build CRF mask from attention + # Only use the inner content of the input sequence + crf_mask = attention_mask[:, 1:-1].bool() # [B, T-2] + emissions = emission_scores[:, 1:-1, :] # [B, T-2, C] + + # 3) Run Viterbi decoding to get best label sequence for each input best_paths = self.crf.decode(emissions, mask=crf_mask) - return {"logits": emission_scores, - "predictions": best_paths} + return {"logits": emission_scores, "predictions": best_paths} + @classmethod def from_pretrained(cls, ckpt_dir, local=False, **kw): from safetensors.torch import load_file as load_safe_file diff --git a/src/lm_based_tagger/distilbert_preprocessing.py b/src/lm_based_tagger/distilbert_preprocessing.py index bd10c2b..f386c28 100644 --- a/src/lm_based_tagger/distilbert_preprocessing.py +++ b/src/lm_based_tagger/distilbert_preprocessing.py @@ -1,14 +1,8 @@ import re -from nltk import pos_tag -import nltk from difflib import SequenceMatcher import pandas as pd from datasets import Dataset -# Download once (we’ll just do it quietly here) -nltk.download('averaged_perceptron_tagger_eng', quiet=True) -nltk.download('universal_tagset', quiet=True) - # === Constants === VOWELS = set("aeiou") LOW_FREQ_TAGS = {"CJ", "VM", "PRE", "V"} @@ -27,7 +21,6 @@ "hungarian", "cvr", "digit", - #"nltk" ] FEATURE_FUNCTIONS = { @@ -35,7 +28,6 @@ "hungarian": lambda row, tokens: detect_hungarian_prefix(tokens[0]) if tokens else "@hung_none", "cvr": lambda row, tokens: consonant_vowel_ratio_bucket(tokens), "digit": lambda row, tokens: detect_digit_feature(tokens), - "nltk": lambda row, tokens: "@nltk_" + '-'.join(tag.lower() for _, tag in pos_tag(tokens, tagset="universal")) } def get_feature_tokens(row, tokens): @@ -99,6 +91,38 @@ def normalize_language(lang_str): return "@lang_" + lang_str.strip().lower().replace("++", "pp").replace("#", "sharp") def prepare_dataset(df: pd.DataFrame, label2id: dict): + """ + Converts a DataFrame of identifier tokens and grammar tags into a HuggingFace Dataset + formatted for NER training with feature and position tokens. 
+ + Each row in the input DataFrame should contain: + - tokens: List[str] (e.g., ['get', 'Employee', 'Name']) + - tags: List[str] (e.g., ['V', 'NM', 'N']) + - CONTEXT: str (e.g., 'function') + + The function adds: + - Feature tokens: ['@hung_get', '@no_digit', '@cvr_mid', '@func'] + - Interleaved position and real tokens: + ['@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name'] + + The NER tags are aligned so that: + - Feature tokens and position markers get label -100 (ignored in loss) + - Real tokens are converted from grammar tags using `label2id` + + Example Input: + df = pd.DataFrame([{ + "tokens": ["get", "Employee", "Name"], + "tags": ["V", "NM", "N"], + "CONTEXT": "function" + }]) + + Example Output: + Dataset with: + tokens: ['@hung_get', '@no_digit', '@cvr_mid', '@func', + '@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name'] + ner_tags: [-100, -100, -100, -100, + -100, 1, -100, 2, -100, 3] # assuming label2id = {"V": 1, "NM": 2, "N": 3} + """ rows = [] for _, row in df.iterrows(): tokens = row["tokens"] @@ -123,9 +147,34 @@ def prepare_dataset(df: pd.DataFrame, label2id: dict): "ner_tags": [r["ner_tags"] for r in rows] }) -def tokenize_and_align_labels(example, tokenizer): +def tokenize_and_align_labels(sample, tokenizer): + """ + Tokenizes an example and aligns NER labels with subword tokens. + + The input `example` comes from `prepare_dataset()` and contains: + - tokens: List[str], including feature and position tokens + - ner_tags: List[int], aligned with `tokens`, with -100 for ignored tokens + + This function: + - Uses `is_split_into_words=True` to tokenize each item in `tokens` + - Uses `tokenizer.word_ids()` to map each subword back to its original token index + - Assigns the corresponding label (or -100) for each subword token + + Example Input: + example = { + "tokens": ['@hung_get', '@no_digit', '@cvr_mid', '@func', + '@pos_0', 'get', '@pos_1', 'Employee', '@pos_2', 'Name'], + "ner_tags": [-100, -100, -100, -100, + -100, 1, -100, 2, -100, 3] + } + + Assuming 'Employee' is tokenized to ['Em', '##ployee'], + Example Output: + tokenized["labels"] = [-100, -100, -100, -100, + -100, 1, -100, 2, 2, -100, 3] + """ tokenized = tokenizer( - example["tokens"], + sample["tokens"], truncation=True, is_split_into_words=True ) @@ -136,8 +185,8 @@ def tokenize_and_align_labels(example, tokenizer): for word_id in word_ids: if word_id is None: labels.append(-100) - elif word_id < len(example["ner_tags"]): - labels.append(example["ner_tags"][word_id]) + elif word_id < len(sample["ner_tags"]): + labels.append(sample["ner_tags"][word_id]) else: labels.append(-100) diff --git a/src/lm_based_tagger/distilbert_tagger.py b/src/lm_based_tagger/distilbert_tagger.py index 394c340..9f89d00 100644 --- a/src/lm_based_tagger/distilbert_tagger.py +++ b/src/lm_based_tagger/distilbert_tagger.py @@ -1,31 +1,53 @@ import torch -from nltk import pos_tag from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification from .distilbert_crf import DistilBertCRFForTokenClassification from .distilbert_preprocessing import * class DistilBertTagger: + """ + A lightweight wrapper around a DistilBERT+CRF or DistilBERT-only model for tagging identifier tokens + with part-of-speech-like grammar labels (e.g., V, NM, N, etc.). 
+ + Automatically handles: + - Tokenization (with custom feature and position tokens) + - Running the model + - Post-processing the raw logits or CRF predictions + - Aligning subword tokens back to word-level predictions + """ def __init__(self, model_path: str, local: bool = False): + # Load tokenizer from local directory or remote HuggingFace path self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, local_files_only=local) + # Try loading CRF-enhanced model; fallback to plain classifier if not available try: self.model = DistilBertCRFForTokenClassification.from_pretrained(model_path, local=local) except Exception: self.model = DistilBertForTokenClassification.from_pretrained(model_path, local_files_only=local) + # disable dropout, etc. for inference self.model.eval() + + # map label IDs to strings self.id2label = {int(k): v for k, v in self.model.config.id2label.items()} def tag_identifier(self, tokens, context, type_str, language, system_name): """ - 1) Build the “feature tokens + position tokens + identifier tokens” sequence - 2) Tokenize with `is_split_into_words=True` - 3) Run the model, take argmax over token logits - 4) Align via `word_ids()`, skipping: - - Any word_id = None - - Any word_id < N (number of feature tokens) => labels = -100 - - Repeated word_ids (so we pick only the first sub-token of each “(pos, identifier-word)” pair) - 5) Return a list of string labels by mapping numeric IDs through `self.id2label`. + Tag a split identifier using the model, returning a sequence of grammar pattern labels (e.g., ["V", "NM", "N"]). + + Steps: + 1) Build full input token list: + [feature tokens] + [@pos_0, w1, @pos_1, w2, ..., @pos_2, wn] + 2) Tokenize using HuggingFace tokenizer with is_split_into_words=True + 3) Run the model forward pass (handles CRF or logits automatically) + 4) Use word_ids() to align predictions back to full words + - Skip special tokens (None) + - Skip feature tokens (index < NUMBER_OF_FEATURES) + - Use only the *second* token in each [@pos_X, word] pair (the word) + - Skip repeated subword tokens (only use the first subtoken per word) + 5) Return a list of string labels corresponding to the original identifier tokens. 
+ + Returns: + List[str]: a list of grammar tags (e.g., ['V', 'NM', 'N']) aligned to `tokens` """ row = { "CONTEXT": context, @@ -33,6 +55,8 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): "TYPE": type_str, "LANGUAGE": language } + + # Step 1: Feature tokens + alternating position/word tokens feature_tokens = get_feature_tokens(row, tokens) length = len(tokens) @@ -41,6 +65,7 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): input_tokens = feature_tokens + tokens_with_pos + # Step 2: Tokenize using word-alignment aware tokenizer encoded = self.tokenizer( input_tokens, is_split_into_words=True, @@ -49,35 +74,40 @@ def tag_identifier(self, tokens, context, type_str, language, system_name): padding=True ) + # Step 3: Forward pass with torch.no_grad(): out = self.model( input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"], ) + # Step 4: Get predictions depending on model type (CRF vs logits) if isinstance(out, dict) and "predictions" in out: labels_per_token = out["predictions"][0] else: logits = out[0] if isinstance(out, (tuple, list)) else out labels_per_token = torch.argmax(logits, dim=-1).squeeze().tolist() + # Step 5: Convert subtoken-level predictions to word-level predictions pred_labels, previous_word_idx = [], None word_ids = encoded.word_ids() for idx, word_idx in enumerate(word_ids): if word_idx is None: - continue + continue # special token (CLS, SEP, PAD, etc.) if word_idx < NUMBER_OF_FEATURES: - continue + continue # feature tokens (shouldn't be labeled) if (word_idx - NUMBER_OF_FEATURES) % 2 == 0: - continue + continue # position tokens (e.g., @pos_0) if word_idx == previous_word_idx: - continue - + continue # skip repeated subword tokens + + # Heuristic: labels lag by 1 position relative to input_ids label_idx = idx - 1 if label_idx < len(labels_per_token): pred_labels.append(labels_per_token[label_idx]) previous_word_idx = word_idx - + + # Step 6: Map label IDs back to string labels pred_tag_strings = [self.id2label[i] for i in pred_labels] return pred_tag_strings diff --git a/src/lm_based_tagger/train_model.py b/src/lm_based_tagger/train_model.py index 2e11a7a..49cd92d 100644 --- a/src/lm_based_tagger/train_model.py +++ b/src/lm_based_tagger/train_model.py @@ -14,8 +14,6 @@ Trainer, TrainingArguments, DistilBertTokenizerFast, - DistilBertConfig, - DistilBertForTokenClassification, DataCollatorForTokenClassification, EarlyStoppingCallback ) @@ -52,7 +50,120 @@ def dual_print(*args, file, **kwargs): print(*args, **kwargs) # stdout print(*args, file=file, **kwargs) # file + +# 11) compute_metrics function (macro-F1) +def compute_metrics(eval_pred): + """ + Computes macro-F1, token-level accuracy, and identifier-level accuracy. + + Supports both: + - Raw logits from the model (shape [B, T, C]) + - Viterbi-decoded label paths from CRF models (List[List[int]]) + + Args: + eval_pred: Either a tuple (preds, labels) or a HuggingFace EvalPrediction object. 
+ `preds` can be: + • [B, T, C] logits (e.g., output of a classifier head) + • [B, T] label IDs + • List[List[int]] variable-length decoded paths (CRF) + + Returns: + dict with: + - "eval_macro_f1": F1 averaged over classes (not tokens) + - "eval_token_accuracy": token-level accuracy (ignores -100) + - "eval_identifier_accuracy": percentage of rows where all tokens matched + + Example (logits of shape [B=2, T=3, C=4]): + preds = np.array([ + [ # Example 1 (B=0) + [0.1, 2.5, 0.3, -1.0], # Token 1 → class 1 (NM) + [1.5, 0.4, 0.2, -0.5], # Token 2 → class 0 (V) + [0.3, 0.1, 3.2, 0.0], # Token 3 → class 2 (N) + ], + [ # Example 2 (B=1) + [0.2, 0.1, 0.4, 2.1], # Token 1 → class 3 (P) + [0.9, 1.0, 0.3, 0.0], # Token 2 → class 1 (NM) + [1.1, 1.1, 1.1, 1.1], # Token 3 → tie (say model picks class 0) + ] + ]) + + Converted via argmax(preds, axis=-1): + → [[1, 0, 2], # Example 1 predictions + [3, 1, 0]] # Example 2 predictions + + Gold: [V, NM, N] → label_row = [-100, 1, -100, 2, -100, 3] + Pred: [V, NM, N] → pred_row = [1, 2, 3] + All tokens match → example_correct = True + """ + # 1) Extract predictions and labels + if isinstance(eval_pred, tuple): # older HuggingFace versions + preds, labels = eval_pred + else: # EvalPrediction object + preds = eval_pred.predictions + labels = eval_pred.label_ids + + # 2) Normalize predictions format + # Convert [B, T, C] logits → [B, T] class IDs + if isinstance(preds, np.ndarray) and preds.ndim == 3: + preds = np.argmax(preds, axis=-1) + # Convert CRF list-of-lists → numpy object array + elif isinstance(preds, list): + preds = np.array(preds, dtype=object) + + # 3) Compare predictions to labels, ignoring -100 + all_true, all_pred, id_correct_flags = [], [], [] + + for pred_row, label_row in zip(preds, labels): + ptr = 0 + example_correct = True + + for lbl in label_row: # iterate gold labels + if lbl == -100: # skip padding / specials + continue + + # pick the corresponding prediction + if isinstance(pred_row, (list, np.ndarray)): + pred_lbl = pred_row[ptr] + else: # pred_row is scalar + pred_lbl = pred_row + ptr += 1 + + all_true.append(lbl) + all_pred.append(pred_lbl) + if pred_lbl != lbl: + example_correct = False + + id_correct_flags.append(example_correct) + + # 4) Compute metrics from flattened predictions + macro_f1 = f1_score(all_true, all_pred, average="macro") + token_acc = accuracy_score(all_true, all_pred) + id_acc = float(sum(id_correct_flags)) / len(id_correct_flags) + + return { + "eval_macro_f1": macro_f1, + "eval_token_accuracy": token_acc, + "eval_identifier_accuracy": id_acc, + } + def train_lm(script_dir: str): + """ + Trains a DistilBERT+CRF model using k-fold cross-validation for token-level grammar tagging. + Performs model selection based on macro F1 score, and evaluates the best model on a final hold-out set. + + Input TSV must contain: + - SPLIT: tokenized identifier as space-separated subtokens (e.g., "get Employee Name") + - GRAMMAR_PATTERN: space-separated labels (e.g., "V NM N") + - CONTEXT: usage context string (e.g., FUNCTION, PARAMETER, ...) 
+ + Example input row: + SPLIT="get Employee Name", GRAMMAR_PATTERN="V NM N", CONTEXT="FUNCTION" + + Output: + - Trained model checkpoints (best fold + final eval) + - Hold-out predictions and metrics (saved to output/holdout_predictions.csv) + - Text report of macro-F1, token-level and identifier-level accuracy + """ # 1) Paths input_path = os.path.join(script_dir, "input", "tagger_data.tsv") output_dir = os.path.join(script_dir, "output") @@ -107,11 +218,11 @@ def train_lm(script_dir: str): # 7c) Tokenize + align labels (exactly as before) tokenized_train = fold_train_dataset.map( - lambda ex: tokenize_and_align_labels(ex, tokenizer), + lambda sample: tokenize_and_align_labels(sample, tokenizer), batched=False ) tokenized_test = fold_test_dataset.map( - lambda ex: tokenize_and_align_labels(ex, tokenizer), + lambda sample: tokenize_and_align_labels(sample, tokenizer), batched=False ) @@ -169,74 +280,18 @@ def train_lm(script_dir: str): dataloader_pin_memory=False ) - # 10) Data collator (dynamic padding) + # 10) Define collator that handles dynamic padding + label alignment + # For example, if two tokenized examples have: + # input_ids = [[101, 2121, 5661, 2171, 102], [101, 2064, 102]] + # the collator will pad them to the same length and align + # their attention_mask and labels accordingly. data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) - # 11) compute_metrics function (macro-F1) - - def compute_metrics(eval_pred): - """ - Works for both: - • Plain classifier logits → argmax along last dim - • CRF Viterbi paths (list/2‑D ndarray) → use directly - Returns: - - eval_macro_f1 - - eval_token_accuracy - - eval_identifier_accuracy - """ - # ── 1. Unpack ──────────────────────────────────────────────────── - if isinstance(eval_pred, tuple): # older HF (<4.38) - preds, labels = eval_pred - else: # EvalPrediction obj - preds = eval_pred.predictions - labels = eval_pred.label_ids - - # ── 2. Convert logits → label IDs if needed ───────────────────── - # * 3‑D tensor : [B, T, C] → argmax(C) - # * 2‑D tensor : already IDs - # * list/obj‑nd : variable‑length decode paths - if isinstance(preds, np.ndarray) and preds.ndim == 3: - preds = np.argmax(preds, axis=-1) # [B, T] - elif isinstance(preds, list): - preds = np.array(preds, dtype=object) # each row is a list - - # ── 3. Accumulate token & identifier stats ────────────────────── - all_true, all_pred, id_correct_flags = [], [], [] - - for pred_row, label_row in zip(preds, labels): - ptr = 0 - example_correct = True - - for lbl in label_row: # iterate gold labels - if lbl == -100: # skip padding / specials - continue - - # pick the corresponding prediction - if isinstance(pred_row, (list, np.ndarray)): - pred_lbl = pred_row[ptr] - else: # pred_row is scalar - pred_lbl = pred_row - ptr += 1 - - all_true.append(lbl) - all_pred.append(pred_lbl) - if pred_lbl != lbl: - example_correct = False - - id_correct_flags.append(example_correct) - - # ── 4. Metrics ────────────────────────────────────────────────── - macro_f1 = f1_score(all_true, all_pred, average="macro") - token_acc = accuracy_score(all_true, all_pred) - id_acc = float(sum(id_correct_flags)) / len(id_correct_flags) - - return { - "eval_macro_f1": macro_f1, - "eval_token_accuracy": token_acc, - "eval_identifier_accuracy": id_acc, - } - - # 12) Trainer for this fold (with EarlyStopping) + + # 11) Initialize Trainer for this fold with early stopping + # Trainer handles batching, optimizer, eval, LR scheduling, logging, etc. 
+ # We also assign the tokenizer to `trainer.tokenizer` so that + # it is correctly saved with the model and used during predict(). trainer = Trainer( model=model, args=training_args, @@ -250,10 +305,19 @@ def compute_metrics(eval_pred): # Avoid deprecation warning (explicitly set tokenizer on trainer) trainer.tokenizer = tokenizer - # 13) Train this fold + # 12) Train model on this fold + # During training, the CRF computes loss using both: + # - emission scores (per-token label likelihoods from DistilBERT) + # - transition scores (likelihoods of label sequences) + # It uses the Viterbi algorithm to find the most likely label path + # and compares it to the true label sequence to compute loss. trainer.train() - # 14) Evaluate on this fold’s held-out split + + # 13) Evaluate fold performance on validation split + # We run inference and obtain predictions as either logits (softmax) or Viterbi-decoded paths. + # Here, since we use CRF; 'preds_logits' contains Viterbi sequences of label IDs. + # We then flatten and decode both true and predicted labels for macro-F1 calculation. preds_logits, labels, _ = trainer.predict(tokenized_test) preds = np.argmax(preds_logits, axis=-1) @@ -264,6 +328,7 @@ def compute_metrics(eval_pred): for (l, p) in zip(sent_labels, sent_preds) if l != -100 ] + pred_labels_list = [ ID2LABEL[p] for sent_labels, sent_preds in zip(labels, preds) @@ -274,7 +339,8 @@ def compute_metrics(eval_pred): fold_macro_f1 = f1_score(true_labels_list, pred_labels_list, average="macro") print(f"Fold {fold} Macro F1: {fold_macro_f1:.4f}") - # 15) If this fold’s model is the best so far, save it + # 14) Save model checkpoint if this fold is the best so far + # This ensures we retain the model with highest validation performance if fold_macro_f1 > best_macro_f1: best_macro_f1 = fold_macro_f1 best_model_dir = os.path.join(output_dir, "best_model") @@ -284,14 +350,15 @@ def compute_metrics(eval_pred): fold += 1 - # 16) After all folds, report best fold‐score & load best model for final evaluation + # 15) Final summary after cross-validation + # Reports where the best model is saved and its macro F1 on fold validation data print(f"\nBest fold model saved at: {best_model_dir}, Macro F1 = {best_macro_f1:.4f}") - # 17) Final Evaluation on held-out val_df + # 16) Load best model and prepare for final evaluation on held-out set best_model = DistilBertCRFForTokenClassification.from_pretrained(best_model_dir) best_model.to(device) - # Build a fresh set of TrainingArguments that never runs evaluation epochs: + # Use new TrainingArguments to disable evaluation during predict final_args = TrainingArguments( output_dir=os.path.join(output_dir, "final_eval"), per_device_eval_batch_size=16, @@ -301,6 +368,8 @@ def compute_metrics(eval_pred): report_to="none", seed=RAND_STATE ) + + # Set up Trainer to run inference on hold-out set val_trainer = Trainer( model=best_model, args=final_args, @@ -308,7 +377,8 @@ def compute_metrics(eval_pred): data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer) # ← note: no eval_dataset here, because we’ll call .predict(...) manually ) - + + # 17) Run prediction on hold-out set and record inference time start_time = time.perf_counter() val_preds_logits, val_labels, _ = val_trainer.predict(tokenized_val) end_time = time.perf_counter() @@ -328,8 +398,7 @@ def compute_metrics(eval_pred): if l != -100 ] - # 18) Write hold-out predictions to CSV so that each row contains - # (tokens, true_tags, pred_tags) for sanity checking. 
+ # 18) Output predictions per row to CSV for inspection or error analysis from .distilbert_tagger import DistilBertTagger # Re-instantiate the exact same DistilBERT tagger we saved @@ -364,12 +433,13 @@ def compute_metrics(eval_pred): df["row_correct"] = df["true_tags"] == df["pred_tags"] id_level_acc = df["row_correct"].mean() - # Report inference speed + # Report evaluation metrics and timing info total_tokens = sum(len(ex["tokens"]) for ex in val_dataset) total_examples = len(val_dataset) elapsed = end_time - start_time final_macro_f1 = f1_score(flat_true, flat_pred, average="macro") final_accuracy = accuracy_score(flat_true, flat_pred) + print("\nFinal Evaluation on Held-Out Set:") with open('holdout_report.txt', 'w') as f: report = classification_report(flat_true, flat_pred) From bde70e21afc80ac5c3d762ebe45db7088dd1f5db Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Mon, 9 Jun 2025 23:16:50 -0400 Subject: [PATCH 47/51] Update readme with new arguments and data --- README.md | 201 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 166 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 2bf4b7f..fa1481b 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,180 @@ -# SCALAR Part-of-speech tagger +# SCALAR Part-of-Speech Tagger for Identifiers -THIS IS AN EXPERIMENTAL VERSION OF SCALAR +**SCALAR** is a part-of-speech tagger for source code identifiers. It supports two model types: -Install requirements via `pip install -r requirements.txt` +- **DistilBERT-based model with CRF layer** (Recommended: faster, more accurate) +- Legacy Gradient Boosting model (for compatibility) -Run via `python3 main --mode run --model_type lm_based` +--- -You can attempt to train it `python main --mode train --model_type lm_based` -- but I make no guarantees about how easily it will work at this stage +## Installation -It still technically supports the old gradientboost model, too... but no guarantees as to how well it functions in this branch. +Make sure you have `python3.12` installed. 
Then:
+
+```bash
+git clone https://github.com/SCANL/scanl_tagger.git
+cd scanl_tagger
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+---
+
+## Usage
+
+You can run SCALAR in multiple ways:
+
+### CLI (with DistilBERT or GradientBoosting model)
+
+```bash
+python main --mode run --model_type lm_based    # DistilBERT (recommended)
+python main --mode run --model_type tree_based  # Legacy model
+```
+
+Then query like:
+
+```
+http://127.0.0.1:8080/GetValue/FUNCTION
+```
+
+Supports context types:
+- FUNCTION
+- CLASS
+- ATTRIBUTE
+- DECLARATION
+- PARAMETER
+
+---
+
+## Training
+
+You can retrain either model (default parameters are currently hardcoded):
+
+```bash
+python main --mode train --model_type lm_based
+python main --mode train --model_type tree_based
+```
+
+---
+
+## Evaluation Results
+
+### DistilBERT (LM-Based Model) — Recommended
+
+| Metric                  | Score  |
+|-------------------------|--------|
+| **Macro F1**            | 0.9032 |
+| **Token Accuracy**      | 0.9223 |
+| **Identifier Accuracy** | 0.8291 |
+
+| Label | Precision | Recall | F1   | Support |
+|-------|-----------|--------|------|---------|
+| CJ    | 0.88      | 0.88   | 0.88 | 8       |
+| D     | 0.98      | 0.96   | 0.97 | 52      |
+| DT    | 0.95      | 0.93   | 0.94 | 45      |
+| N     | 0.94      | 0.94   | 0.94 | 418     |
+| NM    | 0.91      | 0.93   | 0.92 | 440     |
+| NPL   | 0.97      | 0.97   | 0.97 | 79      |
+| P     | 0.94      | 0.92   | 0.93 | 79      |
+| PRE   | 0.79      | 0.79   | 0.79 | 68      |
+| V     | 0.89      | 0.84   | 0.86 | 110     |
+| VM    | 0.79      | 0.85   | 0.81 | 13      |
+
+**Inference Performance:**
+- Identifiers/sec: 225.8
+
+---
+
+### Gradient Boost Model (Legacy)
+
+| Metric             | Score   |
+|--------------------|---------|
+| Accuracy           | 0.8216  |
+| Balanced Accuracy  | 0.9160  |
+| Weighted Recall    | 0.8216  |
+| Weighted Precision | 0.8245  |
+| Weighted F1        | 0.8220  |
+| Inference Time     | 249.05s |
+
+**Inference Performance:**
+- Identifiers/sec: 8.6
+
+---
+
+## Supported Tagset
+
+| Tag | Meaning                        | Examples                    |
+|-----|--------------------------------|-----------------------------|
+| N   | Noun                           | `user`, `Data`, `Array`     |
+| DT  | Determiner                     | `this`, `that`, `those`     |
+| CJ  | Conjunction                    | `and`, `or`, `but`          |
+| P   | Preposition                    | `with`, `for`, `in`         |
+| NPL | Plural Noun                    | `elements`, `indices`       |
+| NM  | Noun Modifier (adjective-like) | `max`, `total`, `employee`  |
+| V   | Verb                           | `get`, `set`, `delete`      |
+| VM  | Verb Modifier (adverb-like)    | `quickly`, `deeply`         |
+| D   | Digit                          | `1`, `2`, `10`, `0xAF`      |
+| PRE | Preamble / Prefix              | `m`, `b`, `GL`, `p`         |
+
+---
+
+## Docker Support (Legacy only)
+
+For the legacy server, you can also
+
+## Docker Support (Legacy only)
+
+For the legacy server, you can also use Docker:
+
+```bash
+docker compose pull
+docker compose up
+```
+
+---
+
+## Notes
+
+- **Kebab case** is not supported (e.g., `do-something-cool`).
+- Feature and position tokens (e.g., `@pos_0`) are inserted automatically.
+- Internally uses [WordNet](https://wordnet.princeton.edu/) for lexical features.
+- Input must be parsed into identifier tokens. We recommend [srcML](https://www.srcml.org/) but any AST-based parser works.
+
+---
+
+## Citations
+
+Please cite:
+
+```
+@inproceedings{newman2025scalar,
+  author = {Christian Newman and Brandon Scholten and Sophia Testa and others},
+  title = {SCALAR: A Part-of-speech Tagger for Identifiers},
+  booktitle = {ICPC Tool Demonstrations Track},
+  year = {2025}
+}
+
+@article{newman2021ensemble,
+  title={An Ensemble Approach for Annotating Source Code Identifiers with Part-of-speech Tags},
+  author={Newman, Christian and Decker, Michael and AlSuhaibani, Reem and others},
+  journal={IEEE Transactions on Software Engineering},
+  year={2021},
+  doi={10.1109/TSE.2021.3098242}
+}
+```
+
+---
+
+## Training Data
+
+You can find the most recent SCALAR training dataset [here](https://github.com/SCANL/scanl_tagger/blob/master/input/tagger_data.tsv).
+
+---
+
+## More from SCANL
+
+- [SCANL Website](https://www.scanl.org/)
+- [Identifier Name Structure Catalogue](https://github.com/SCANL/identifier_name_structure_catalogue)
+
+---
+
+## Trouble?
+
+Please [open an issue](https://github.com/SCANL/scanl_tagger/issues) if you encounter problems!

From 89748a09bf25154a263fc39b6c3377e52e8968ca Mon Sep 17 00:00:00 2001
From: Christian Newman
Date: Tue, 10 Jun 2025 01:57:40 -0400
Subject: [PATCH 48/51] git workflow

---
 .github/workflows/tests.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 1586a95..3cad9e6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,7 +2,7 @@ name: SCALAR Tagger CI
 
 on:
   push:
-    branches: [ master, develop ]
+    branches: [ master, develop, distilbert ]
   pull_request:
     branches: [ master, develop ]
 
@@ -78,12 +78,12 @@ jobs:
 
     - name: Start tagger server
       run: |
-        ./main -r &
+        python main --mode run --model_type lm_based
         
         # Wait for up to 5 minutes for the service to start and load models
         timeout=300
         while [ $timeout -gt 0 ]; do
-          if curl -s "http://localhost:8080/cache/numberArray/DECLARATION" > /dev/null; then
+          if curl -s "http://localhost:8080/numberArray/DECLARATION" > /dev/null; then
             echo "Service is ready"
             break
           fi
@@ -101,7 +101,7 @@ jobs:
 
     - name: Test tagger endpoint
       run: |
-        response=$(curl -s "http://localhost:8080/cache/numberArray/DECLARATION")
+        response=$(curl -s "http://localhost:8080/numberArray/DECLARATION")
         if [ -z "$response" ]; then
           echo "No response from tagger"
           exit 1

From 28dd47c1c7bbf35362d380a1847425ce67b5fa99 Mon Sep 17 00:00:00 2001
From: Christian Newman
Date: Tue, 10 Jun 2025 10:24:56 -0400
Subject: [PATCH 49/51] Starting to see if I can get Docker up again.
Update requirements with nltk for tree_based model --- Dockerfile | 9 +-------- requirements.txt | 1 + setup.py | 2 +- version.py | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index b747297..fc3234c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,11 @@ FROM python:3.12-slim -#argument to enable GPU accelaration -ARG GPU=false - # Install (and build) requirements COPY requirements.txt /requirements.txt -COPY requirements_gpu.txt /requirements_gpu.txt RUN apt-get clean && rm -rf /var/lib/apt/lists/* && \ apt-get update --fix-missing && \ apt-get install --allow-unauthenticated -y git curl && \ pip install -r requirements.txt && \ - if [ "$GPU" = true ]; then \ - pip install -r requirements_gpu.txt; \ - fi && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY . . @@ -77,6 +70,6 @@ CMD date; \ fi; \ date; \ echo "Running..."; \ - /main -r --words words/abbreviationList.csv + /main --mode train --model_type lm_based --words words/abbreviationList.csv ENV TZ=US/Michigan diff --git a/requirements.txt b/requirements.txt index 7ed1a05..0eefe27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ spiral @ git+https://github.com/cnewman/spiral.git@dff537320c15849c10e583968036d torch==2.7.1 waitress==3.0.2 gensim==4.3.3 +nltk==3.9.1 transformers[torch] diff --git a/setup.py b/setup.py index 2532143..d96cf39 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,6 @@ ], }, python_requires='>=3.12', - author="Christian Newman", + author="Christian Newman, Anthony Peruma, Brandon Scholten, Syreen Banabilah", description="A machine learning based tagger for source code analysis", ) \ No newline at end of file diff --git a/version.py b/version.py index 2cc2f7f..6003a5c 100644 --- a/version.py +++ b/version.py @@ -1,2 +1,2 @@ -__version__ = "2.1.0" # Changed to match docstring version +__version__ = "2.2.0" # Changed to match docstring version __version_info__ = tuple(int(num) for num in __version__.split(".")) \ No newline at end of file From 4ed9cd3f7e599c9d56cf24fb3f40c24a8780479e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Tue, 10 Jun 2025 10:39:57 -0400 Subject: [PATCH 50/51] add download_files() to lm execution flow for the way we are currently using cache-- this should probably be fixed --- main | 1 + 1 file changed, 1 insertion(+) diff --git a/main b/main index 536cb71..481d5ab 100755 --- a/main +++ b/main @@ -88,6 +88,7 @@ if __name__ == "__main__": start_server(temp_config=config) elif args.model_type == "lm_based": + download_files() if not args.local: start_server(temp_config={ 'script_dir': SCRIPT_DIR, From cd8d94e3cd2d77dd249b90fa7fe96a02bd69c59e Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Tue, 10 Jun 2025 11:04:44 -0400 Subject: [PATCH 51/51] Forgot to run process in the bg for github actions --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3cad9e6..2f1eccb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -78,7 +78,7 @@ jobs: - name: Start tagger server run: | - python main --mode run --model_type lm_based + python main --mode run --model_type lm_based & # Wait for up to 5 minutes for the service to start and load models timeout=300