Skip to content

Commit 087c498

Browse files
authored
Merge pull request #9 from SCANL/develop
Brandon's features
2 parents b097a36 + bc33b25 commit 087c498

File tree

10 files changed

+1074
-45
lines changed

10 files changed

+1074
-45
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
output/
22
__pycache__/
3-
code2vec/
3+
code2vec/
4+
cache/
5+
input.txt

Dockerfile

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
FROM python:3.10-slim
2+
3+
# Install (and build) requirements
4+
COPY requirements.txt /requirements.txt
5+
RUN apt-get update && \
6+
apt-get install -y git curl && \
7+
pip install -r requirements.txt && \
8+
rm -rf /var/lib/apt/lists/*
9+
10+
# nltk downloads
11+
RUN python3 -c "import nltk; nltk.download('averaged_perceptron_tagger');nltk.download('universal_tagset')"
12+
13+
# Python scripts and data
14+
COPY classifier_multiclass.py \
15+
download_code2vec_vectors.py \
16+
feature_generator.py \
17+
print_utility_functions.py \
18+
tag_identifier.py \
19+
create_models.py \
20+
serve.json \
21+
main \
22+
/.
23+
COPY input/* /input/.
24+
COPY models/model_GradientBoostingClassifier.pkl /models/.
25+
26+
CMD date; \
27+
echo "Download..."; \
28+
remote_target_date=$(curl -sI http://131.123.42.41/target_vecs.txt | grep -i "Last-Modified" | cut -d' ' -f2-); \
29+
remote_token_date=$(curl -sI http://131.123.42.41/token_vecs.txt | grep -i "Last-Modified" | cut -d' ' -f2-); \
30+
remote_words_date=$(curl -sI http://131.123.42.41/abbreviationList.csv | grep -i "Last-Modified" | cut -d' ' -f2-); \
31+
remote_dictionary_date=$(curl -sI http://131.123.42.41/en.txt | grep -i "Last-Modified" | cut -d' ' -f2-); \
32+
if [ -n "$remote_target_date" ] && [ -n "$remote_token_date" ]; then \
33+
remote_target_timestamp=$(date -d "$remote_target_date" +%s); \
34+
remote_token_timestamp=$(date -d "$remote_token_date" +%s); \
35+
remote_words_timestamp=$(date -d "$remote_words_date" +%s); \
36+
remote_dictionary_timestamp=$(date -d "$remote_dictionary_date" +%s); \
37+
if [ ! -f /code2vec/target_vecs.txt ] || [ $remote_target_timestamp -gt $(date -r /code2vec/target_vecs.txt +%s) ]; then \
38+
curl -s -o /code2vec/target_vecs.txt http://131.123.42.41/target_vecs.txt; \
39+
echo "target_vecs.txt updated"; \
40+
else \
41+
echo "target_vecs.txt not updated"; \
42+
fi; \
43+
if [ ! -f /code2vec/token_vecs.txt ] || [ $remote_token_timestamp -gt $(date -r /code2vec/token_vecs.txt +%s) ]; then \
44+
curl -s -o /code2vec/token_vecs.txt http://131.123.42.41/token_vecs.txt; \
45+
echo "token_vecs.txt updated"; \
46+
else \
47+
echo "token_vecs.txt not updated"; \
48+
fi; \
49+
if [ ! -r /words/abbreviationList.csv ] || [ $remote_words_timestamp -gt $(date -r /words/abbreviationList.csv +%s) ]; then \
50+
curl -s -o /words/abbreviationList.csv http://131.123.42.41/abbreviationList.csv; \
51+
echo "abbreviationList.csv updated"; \
52+
else \
53+
echo "abbreviationList.csv not updated"; \
54+
fi; \
55+
if [ ! -r /words/en.txt ] || [ $remote_dictionary_timestamp -gt $(date -r /words/en.txt +%s) ]; then \
56+
curl -s -o /words/en.txt http://131.123.42.41/en.txt; \
57+
echo "en.txt updated"; \
58+
else \
59+
echo "en.txt not updated"; \
60+
fi; \
61+
else \
62+
echo "Failed to retrieve Last-Modified headers"; \
63+
fi; \
64+
date; \
65+
echo "Training..."; \
66+
/main -t; \
67+
date; \
68+
echo "Running..."; \
69+
/main -r --words words/abbreviationList.csv
70+
71+
ENV TZ=US/Michigan

README.md

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
# SCALAR Part-of-speech tagger
22
This is the official release of the SCALAR Part-of-speech tagger
33

4-
**NOTE**
5-
There is a fork of SCALAR which was designed to handle parallel http requests and cache SCALAR's output to increase its speed. You can find this version here: https://github.com/brandonscholten/scanl_tagger. These will be combined into a single application in the *very* near future.
4+
## Getting Started with Docker
5+
6+
To run the SCANL tagger in a Docker container you can clone the repository and pull the latest docker image from `srcml/scanl_tagger:latest`
7+
8+
```
9+
git clone https://github.com/brandonscholten/scanl_tagger.git
10+
cd scanl_tagger
11+
docker compose pull
12+
docker compose up
13+
```
614

715
## Setup and Run
8-
You will need `python3` installed. We will explicitly use the `python3` command below but, of course, if your environment is configured to use python3 by default, you do not need to. We have also only tested this on **Ubuntu 22** and **Ubuntu via WSL**. It most likely works in similar environments, but no guarantees.
16+
You will need `python3.10` installed.
917

1018
You'll need to install `pip` -- https://pip.pypa.io/en/stable/installation/
1119

@@ -19,17 +27,29 @@ Finally, we require the `token` and `target` vectors from [code2vec](https://git
1927

2028
## Usage
2129

22-
```bash
23-
python main.py -v # Display the application version.
24-
python main.py -r # Start the server for tagging requests.
25-
python main.py -t # Run the training set to retrain the model.
30+
```
31+
usage: main [-h] [-v] [-r] [-t] [-a ADDRESS] [--port PORT] [--protocol PROTOCOL]
32+
[--words WORDS]
33+
34+
options:
35+
-h, --help show this help message and exit
36+
-v, --version print tagger application version
37+
-r, --run run server for part of speech tagging requests
38+
-t, --train run training set to retrain the model
39+
-a ADDRESS, --address ADDRESS
40+
configure server address
41+
--port PORT configure server port
42+
--protocol PROTOCOL configure whether the server uses http or https
43+
--words WORDS provide path to a list of acceptable abbreviations
2644
```
2745

28-
`python main.py -r` will start the server, which will listen for identifier names sent via HTTP over the route:
46+
`./main -r` will start the server, which will listen for identifier names sent via HTTP over the route:
2947

30-
http://127.0.0.1:5000/{identifier_name}/{code_context}
48+
http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context}
3149

32-
Where "code context" is one of:
50+
"cache selection" will save results to a separate cache if it is set to "student"
51+
52+
"code context" is one of:
3353
- FUNCTION
3454
- ATTRIBUTE
3555
- CLASS
@@ -38,16 +58,19 @@ Where "code context" is one of:
3858

3959
For example:
4060

41-
Tag a declaration: ``http://127.0.0.1:5000/numberArray/DECLARATION``
61+
Tag a declaration: ``http://127.0.0.1:5000/cache/numberArray/DECLARATION``
62+
63+
Tag a function: ``http://127.0.0.1:5000/cache/GetNumberArray/FUNCTION``
4264

43-
Tag a function: ``http://127.0.0.1:5000/GetNumberArray/FUNCTION``
65+
Tag a class: ``http://127.0.0.1:5000/cache/PersonRecord/CLASS``
4466

45-
Tag an class: ``http://127.0.0.1:5000/PersonRecord/CLASS``
67+
#### Note
68+
Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun.
4669

4770
You will need to have a way to parse code and filter out identifier names if you want to do some on-the-fly analysis of source code. We recommend [srcML](https://www.srcml.org/). Since the actual tagger is a web server, you don't have to use srcML. You could always use other AST-based code representations, or any other method of obtaining identifier information.
4871

4972
## Training the tagger
50-
You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This is will potentially change in the future.
73+
You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This will potentially change in the future.
5174

5275
## Errors?
5376
Please make an issue if you run into errors
@@ -63,3 +86,9 @@ The data used to train this tagger can be found in the most recent database upda
6386

6487
# Interested in our other work?
6588
Find our other research [at our webpage](https://www.scanl.org/) and check out the [Identifier Name Structure Catalogue](https://github.com/SCANL/identifier_name_structure_catalogue)
89+
90+
# WordNet
91+
This project uses WordNet to perform a dictionary lookup on the individual words in each identifier:
92+
93+
Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010
94+

compose.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
volumes:
2+
vectors:
3+
gensim:
4+
cache:
5+
words:
6+
7+
services:
8+
9+
tagger:
10+
image: srcml/scanl_tagger
11+
build:
12+
context: ./
13+
platforms:
14+
- ${BUILD_PLATFORM:-linux/arm64}
15+
- ${BUILD_PLATFORM:-linux/amd64}
16+
volumes:
17+
- vectors:/code2vec
18+
- gensim:/root/gensim-data
19+
- cache:/cache
20+
- words:/words
21+
ports:
22+
- "${PORT-8080}:5000"

main.py renamed to main

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python
2+
13
import os, sqlite3, classifier_multiclass, random, nltk, argparse
24
from datetime import datetime
35
import classifier_multiclass
@@ -6,6 +8,7 @@
68
import numpy as np
79
from tag_identifier import start_server
810
from download_code2vec_vectors import *
11+
import json
912
from create_models import createModel, stable_features, mutable_feature_list
1013

1114
# Get the directory of the current script
@@ -136,11 +139,16 @@ def train(config):
136139
- To check the application version, use: -v or --version.
137140
- To start a server for part-of-speech tagging requests, use: -r or --run.
138141
- To run a training set and retrain the model, use: -t or --train.
142+
- To update server configuration, use -c or --config
139143
140144
Example Usage:
141-
python script.py -v # Display the application version.
142-
python script.py -r # Start the server for tagging requests.
143-
python script.py -t # Run the training set to retrain the model.
145+
python script.py -v # Display the application version.
146+
python script.py -r # Start the server for tagging requests.
147+
python script.py -t # Run the training set to retrain the model.
148+
python script.py -a --address [host] # Run the tagger on a specific IP address
149+
python script.py --port [port] # Run the tagger on a specific port
150+
python script.py --protocol [http/https] # Specify use of http or https
151+
python script.py --words [path] # Specify path of word list
144152
145153
Note:
146154
If no arguments are provided or if there is an invalid argument, the script will display usage instructions.
@@ -153,14 +161,24 @@ def train(config):
153161
parser.add_argument("-v", "--version", action="store_true", help="print tagger application version")
154162
parser.add_argument("-r", "--run", action="store_true", help="run server for part of speech tagging requests")
155163
parser.add_argument("-t", "--train", action="store_true", help="run training set to retrain the model")
164+
parser.add_argument("-a", "--address", nargs=1, action="store", help="configure server address", )
165+
parser.add_argument("--port", nargs=1, action="store", help="configure server port")
166+
parser.add_argument("--protocol", nargs=1, action="store", help="configure whether the server uses http or https")
167+
parser.add_argument("--words", nargs=1, action="store", help="provide path to a list of acceptable abbreviations")
156168

157169
args = parser.parse_args()
158170

159171
if args.version:
160172
print("SCANL Tagger version 1.5.0")
161173
elif args.run:
162174
download_files()
163-
start_server()
175+
temp_config = {}
176+
print(args)
177+
if args.address != None: temp_config["address"] = args.address[0]
178+
if args.port != None: temp_config["port"] = args.port[0]
179+
if args.protocol != None: temp_config["protocol"] = args.protocol[0]
180+
if args.words != None: temp_config["words"] = args.words[0]
181+
start_server(temp_config)
164182
elif args.train:
165183
download_files()
166184
# Define a configuration dictionary and pass it to the train function
@@ -177,4 +195,4 @@ def train(config):
177195
}
178196
train(config)
179197
else:
180-
parser.print_usage()
198+
parser.print_usage()

requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
utils
12
flair
23
Flask
34
gensim
@@ -9,4 +10,6 @@ numpy
910
pandas
1011
Requests
1112
scikit_learn
12-
scipy
13+
scipy
14+
git+https://github.com/cnewman/spiral.git
15+
waitress

scripts/test_waitress.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import threading
2+
import requests
3+
4+
host = input('hostname? [0.0.0.0]: ')
5+
port = input('port? [5000]: ')
6+
7+
def req(word,type,id):
8+
if host == '' or port == '':
9+
print(str(id) + ": " + requests.get(url = f'http://127.0.0.1:5000/{word}/{type}', params = {}).text)
10+
else:
11+
print(str(id) + ": " + requests.get(url = f'http://{host}:{port}/{word}/{type}', params = {}).text)
12+
13+
r1 = threading.Thread(target=req, args=("numberArray","DECLARATION",1))
14+
r2 = threading.Thread(target=req, args=("GetNumberArray","FUNCTION",2))
15+
r3 = threading.Thread(target=req, args=("PersonRecord","CLASS",3))
16+
17+
r1.start()
18+
r2.start()
19+
r3.start()
20+
21+
r1.join()
22+
r2.join()
23+
r3.join()

serve.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"address": "0.0.0.0",
3+
"port": 5000,
4+
"protocol": "https",
5+
"words":""
6+
}

0 commit comments

Comments
 (0)