Skip to content

Commit 087c498

Browse files
authored
Merge pull request #9 from SCANL/develop
Brandon's features
2 parents b097a36 + bc33b25 commit 087c498

File tree

10 files changed

+1074
-45
lines changed

10 files changed

+1074
-45
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
output/
22
__pycache__/
3-
code2vec/
3+
code2vec/
4+
cache/
5+
input.txt

Dockerfile

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
FROM python:3.10-slim
2+
3+
# Install (and build) requirements
4+
COPY requirements.txt /requirements.txt
5+
RUN apt-get update && \
6+
apt-get install -y git curl && \
7+
pip install -r requirements.txt && \
8+
rm -rf /var/lib/apt/lists/*
9+
10+
# nltk downloads
11+
RUN python3 -c "import nltk; nltk.download('averaged_perceptron_tagger');nltk.download('universal_tagset')"
12+
13+
# Python scripts and data
14+
COPY classifier_multiclass.py \
15+
download_code2vec_vectors.py \
16+
feature_generator.py \
17+
print_utility_functions.py \
18+
tag_identifier.py \
19+
create_models.py \
20+
serve.json \
21+
main \
22+
/.
23+
COPY input/* /input/.
24+
COPY models/model_GradientBoostingClassifier.pkl /models/.
25+
26+
CMD date; \
27+
echo "Download..."; \
28+
remote_target_date=$(curl -sI http://131.123.42.41/target_vecs.txt | grep -i "Last-Modified" | cut -d' ' -f2-); \
29+
remote_token_date=$(curl -sI http://131.123.42.41/token_vecs.txt | grep -i "Last-Modified" | cut -d' ' -f2-); \
30+
remote_words_date=$(curl -sI http://131.123.42.41/abbreviationList.csv | grep -i "Last-Modified" | cut -d' ' -f2-); \
31+
remote_dictionary_date=$(curl -sI http://131.123.42.41/en.txt | grep -i "Last-Modified" | cut -d' ' -f2-); \
32+
if [ -n "$remote_target_date" ] && [ -n "$remote_token_date" ]; then \
33+
remote_target_timestamp=$(date -d "$remote_target_date" +%s); \
34+
remote_token_timestamp=$(date -d "$remote_token_date" +%s); \
35+
remote_words_timestamp=$(date -d "$remote_words_date" +%s); \
36+
remote_dictionary_timestamp=$(date -d "$remote_dictionary_date" +%s); \
37+
if [ ! -f /code2vec/target_vecs.txt ] || [ $remote_target_timestamp -gt $(date -r /code2vec/target_vecs.txt +%s) ]; then \
38+
curl -s -o /code2vec/target_vecs.txt http://131.123.42.41/target_vecs.txt; \
39+
echo "target_vecs.txt updated"; \
40+
else \
41+
echo "target_vecs.txt not updated"; \
42+
fi; \
43+
if [ ! -f /code2vec/token_vecs.txt ] || [ $remote_token_timestamp -gt $(date -r /code2vec/token_vecs.txt +%s) ]; then \
44+
curl -s -o /code2vec/token_vecs.txt http://131.123.42.41/token_vecs.txt; \
45+
echo "token_vecs.txt updated"; \
46+
else \
47+
echo "token_vecs.txt not updated"; \
48+
fi; \
49+
if [ ! -r /words/abbreviationList.csv ] || [ $remote_words_timestamp -gt $(date -r /words/abbreviationList.csv +%s) ]; then \
50+
curl -s -o /words/abbreviationList.csv http://131.123.42.41/abbreviationList.csv; \
51+
echo "abbreviationList.csv updated"; \
52+
else \
53+
echo "abbreviationList.csv not updated"; \
54+
fi; \
55+
if [ ! -r /words/en.txt ] || [ $remote_dictionary_timestamp -gt $(date -r /words/en.txt +%s) ]; then \
56+
curl -s -o /words/en.txt http://131.123.42.41/en.txt; \
57+
echo "en.txt updated"; \
58+
else \
59+
echo "en.txt not updated"; \
60+
fi; \
61+
else \
62+
echo "Failed to retrieve Last-Modified headers"; \
63+
fi; \
64+
date; \
65+
echo "Training..."; \
66+
/main -t; \
67+
date; \
68+
echo "Running..."; \
69+
/main -r --words words/abbreviationList.csv
70+
71+
ENV TZ=US/Michigan

README.md

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
# SCALAR Part-of-speech tagger
22
This is the official release of the SCALAR Part-of-speech tagger
33

4-
**NOTE**
5-
There is a fork of SCALAR which was designed to handle parallel http requests and cache SCALAR's output to increase its speed. You can find this version here: https://github.com/brandonscholten/scanl_tagger. These will be combined into a single application in the *very* near future.
4+
## Getting Started with Docker
5+
6+
To run the SCANL tagger in a Docker container you can clone the repository and pull the latest docker image from `srcml/scanl_tagger:latest`
7+
8+
```
9+
git clone https://github.com/brandonscholten/scanl_tagger.git
10+
cd scanl_tagger
11+
docker compose pull
12+
docker compose up
13+
```
614

715
## Setup and Run
8-
You will need `python3` installed. We will explicitly use the `python3` command below but, of course, if your environment is configured to use python3 by default, you do not need to. We have also only tested this on **Ubuntu 22** and **Ubuntu via WSL**. It most likely works in similar environments, but no guarantees.
16+
You will need `python3.10` installed.
917

1018
You'll need to install `pip` -- https://pip.pypa.io/en/stable/installation/
1119

@@ -19,17 +27,29 @@ Finally, we require the `token` and `target` vectors from [code2vec](https://git
1927

2028
## Usage
2129

22-
```bash
23-
python main.py -v # Display the application version.
24-
python main.py -r # Start the server for tagging requests.
25-
python main.py -t # Run the training set to retrain the model.
30+
```
31+
usage: main [-h] [-v] [-r] [-t] [-a ADDRESS] [--port PORT] [--protocol PROTOCOL]
32+
[--words WORDS]
33+
34+
options:
35+
-h, --help show this help message and exit
36+
-v, --version print tagger application version
37+
-r, --run run server for part of speech tagging requests
38+
-t, --train run training set to retrain the model
39+
-a ADDRESS, --address ADDRESS
40+
configure server address
41+
--port PORT configure server port
42+
--protocol PROTOCOL configure whether the server uses http or https
43+
--words WORDS provide path to a list of acceptable abbreviations
2644
```
2745

28-
`python main.py -r` will start the server, which will listen for identifier names sent via HTTP over the route:
46+
`./main -r` will start the server, which will listen for identifier names sent via HTTP over the route:
2947

30-
http://127.0.0.1:5000/{identifier_name}/{code_context}
48+
http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context}
3149

32-
Where "code context" is one of:
50+
"cache selection" will save results to a separate cache if it is set to "student"
51+
52+
"code context" is one of:
3353
- FUNCTION
3454
- ATTRIBUTE
3555
- CLASS
@@ -38,16 +58,19 @@ Where "code context" is one of:
3858

3959
For example:
4060

41-
Tag a declaration: ``http://127.0.0.1:5000/numberArray/DECLARATION``
61+
Tag a declaration: ``http://127.0.0.1:5000/cache/numberArray/DECLARATION``
62+
63+
Tag a function: ``http://127.0.0.1:5000/cache/GetNumberArray/FUNCTION``
4264

43-
Tag a function: ``http://127.0.0.1:5000/GetNumberArray/FUNCTION``
65+
Tag a class: ``http://127.0.0.1:5000/cache/PersonRecord/CLASS``
4466

45-
Tag an class: ``http://127.0.0.1:5000/PersonRecord/CLASS``
67+
#### Note
68+
Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun.
4669

4770
You will need to have a way to parse code and filter out identifier names if you want to do some on-the-fly analysis of source code. We recommend [srcML](https://www.srcml.org/). Since the actual tagger is a web server, you don't have to use srcML. You could always use other AST-based code representations, or any other method of obtaining identifier information.
4871

4972
## Training the tagger
50-
You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This is will potentially change in the future.
73+
You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This will potentially change in the future.
5174

5275
## Errors?
5376
Please make an issue if you run into errors
@@ -63,3 +86,9 @@ The data used to train this tagger can be found in the most recent database upda
6386

6487
# Interested in our other work?
6588
Find our other research [at our webpage](https://www.scanl.org/) and check out the [Identifier Name Structure Catalogue](https://github.com/SCANL/identifier_name_structure_catalogue)
89+
90+
# WordNet
91+
This project uses WordNet to perform a dictionary lookup on the individual words in each identifier:
92+
93+
Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010
94+

compose.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
volumes:
2+
vectors:
3+
gensim:
4+
cache:
5+
words:
6+
7+
services:
8+
9+
tagger:
10+
image: srcml/scanl_tagger
11+
build:
12+
context: ./
13+
platforms:
14+
- ${BUILD_PLATFORM:-linux/arm64}
15+
- ${BUILD_PLATFORM:-linux/amd64}
16+
volumes:
17+
- vectors:/code2vec
18+
- gensim:/root/gensim-data
19+
- cache:/cache
20+
- words:/words
21+
ports:
22+
- "${PORT-8080}:5000"

main.py renamed to main

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python
2+
13
import os, sqlite3, classifier_multiclass, random, nltk, argparse
24
from datetime import datetime
35
import classifier_multiclass
@@ -6,6 +8,7 @@
68
import numpy as np
79
from tag_identifier import start_server
810
from download_code2vec_vectors import *
11+
import json
912
from create_models import createModel, stable_features, mutable_feature_list
1013

1114
# Get the directory of the current script
@@ -136,11 +139,16 @@ def train(config):
136139
- To check the application version, use: -v or --version.
137140
- To start a server for part-of-speech tagging requests, use: -r or --run.
138141
- To run a training set and retrain the model, use: -t or --train.
142+
- To update server configuration, use -c or --config
139143
140144
Example Usage:
141-
python script.py -v # Display the application version.
142-
python script.py -r # Start the server for tagging requests.
143-
python script.py -t # Run the training set to retrain the model.
145+
python script.py -v # Display the application version.
146+
python script.py -r # Start the server for tagging requests.
147+
python script.py -t # Run the training set to retrain the model.
148+
python script.py -a --address [host] # Run the tagger on a specific IP address
149+
python script.py --port [port] # Run the tagger on a specific port
150+
python script.py --protocol [http/https] # Specify use of http or https
151+
python script.py --words [path] # Specify path of word list
144152
145153
Note:
146154
If no arguments are provided or if there is an invalid argument, the script will display usage instructions.
@@ -153,14 +161,24 @@ def train(config):
153161
parser.add_argument("-v", "--version", action="store_true", help="print tagger application version")
154162
parser.add_argument("-r", "--run", action="store_true", help="run server for part of speech tagging requests")
155163
parser.add_argument("-t", "--train", action="store_true", help="run training set to retrain the model")
164+
parser.add_argument("-a", "--address", nargs=1, action="store", help="configure server address", )
165+
parser.add_argument("--port", nargs=1, action="store", help="configure server port")
166+
parser.add_argument("--protocol", nargs=1, action="store", help="configure whether the server uses http or https")
167+
parser.add_argument("--words", nargs=1, action="store", help="provide path to a list of acceptable abbreviations")
156168

157169
args = parser.parse_args()
158170

159171
if args.version:
160172
print("SCANL Tagger version 1.5.0")
161173
elif args.run:
162174
download_files()
163-
start_server()
175+
temp_config = {}
176+
print(args)
177+
if args.address != None: temp_config["address"] = args.address[0]
178+
if args.port != None: temp_config["port"] = args.port[0]
179+
if args.protocol != None: temp_config["protocol"] = args.protocol[0]
180+
if args.words != None: temp_config["words"] = args.words[0]
181+
start_server(temp_config)
164182
elif args.train:
165183
download_files()
166184
# Define a configuration dictionary and pass it to the train function
@@ -177,4 +195,4 @@ def train(config):
177195
}
178196
train(config)
179197
else:
180-
parser.print_usage()
198+
parser.print_usage()

requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
utils
12
flair
23
Flask
34
gensim
@@ -9,4 +10,6 @@ numpy
910
pandas
1011
Requests
1112
scikit_learn
12-
scipy
13+
scipy
14+
git+https://github.com/cnewman/spiral.git
15+
waitress

scripts/test_waitress.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import threading
2+
import requests
3+
4+
host = input('hostname? [0.0.0.0]: ')
5+
port = input('port? [5000]: ')
6+
7+
def req(word,type,id):
8+
if host == '' or port == '':
9+
print(str(id) + ": " + requests.get(url = f'http://127.0.0.1:5000/{word}/{type}', params = {}).text)
10+
else:
11+
print(str(id) + ": " + requests.get(url = f'http://{host}:{port}/{word}/{type}', params = {}).text)
12+
13+
r1 = threading.Thread(target=req, args=("numberArray","DECLARATION",1))
14+
r2 = threading.Thread(target=req, args=("GetNumberArray","FUNCTION",2))
15+
r3 = threading.Thread(target=req, args=("PersonRecord","CLASS",3))
16+
17+
r1.start()
18+
r2.start()
19+
r3.start()
20+
21+
r1.join()
22+
r2.join()
23+
r3.join()

serve.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"address": "0.0.0.0",
3+
"port": 5000,
4+
"protocol": "https",
5+
"words":""
6+
}

0 commit comments

Comments
 (0)