SCANL
diff --git a/‎.github/workflows/tests.yml‎
Lines changed: 115 additions & 0 deletions b/‎.github/workflows/tests.yml‎
Lines changed: 115 additions & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 15 additions & 12 deletions b/‎Dockerfile‎
Lines changed: 15 additions & 12 deletions
diff --git a/‎README.md‎
Lines changed: 66 additions & 8 deletions b/‎README.md‎
Lines changed: 66 additions & 8 deletions
diff --git a/‎__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎__init__.py‎
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,115 @@
+name: 'SCALAR Tagger CI'
+
+on:  # triggers: pushes to, and pull requests targeting, main or develop
+  push:
+    branches: ['main', 'develop']
+  pull_request:
+    branches: ['main', 'develop']
+
+jobs:
+  test-docker:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Pull pre-built image  # NOTE(review): exercises the published image, not one built from this commit
+        run: docker pull sourceslicer/scalar_tagger:latest
+
+      - name: Start container  # named so later steps can address it even after it has exited
+        run: |
+          docker run -d --name scalar_tagger -p 8080:8080 sourceslicer/scalar_tagger:latest
+
+      - name: Wait for service to start
+        run: |
+          # Poll every 5 seconds, for up to 10 minutes; -f makes an HTTP error status count as "not ready"
+          timeout=600
+          while [ "$timeout" -gt 0 ]; do
+            if curl -sf "http://localhost:8080/cache/numberArray/DECLARATION" > /dev/null; then
+              echo "Service is ready"
+              break
+            fi
+            echo "Waiting for service to start... ($timeout seconds remaining)"
+            sleep 5
+            timeout=$((timeout - 5))
+          done
+
+          if [ "$timeout" -le 0 ]; then
+            echo "Service failed to start within timeout"
+            docker logs scalar_tagger  # by name: "docker ps -q" lists nothing once the container has crashed
+            exit 1
+          fi
+
+      - name: Test tagger endpoint
+        run: |
+          response=$(curl -sf "http://localhost:8080/cache/numberArray/DECLARATION" || true)  # -f: HTTP errors yield an empty response
+          if [ -z "$response" ]; then
+            echo "No response from tagger"
+            exit 1
+          fi
+          echo "Received response: $response"
+
+  test-native:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Create and activate virtual environment
+        run: |
+          python -m venv /tmp/tagger
+          echo "/tmp/tagger/bin" >> "$GITHUB_PATH"  # sourcing bin/activate would not outlive this step's shell
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Cache FastText model  # placed before the download so a cache hit is restored first
+        uses: actions/cache@v4
+        with:
+          path: ~/gensim-data/fasttext-wiki-news-subwords-300*  # gensim's default dir (unless GENSIM_DATA_DIR is set)
+          key: ${{ runner.os }}-fasttext-model
+
+      - name: Download FastText model
+        run: |
+          python -c "
+          import gensim.downloader as api
+          print('Downloading FastText model...')
+          _ = api.load('fasttext-wiki-news-subwords-300')
+          print('FastText model downloaded successfully')
+          "
+
+      - name: Start tagger server
+        run: |
+          ./main -r &
+
+          # Poll every 10 seconds, for up to 5 minutes; -f makes an HTTP error status count as "not ready"
+          timeout=300
+          while [ "$timeout" -gt 0 ]; do
+            if curl -sf "http://localhost:8080/cache/numberArray/DECLARATION" > /dev/null; then
+              echo "Service is ready"
+              break
+            fi
+            echo "Waiting for service to start... ($timeout seconds remaining)"
+            sleep 10
+            timeout=$((timeout - 10))
+          done
+
+          if [ "$timeout" -le 0 ]; then
+            echo "Service failed to start within timeout"
+            # Print logs or debug information
+            cat logs/*.log 2>/dev/null || true
+            exit 1
+          fi
+
+      - name: Test tagger endpoint
+        run: |
+          response=$(curl -sf "http://localhost:8080/cache/numberArray/DECLARATION" || true)  # -f: HTTP errors yield an empty response
+          if [ -z "$response" ]; then
+            echo "No response from tagger"
+            exit 1
+          fi
+          echo "Received response: $response"
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.12-slim
 
 # Install (and build) requirements
 COPY requirements.txt /requirements.txt
@@ -7,16 +7,22 @@ RUN apt-get update && \
     pip install -r requirements.txt && \
     rm -rf /var/lib/apt/lists/*
 
+COPY . .
+RUN pip install -e .
+
+# Download FastText model during build
+RUN python3 -c "import gensim.downloader as api; api.load('fasttext-wiki-news-subwords-300')"
+
 # nltk downloads
 RUN python3 -c "import nltk; nltk.download('averaged_perceptron_tagger');nltk.download('universal_tagset')"
 
-# Pythong scripts and data
-COPY classifier_multiclass.py \
-     download_code2vec_vectors.py \
-     feature_generator.py \
-     print_utility_functions.py \
-     tag_identifier.py \
-     create_models.py \
+# Python scripts and data
+COPY src/classifier_multiclass.py \
+     src/download_code2vec_vectors.py \
+     src/feature_generator.py \
+     src/tag_identifier.py \
+     src/create_models.py \
+     version.py \
      serve.json \
      main \
      /.
@@ -62,10 +68,7 @@ CMD date; \
         echo "Failed to retrieve Last-Modified headers"; \
     fi; \
     date; \
-    echo "Training..."; \
-    /main -t; \
-    date; \
     echo "Running..."; \
     /main -r --words words/abbreviationList.csv
 
-ENV TZ=US/Michigan
+ENV TZ=US/Michigan
@@ -1,27 +1,36 @@
 # SCALAR Part-of-speech tagger
 This is the official release of the SCALAR Part-of-speech tagger.
 
+There are two ways to run the tagger. This document describes both ways.
+
+1. Using Docker compose (which runs the tagger's built-in server for you)
+2. Running the tagger's built-in server without Docker
+
 ## Getting Started with Docker
 
 To run the SCALAR tagger in a Docker container, you can clone the repository and pull the latest Docker image from `srcml/scanl_tagger:latest`
 
+Make sure you have Docker and Docker Compose installed:
+https://docs.docker.com/engine/install/
+https://docs.docker.com/compose/install/
+
 ```
-git clone https://github.com/brandonscholten/scanl_tagger.git
+git clone git@github.com:SCANL/scanl_tagger.git
 cd scanl_tagger
 docker compose pull
 docker compose up
 ```
 
-## Setup and Run
-You will need `python3.10` installed. 
+## Getting Started without Docker
+You will need `python3.12` installed. 
 
 You'll need to install `pip` -- https://pip.pypa.io/en/stable/installation/
 
-After it's installed, in the root of the repo, run `pip install -r requirements.txt`
+Set up a virtual environment: `python -m venv /tmp/tagger` -- feel free to put it somewhere else (change /tmp/tagger) if you prefer
 
-Finally, you need to install Spiral, which we use for identifier splitting. The current version of Spiral on the official repo has a [problem](https://github.com/casics/spiral/issues/4), so consider installing the one from the link below:
+Activate the virtual environment: `source /tmp/tagger/bin/activate` (you can find how to activate it here if `source` does not work for you -- https://docs.python.org/3/library/venv.html#how-venvs-work)
 
-    pip install git+https://github.com/cnewman/spiral.git
+After it's installed and your virtual environment is activated, in the root of the repo, run `pip install -r requirements.txt`
 
 Finally, we require the `token` and `target` vectors from [code2vec](https://github.com/tech-srl/code2vec). The tagger will attempt to automatically download them if it doesn't find them, but you could download them yourself if you like. It will place them in your local directory under `./code2vec/*`
 
@@ -47,6 +56,8 @@ options:
 
 http://127.0.0.1:5000/{cache_selection}/{identifier_name}/{code_context}
 
+**NOTE:** On Docker, the port is 8080 instead of 5000.
+
 "cache selection" will save results to a separate cache if it is set to "student"
 
 "code context" is one of:
@@ -69,15 +80,62 @@ Kebab case is not currently supported due to the limitations of Spiral. Attempti
 
 You will need to have a way to parse code and filter out identifier names if you want to do some on-the-fly analysis of source code. We recommend [srcML](https://www.srcml.org/). Since the actual tagger is a web server, you don't have to use srcML. You could always use other AST-based code representations, or any other method of obtaining identifier information. 
 
+
+## Tagset
+
+**Supported Tagset**
+| Abbreviation |                 Expanded Form                |                   Examples                   |
+|:------------:|:--------------------------------------------:|:--------------------------------------------:|
+|       N      |                     noun                     | Disneyland, shoe, faucet, mother             |
+|      DT      |                  determiner                  | the, this, that, these, those, which         |
+|      CJ      |                  conjunction                 | and, for, nor, but, or, yet, so              |
+|       P      |                  preposition                 | behind, in front of, at, under, above        |
+|      NPL     |                  noun plural                 | Streets, cities, cars, people, lists         |
+|      NM      | noun modifier  (**noun-adjunct**, adjective) | red, cold, hot, **bit**Set, **employee**Name |
+|       V      |                     verb                     | Run, jump, spin,                             |
+|      VM      |            verb modifier  (adverb)           | Very, loudly, seriously, impatiently         |
+|       D      |                     digit                    | 1, 2, 10, 4.12, 0xAF                         |
+|      PRE     |                   preamble                   | Gimp, GLEW, GL, G, p, m, b                   |
+
+**Penn Treebank to SCALAR tagset**
+
+|   Penn Treebank Annotation  | SCALAR Tagset            |
+|:---------------------------:|:------------------------:|
+|       Conjunction (CC)      |     Conjunction (CJ)     |
+|          Digit (CD)         |         Digit (D)        |
+|       Determiner (DT)       |      Determiner (DT)     |
+|      Foreign Word (FW)      |         Noun (N)         |
+|       Preposition (IN)      |      Preposition (P)     |
+|        Adjective (JJ)       |    Noun Modifier (NM)    |
+| Comparative Adjective (JJR) |    Noun Modifier (NM)    |
+| Superlative Adjective (JJS) |    Noun Modifier (NM)    |
+|        List Item (LS)       |         Noun (N)         |
+|          Modal (MD)         |         Verb (V)         |
+|      Noun Singular (NN)     |         Noun (N)         |
+|      Proper Noun (NNP)      |         Noun (N)         |
+|  Proper Noun Plural (NNPS)  |     Noun Plural (NPL)    |
+|      Noun Plural (NNS)      |     Noun Plural (NPL)    |
+|         Adverb (RB)         |    Verb Modifier (VM)    |
+|   Comparative Adverb (RBR)  |    Verb Modifier (VM)    |
+|        Particle (RP)        |    Verb Modifier (VM)    |
+|         Symbol (SYM)        |         Noun (N)         |
+|     To Preposition (TO)     |      Preposition (P)     |
+|          Verb (VB)          |         Verb (V)         |
+|          Verb (VBD)         |         Verb (V)         |
+|          Verb (VBG)         |         Verb (V)         |
+|          Verb (VBN)         |         Verb (V)         |
+|          Verb (VBP)         |         Verb (V)         |
+|          Verb (VBZ)         |         Verb (V)         |
+
 ## Training the tagger
 You can train this tagger using the `-t` option (which will re-run the training routine). For the moment, most of this is hard-coded in, so if you want to use a different data set/different seeds, you'll need to modify the code. This will potentially change in the future.
 
 ## Errors?
 Please make an issue if you run into errors
 
-# Please Cite the Paper!
+# Please Cite the Paper(s)!
 
-No paper for now however the current tagger is based on our previous, so you could cite the previous one for now: 
+Newman, Christian, Scholten, Brandon, Testa, Sophia, Behler, Joshua, Banabilah, Syreen, Collard, Michael L., Decker, Michael, Mkaouer, Mohamed Wiem, Zampieri, Marcos, Alomar, Eman Abdullah, Alsuhaibani, Reem, Peruma, Anthony, Maletic, Jonathan I., (2025), “SCALAR: A Part-of-speech Tagger for Identifiers”, in the Proceedings of the 33rd IEEE/ACM International Conference on Program Comprehension - Tool Demonstrations Track (ICPC), Ottawa, ON, Canada, April 27-28, 5 pages TO APPEAR.
 
 Christian  D.  Newman,  Michael  J.  Decker,  Reem  S.  AlSuhaibani,  Anthony  Peruma,  Satyajit  Mohapatra,  Tejal  Vishnoi, Marcos Zampieri, Mohamed W. Mkaouer, Timothy J. Sheldon, and Emily Hill, "An Ensemble Approach for Annotating Source Code Identifiers with Part-of-speech Tags," in IEEE Transactions on Software Engineering, doi: 10.1109/TSE.2021.3098242.
 
 
@@ -0,0 +1 @@
+from .version import __version__, __version_info__
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .version import __version__, __version_info__`