
Commit 1fb0839

Merge pull request #12 from brandonscholten/merge-upstream-changes
Merge upstream changes
2 parents: 76933c7 + 47dbb3b

File tree: 11 files changed, +167 −83 lines

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@ output/
 __pycache__/
 code2vec/
 cache/
-input.txt
+input.txt

Dockerfile

Lines changed: 12 additions & 4 deletions
@@ -1,11 +1,19 @@
 FROM python:3.12-slim
 
+# Argument to enable GPU acceleration
+ARG GPU=false
+
 # Install (and build) requirements
 COPY requirements.txt /requirements.txt
-RUN apt-get update && \
-    apt-get install -y git curl && \
+COPY requirements_gpu.txt /requirements_gpu.txt
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && \
+    apt-get update --fix-missing && \
+    apt-get install --allow-unauthenticated -y git curl && \
     pip install -r requirements.txt && \
+    if [ "$GPU" = true ]; then \
+        pip install -r requirements_gpu.txt; \
+    fi && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 
 COPY . .
 RUN pip install -e .

@@ -71,4 +79,4 @@ CMD date; \
     echo "Running..."; \
     /main -r --words words/abbreviationList.csv
 
-ENV TZ=US/Michigan
+ENV TZ=US/Michigan
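Because the new GPU flag is a build argument that defaults to false, a plain build still produces a CPU-only image. A minimal sketch of how the two variants might be built (the scanl-tagger image tag is illustrative, not taken from this repository):

    # CPU-only image: GPU stays at its default of false
    docker build -t scanl-tagger .

    # GPU-enabled image: the build arg triggers the extra install of requirements_gpu.txt
    docker build --build-arg GPU=true -t scanl-tagger:gpu .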

README.md

Lines changed: 9 additions & 6 deletions
@@ -63,9 +63,13 @@ options:
 
 `./main -r` will start the server, which will listen for identifier names sent via HTTP over the route:
 
-http://127.0.0.1:8080/{cache_selection}/{identifier_name}/{code_context}
+http://127.0.0.1:8080/{identifier_name}/{code_context}/{database_name (optional)}
 
-"cache selection" will save results to a separate cache if it is set to "student"
+"database name" specifies an SQLite database to be used for result caching and data collection. If the specified database does not exist, one will be created.
+
+You can check whether or not a database exists by sending an HTTP request to the `/probe` route like this:
+
+http://127.0.0.1:5000/probe/{database_name}
 
 "code context" is one of:
 - FUNCTION

@@ -76,11 +80,11 @@ http://127.0.0.1:8080/{cache_selection}/{identifier_name}/{code_context}
 
 For example:
 
-Tag a declaration: ``http://127.0.0.1:8080/cache/numberArray/DECLARATION``
+Tag a declaration: ``http://127.0.0.1:8000/numberArray/DECLARATION/database``
 
-Tag a function: ``http://127.0.0.1:8080/cache/GetNumberArray/FUNCTION``
+Tag a function: ``http://127.0.0.1:8000/GetNumberArray/FUNCTION/database``
 
-Tag an class: ``http://127.0.0.1:8080/cache/PersonRecord/CLASS``
+Tag a class: ``http://127.0.0.1:8000/PersonRecord/CLASS/database``
 
 #### Note
 Kebab case is not currently supported due to the limitations of Spiral. Attempting to send the tagger identifiers which are in kebab case will result in the entry of a single noun.

@@ -156,4 +160,3 @@ Find our other research [at our webpage](https://www.scanl.org/) and check out t
 This project uses WordNet to perform a dictionary lookup on the individual words in each identifier:
 
 Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010
-
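To make the new route layout concrete, here is a hedged sketch of two requests against a locally running server (port 8080 assumes the default compose mapping, and the database name mydb is illustrative):

    # Tag a function name, caching results and collecting data in a database named "mydb"
    curl http://127.0.0.1:8080/GetNumberArray/FUNCTION/mydb

    # Check whether the "mydb" database already exists
    curl http://127.0.0.1:8080/probe/mydb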

compose.yml

Lines changed: 1 addition & 0 deletions
@@ -20,3 +20,4 @@ services:
       - words:/words
     ports:
       - "${PORT-8080}:5000"
+    restart: always

requirements.txt

Lines changed: 1 addition & 14 deletions
@@ -17,7 +17,7 @@ filelock==3.17.0
 flair==0.15.0
 Flask==3.1.0
 fonttools==4.55.6
-fsspec==2024.12.0
+fsspec==2023.5.0
 ftfy==6.3.1
 gdown==5.2.0
 gensim==4.3.3

@@ -42,18 +42,6 @@ mpmath==1.3.0
 networkx==3.4.2
 nltk==3.9.1
 numpy==1.26.4
-nvidia-cublas-cu12==12.4.5.8
-nvidia-cuda-cupti-cu12==12.4.127
-nvidia-cuda-nvrtc-cu12==12.4.127
-nvidia-cuda-runtime-cu12==12.4.127
-nvidia-cudnn-cu12==9.1.0.70
-nvidia-cufft-cu12==11.2.1.3
-nvidia-curand-cu12==10.3.5.147
-nvidia-cusolver-cu12==11.6.1.9
-nvidia-cusparse-cu12==12.3.1.170
-nvidia-nccl-cu12==2.21.5
-nvidia-nvjitlink-cu12==12.4.127
-nvidia-nvtx-cu12==12.4.127
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0

@@ -93,7 +81,6 @@ torch==2.5.1
 tqdm==4.67.1
 transformer-smaller-training-vocab==0.4.0
 transformers==4.48.1
-triton==3.1.0
 typing_extensions==4.12.2
 tzdata==2025.1
 urllib3==2.3.0

requirements_gpu.txt

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.1.17
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.23.4
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
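For a non-Docker setup, these CUDA wheels would simply be layered on top of the base requirements; pip accepts multiple -r flags, so one command covers both files (a sketch, assuming a CUDA 12-capable machine):

    # CPU-only environment
    pip install -r requirements.txt

    # GPU environment: base requirements plus the CUDA 12 wheels
    pip install -r requirements.txt -r requirements_gpu.txt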

serve.json

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
     "port": 8080,
     "protocol": "https",
     "words":""
-}
+}

src/classifier_multiclass.py

Lines changed: 3 additions & 1 deletion
@@ -121,10 +121,12 @@ def perform_classification(X, y, results_text_file, output_directory, TrainingAl
     Returns:
         None
     """
+
     X_train, X_test, y_train, y_test, X_train_original, X_test_original = build_datasets(X, y, output_directory, trainingSeed)
     labels = np.unique(y_train, return_counts=False)
 
     algoData = TrainTestvalidationData(X_train, X_test, y_train, y_test, X_train_original, X_test_original, labels)
+
     results_text_file.write("Training Seed: %s\n" % trainingSeed)
     results_text_file.write("Classifier Seed: %s\n" % classifierSeed)
 

@@ -151,7 +153,6 @@ def write_importances(results_text_file, feature_names, presult, metric_name):
         results_text_file.write(f"{feature},{value}\n")
     results_text_file.write("\n")
 
-
 def analyzeGradientBoost(results_text_file, output_directory, scorersKey, algoData, classifierSeed, trainingSeed, columns_to_drop):
     """
     Analyze a GradientBoostingClassifier for classification and report results.

@@ -180,6 +181,7 @@ def analyzeGradientBoost(results_text_file, output_directory, scorersKey, algoDa
     print("GradientBoostingClassifier")
 
     # Drop SPLIT_IDENTIFIER and WORD columns from X_train
+
     X_train_dropped = algoData.X_train.drop(columns=columns_to_drop, errors='ignore')
 
     max_threads = max(1, multiprocessing.cpu_count() - 3)

src/download_code2vec_vectors.py

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ def create_directory_if_not_exists(directory_path):
     if not os.path.exists(directory_path):
         os.makedirs(directory_path)
 
+
 def download_files():
 
     nltk.download('averaged_perceptron_tagger_eng')

src/feature_generator.py

Lines changed: 10 additions & 0 deletions
@@ -274,6 +274,7 @@ def createFeatures(data: pd.DataFrame, feature_list: List[str], modelTokens = No
         'POS': 'NOUN', # Possessive ending
         'RP': 'NOUN', # Particle
         'X': 'NOUN', # Unknown
+
         'START':'START',
         'END':'END',
 

@@ -456,6 +457,7 @@ def compute_similarity(verb_vector, target_word, model):
     similarity = np.dot(verb_vector, target_word_vector)
     return similarity
 
+
 def contrastive_embedding(target_vector, contrast_vectors, beta=0.1):
     """
     Adjust a target embedding by pushing it away from contrast embeddings.

@@ -557,6 +559,7 @@ def createDeterminerVectorFeature(data, model):
         pandas.DataFrame: The input DataFrame with an additional 'DET_SCORE' column.
     """
     words = data["WORD"]
+
     # Convert sets to lists before merging
     non_determiners = list(nouns) + list(verbs) + list(prepositions) + list(conjunctions)
 

@@ -570,6 +573,7 @@
 
     # Compute similarity
     scores = pd.DataFrame([compute_similarity(adjusted_determiner_vector, word.lower(), model) for word in words])
+
     scores.columns = ['DET_SCORE']
 
     data = pd.concat([data, scores], axis=1)

@@ -592,6 +596,7 @@ def createPrepositionVectorFeature(data, model):
         pandas.DataFrame: The input DataFrame with an additional 'PREP_SCORE' column.
     """
     words = data["WORD"]
+
     # Convert sets to lists before merging
     non_prepositions = list(nouns) + list(verbs) + list(determiners) + list(conjunctions)
 

@@ -662,6 +667,7 @@ def createPreambleVectorFeature(name, data, model):
         The actual name of the new column will be 'name'+'PRE_SCORE' (e.g., 'CODEPRE_SCORE', 'METHODPRE_SCORE').
     """
     words = data["WORD"]
+
     # Convert sets to lists before merging
     non_preambles = list(nouns) + list(verbs) + list(determiners) + list(prepositions) + list(determiners)
 

@@ -866,10 +872,12 @@ def createIdentifierClosedSetFeature(data, conjunctions=conjunctions, determiner
         pandas.DataFrame: Updated DataFrame with a 'CONTAINSCLOSEDSET' column.
     """
     closed_set = set(conjunctions) | set(determiners) | set(prepositions)
+
     words = data["WORD"]
     isClosedSet = pd.DataFrame([1 if word in closed_set else 0 for word in words])
     isClosedSet.columns = ["CONTAINSCLOSEDSET"]
     data = pd.concat([data, isClosedSet], axis=1)
+
     return data
 
 def createIdentifierContainsVerbFeature(data, verbs=verbs):

@@ -884,10 +892,12 @@ def createIdentifierContainsVerbFeature(data, verbs=verbs):
         pandas.DataFrame: Updated DataFrame with a 'CONTAINSVERB' column.
     """
     verb_set = set(verbs)
+
     words = data["WORD"]
     isVerb = pd.DataFrame([1 if word in verb_set else 0 for word in words])
     isVerb.columns = ["CONTAINSVERB"]
     data = pd.concat([data, isVerb], axis=1)
+
     return data
 
 def addMorphologicalPluralFeature(data):
