From 3a028e8b94ec1c476341914230a79e00fa793cb5 Mon Sep 17 00:00:00 2001
From: Konstantin Korotaev
 <42615530+KonstantinKorotaev@users.noreply.github.com>
Date: Mon, 4 Jul 2022 20:49:49 +0300
Subject: [PATCH 1/2] fix: DEV-2523: Support webhook data loading in NER ml
 backend example

---
 label_studio_ml/examples/ner/ner.py              | 11 +++++++++--
 .../simple_text_classifier.py                    |  2 +-
 label_studio_ml/model.py                         |  7 ++++---
 label_studio_ml/utils.py                         | 16 ++++++++++++++++
 4 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/label_studio_ml/examples/ner/ner.py b/label_studio_ml/examples/ner/ner.py
index ea2ac6ba..6ce42df4 100644
--- a/label_studio_ml/examples/ner/ner.py
+++ b/label_studio_ml/examples/ner/ner.py
@@ -25,9 +25,9 @@
 from transformers import AdamW, get_linear_schedule_with_warmup
 
 from label_studio_ml.model import LabelStudioMLBase
+from label_studio_ml.utils import get_annotated_dataset
 from utils import calc_slope
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -342,7 +342,7 @@ def __init__(self, **kwargs):
         self.to_name = self.info['to_name'][0]
         self.value = self.info['inputs'][0]['value']
 
-        if not self.train_output:
+        if not self.train_output or (not self.train_output.get('model_path')):
             self.labels = self.info['labels']
         else:
             self.load(self.train_output)
@@ -464,6 +464,13 @@ def fit(
         warmup_steps=0, save_steps=50, dump_dataset=True, cache_dir='~/.heartex/cache', train_logs=None,
         **kwargs
     ):
+        # check if training is from web hook
+        if kwargs.get('data'):
+            project_id = kwargs['data']['project']['id']
+            completions = get_annotated_dataset(project_id)
+        # assert that there are annotations
+        assert len(completions) > 0
+
         train_logs = train_logs or os.path.join(workdir, 'train_logs')
         os.makedirs(train_logs, exist_ok=True)
         logger.debug('Prepare models')
diff --git a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py
index 6d81193f..9d838c1d 100644
--- a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py
+++ b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py
@@ -40,7 +40,7 @@ def __init__(self, **kwargs):
         self.to_name = self.info['to_name'][0]
         self.value = self.info['inputs'][0]['value']
 
-        if not self.train_output:
+        if not self.train_output or not self.train_output.get('model_file'):
             # If there is no trainings, define cold-started the simple TF-IDF text classifier
             self.reset_model()
             # This is an array of <Choice> labels
diff --git a/label_studio_ml/model.py b/label_studio_ml/model.py
index 4d5fd776..67172779 100644
--- a/label_studio_ml/model.py
+++ b/label_studio_ml/model.py
@@ -29,13 +29,14 @@
 from rq.job import Job
 from colorama import Fore
 
-from label_studio_tools.core.utils.params import get_bool_env
+from label_studio_tools.core.utils.params import get_bool_env, get_env
 from label_studio_tools.core.label_config import parse_config
 from label_studio_tools.core.utils.io import get_local_path
 
 logger = logging.getLogger(__name__)
 
 LABEL_STUDIO_ML_BACKEND_V2_DEFAULT = False
+LABEL_STUDIO_STRICT_ERRORS = get_bool_env("LS_STRICT_ERRORS", False)  # boolean flag: parse the value rather than treating any non-empty string as true
 
 @attr.s
 class ModelWrapper(object):
@@ -189,12 +190,12 @@ def _get_result_from_job_id(self, job_id):
         if not os.path.exists(job_dir):
             logger.warning(f"=> Warning: {job_id} dir doesn't exist. "
                            f"It seems that you don't have specified model dir.")
-            return None
+            return None if LABEL_STUDIO_STRICT_ERRORS else {}
         result_file = os.path.join(job_dir, self.JOB_RESULT)
         if not os.path.exists(result_file):
             logger.warning(f"=> Warning: {job_id} dir doesn't contain result file. "
                            f"It seems that previous training session ended with error.")
-            return None
+            return None if LABEL_STUDIO_STRICT_ERRORS else {}
         logger.debug(f'Read result from {result_file}')
         with open(result_file) as f:
             result = json.load(f)
diff --git a/label_studio_ml/utils.py b/label_studio_ml/utils.py
index be5ae742..3859e044 100644
--- a/label_studio_ml/utils.py
+++ b/label_studio_ml/utils.py
@@ -1,4 +1,6 @@
+import json
 import logging
+import requests
 
 from PIL import Image
 
@@ -48,3 +50,37 @@ def get_image_local_path(url, image_cache_dir=None, project_dir=None, image_dir=
 
 def get_image_size(filepath):
     return Image.open(filepath).size
+
+
+def get_annotated_dataset(project_id, hostname=None, api_key=None):
+    """Retrieve a project's exported tasks from the Label Studio API (demo helper).
+
+    Args:
+        project_id: ID of the Label Studio project to export.
+        hostname: Base URL of the Label Studio instance; looked up with
+            ``get_env('HOSTNAME')`` when omitted.
+        api_key: Token sent in the ``Authorization`` header; looked up with
+            ``get_env('API_KEY')`` when omitted.
+
+    Returns:
+        The JSON-decoded body of the ``/api/projects/<id>/export`` response.
+
+    Raises:
+        ValueError: If the hostname or API key is neither passed nor configured.
+        Exception: If the export request does not return HTTP 200.
+    """
+    if hostname is None:
+        hostname = get_env('HOSTNAME')
+    if api_key is None:
+        api_key = get_env('API_KEY')
+    # Fail early with a clear message instead of an AttributeError on None
+    # or a request authenticated as "Token None".
+    if not hostname or not api_key:
+        raise ValueError("Label Studio hostname and API key are required: "
+                         "pass them explicitly or configure HOSTNAME and API_KEY")
+    download_url = f'{hostname.rstrip("/")}/api/projects/{project_id}/export'
+    response = requests.get(download_url, headers={'Authorization': f'Token {api_key}'})
+    if response.status_code != 200:
+        raise Exception(f"Can't load task data using {download_url}, "
+                        f"response status_code = {response.status_code}")
+    return json.loads(response.content)

From 87a33b4f870cc3c8560cff44bc31484752391030 Mon Sep 17 00:00:00 2001
From: Konstantin Korotaev
 <42615530+KonstantinKorotaev@users.noreply.github.com>
Date: Thu, 7 Jul 2022 11:42:51 +0300
Subject: [PATCH 2/2] Update examples to use webhook training

---
 label_studio_ml/examples/bert/bert_classifier.py           | 7 +++++--
 label_studio_ml/examples/flair/ner_ml_backend.py           | 7 +++++++
 label_studio_ml/examples/mmdetection/mmdetection.py        | 3 +++
 label_studio_ml/examples/ner/ner.py                        | 2 +-
 .../pytorch_transfer_learning/pytorch_transfer_learning.py | 6 +++++-
 .../simple_text_classifier/simple_text_classifier.py       | 2 +-
 .../examples/substring_matching/substring_matching.py      | 4 ++++
 label_studio_ml/examples/tensorflow/mobilenet_finetune.py  | 7 ++++++-
 label_studio_ml/examples/tesseract/tesseract.py            | 6 ++++++
 9 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/label_studio_ml/examples/bert/bert_classifier.py b/label_studio_ml/examples/bert/bert_classifier.py
index 8410e5c8..624adff7 100644
--- a/label_studio_ml/examples/bert/bert_classifier.py
+++ b/label_studio_ml/examples/bert/bert_classifier.py
@@ -12,8 +12,8 @@
 
 from label_studio_ml.model import LabelStudioMLBase
+from label_studio_ml.utils import get_annotated_dataset
 
 from utils import prepare_texts, calc_slope
-
 
 if torch.cuda.is_available():
     device = torch.device("cuda")
@@ -128,6 +128,10 @@ def predict(self, tasks, **kwargs):
         return predictions
 
     def fit(self, completions, workdir=None, cache_dir=None, **kwargs):
+        # check if training is from web hook and load tasks from api
+        if kwargs.get('data'):
+            project_id = kwargs['data']['project']['id']
+            completions = get_annotated_dataset(project_id)
         input_texts = []
         output_labels, output_labels_idx = [], []
         label2idx = {l: i for i, l in enumerate(self.labels)}
diff --git a/label_studio_ml/examples/flair/ner_ml_backend.py b/label_studio_ml/examples/flair/ner_ml_backend.py
index 343153a0..19bc371f 100644
--- a/label_studio_ml/examples/flair/ner_ml_backend.py
+++ b/label_studio_ml/examples/flair/ner_ml_backend.py
@@ -9,6 +9,9 @@
 import os
 
 #writing class with inheretance
+from label_studio_ml.utils import get_annotated_dataset
+
+
 class SequenceTaggerModel(LabelStudioMLBase):
     def __init__(self, **kwargs):
         #initialize base class
@@ -87,6 +90,10 @@ def convert_to_ls_annotation(self, flair_sentences):
         return results
     
     def fit(self, completions, workdir=None, **kwargs):
+        # check if training is from web hook
+        if kwargs.get('data'):
+            project_id = kwargs['data']['project']['id']
+            completions = get_annotated_dataset(project_id)
         #completions contain ALL the annotated samples.
         #train a model from scratch here.
         flair_sents = []
diff --git a/label_studio_ml/examples/mmdetection/mmdetection.py b/label_studio_ml/examples/mmdetection/mmdetection.py
index 637dd16c..5972635a 100644
--- a/label_studio_ml/examples/mmdetection/mmdetection.py
+++ b/label_studio_ml/examples/mmdetection/mmdetection.py
@@ -128,6 +128,9 @@ def predict(self, tasks, **kwargs):
             'score': avg_score
         }]
 
+    def fit(self, completions, workdir=None, **kwargs):
+        return {}
+
 
 def json_load(file, int_keys=False):
     with io.open(file, encoding='utf8') as f:
diff --git a/label_studio_ml/examples/ner/ner.py b/label_studio_ml/examples/ner/ner.py
index 6ce42df4..85373bd4 100644
--- a/label_studio_ml/examples/ner/ner.py
+++ b/label_studio_ml/examples/ner/ner.py
@@ -464,7 +464,7 @@ def fit(
         warmup_steps=0, save_steps=50, dump_dataset=True, cache_dir='~/.heartex/cache', train_logs=None,
         **kwargs
     ):
-        # check if training is from web hook
+        # check if training is from web hook and load tasks from api
         if kwargs.get('data'):
             project_id = kwargs['data']['project']['id']
             completions = get_annotated_dataset(project_id)
diff --git a/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py b/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py
index 0dc51289..89c98aee 100644
--- a/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py
+++ b/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py
@@ -14,7 +14,7 @@
 from torchvision import models, transforms
 
 from label_studio_ml.model import LabelStudioMLBase
-from label_studio_ml.utils import get_single_tag_keys, get_choice, is_skipped, get_local_path
+from label_studio_ml.utils import get_single_tag_keys, get_choice, is_skipped, get_local_path, get_annotated_dataset
 
 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 
@@ -177,6 +177,10 @@ def predict(self, tasks, **kwargs):
         return predictions
 
     def fit(self, completions, workdir=None, batch_size=32, num_epochs=10, **kwargs):
+        # check if training is from web hook and load tasks from api
+        if kwargs.get('data'):
+            project_id = kwargs['data']['project']['id']
+            completions = get_annotated_dataset(project_id)
         image_urls, image_classes = [], []
         print('Collecting annotations...')
         for completion in completions:
diff --git a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py
index 9d838c1d..dac434b2 100644
--- a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py
+++ b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py
@@ -102,7 +102,7 @@ def _get_annotated_dataset(self, project_id):
         return json.loads(response.content)
 
     def fit(self, annotations, workdir=None, **kwargs):
-        # check if training is from web hook
+        # check if training is from web hook and load tasks from api
         if kwargs.get('data'):
             project_id = kwargs['data']['project']['id']
             tasks = self._get_annotated_dataset(project_id)
diff --git a/label_studio_ml/examples/substring_matching/substring_matching.py b/label_studio_ml/examples/substring_matching/substring_matching.py
index e269f376..979f87b4 100644
--- a/label_studio_ml/examples/substring_matching/substring_matching.py
+++ b/label_studio_ml/examples/substring_matching/substring_matching.py
@@ -91,3 +91,7 @@ def _extract_meta(task):
             meta['start'] = task['value']['start']
             meta['end'] = task['value']['end']
         return meta
+
+    def fit(self, completions, workdir=None, **kwargs):
+        # save some training outputs to the job result
+        return {'random': random.randint(1, 10)}
diff --git a/label_studio_ml/examples/tensorflow/mobilenet_finetune.py b/label_studio_ml/examples/tensorflow/mobilenet_finetune.py
index 48b93e51..563dfd84 100644
--- a/label_studio_ml/examples/tensorflow/mobilenet_finetune.py
+++ b/label_studio_ml/examples/tensorflow/mobilenet_finetune.py
@@ -6,7 +6,8 @@
 
 from PIL import Image
 from label_studio_ml.model import LabelStudioMLBase
-from label_studio_ml.utils import get_image_local_path, get_single_tag_keys, get_choice, is_skipped
+from label_studio_ml.utils import get_image_local_path, get_single_tag_keys, get_choice, is_skipped, \
+    get_annotated_dataset
 
 logger = logging.getLogger(__name__)
 feature_extractor_model = 'https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4'
@@ -62,6 +63,10 @@ def predict(self, tasks, **kwargs):
         }]
 
     def fit(self, completions, workdir=None, **kwargs):
+        # check if training is from web hook and load tasks from api
+        if kwargs.get('data'):
+            project_id = kwargs['data']['project']['id']
+            completions = get_annotated_dataset(project_id)
 
         annotations = []
         for completion in completions:
diff --git a/label_studio_ml/examples/tesseract/tesseract.py b/label_studio_ml/examples/tesseract/tesseract.py
index 1b00a733..b4292d3a 100644
--- a/label_studio_ml/examples/tesseract/tesseract.py
+++ b/label_studio_ml/examples/tesseract/tesseract.py
@@ -1,3 +1,5 @@
+import random
+
 from PIL import Image
 import pytesseract as pt
 from label_studio_ml.model import LabelStudioMLBase
@@ -74,3 +76,7 @@ def _extract_meta(task):
             meta["original_width"] = task['original_width']
             meta["original_height"] = task['original_height']
         return meta
+
+    def fit(self, completions, workdir=None, **kwargs):
+        # save some training outputs to the job result
+        return {'random': random.randint(1, 10)}
\ No newline at end of file