From 3a028e8b94ec1c476341914230a79e00fa793cb5 Mon Sep 17 00:00:00 2001 From: Konstantin Korotaev <42615530+KonstantinKorotaev@users.noreply.github.com> Date: Mon, 4 Jul 2022 20:49:49 +0300 Subject: [PATCH 1/2] fix: DEV-2523: Support webhook data loading in NER ml backend example --- label_studio_ml/examples/ner/ner.py | 11 +++++++++-- .../simple_text_classifier.py | 2 +- label_studio_ml/model.py | 7 ++++--- label_studio_ml/utils.py | 16 ++++++++++++++++ 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/label_studio_ml/examples/ner/ner.py b/label_studio_ml/examples/ner/ner.py index ea2ac6ba..6ce42df4 100644 --- a/label_studio_ml/examples/ner/ner.py +++ b/label_studio_ml/examples/ner/ner.py @@ -25,9 +25,9 @@ from transformers import AdamW, get_linear_schedule_with_warmup from label_studio_ml.model import LabelStudioMLBase +from label_studio_ml.utils import get_annotated_dataset from utils import calc_slope - logger = logging.getLogger(__name__) @@ -342,7 +342,7 @@ def __init__(self, **kwargs): self.to_name = self.info['to_name'][0] self.value = self.info['inputs'][0]['value'] - if not self.train_output: + if not self.train_output or (not self.train_output.get('model_path')): self.labels = self.info['labels'] else: self.load(self.train_output) @@ -464,6 +464,13 @@ def fit( warmup_steps=0, save_steps=50, dump_dataset=True, cache_dir='~/.heartex/cache', train_logs=None, **kwargs ): + # check if training is from web hook + if kwargs.get('data'): + project_id = kwargs['data']['project']['id'] + completions = get_annotated_dataset(project_id) + # assert that there are annotations + assert len(completions) > 0 + train_logs = train_logs or os.path.join(workdir, 'train_logs') os.makedirs(train_logs, exist_ok=True) logger.debug('Prepare models') diff --git a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py index 6d81193f..9d838c1d 100644 --- 
a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py +++ b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py @@ -40,7 +40,7 @@ def __init__(self, **kwargs): self.to_name = self.info['to_name'][0] self.value = self.info['inputs'][0]['value'] - if not self.train_output: + if (not self.train_output) or (self.train_output and not self.train_output.get('model_file')): # If there is no trainings, define cold-started the simple TF-IDF text classifier self.reset_model() # This is an array of labels diff --git a/label_studio_ml/model.py b/label_studio_ml/model.py index 4d5fd776..67172779 100644 --- a/label_studio_ml/model.py +++ b/label_studio_ml/model.py @@ -29,13 +29,14 @@ from rq.job import Job from colorama import Fore -from label_studio_tools.core.utils.params import get_bool_env +from label_studio_tools.core.utils.params import get_bool_env, get_env from label_studio_tools.core.label_config import parse_config from label_studio_tools.core.utils.io import get_local_path logger = logging.getLogger(__name__) LABEL_STUDIO_ML_BACKEND_V2_DEFAULT = False +LABEL_STUDIO_STRICT_ERRORS = get_env("LS_STRICT_ERRORS", False) @attr.s class ModelWrapper(object): @@ -189,12 +190,12 @@ def _get_result_from_job_id(self, job_id): if not os.path.exists(job_dir): logger.warning(f"=> Warning: {job_id} dir doesn't exist. " f"It seems that you don't have specified model dir.") - return None + return None if LABEL_STUDIO_STRICT_ERRORS else {} result_file = os.path.join(job_dir, self.JOB_RESULT) if not os.path.exists(result_file): logger.warning(f"=> Warning: {job_id} dir doesn't contain result file. 
" f"It seems that previous training session ended with error.") - return None + return None if LABEL_STUDIO_STRICT_ERRORS else {} logger.debug(f'Read result from {result_file}') with open(result_file) as f: result = json.load(f) diff --git a/label_studio_ml/utils.py b/label_studio_ml/utils.py index be5ae742..3859e044 100644 --- a/label_studio_ml/utils.py +++ b/label_studio_ml/utils.py @@ -1,4 +1,6 @@ +import json import logging +import requests from PIL import Image @@ -48,3 +50,17 @@ def get_image_local_path(url, image_cache_dir=None, project_dir=None, image_dir= def get_image_size(filepath): return Image.open(filepath).size + + +def get_annotated_dataset(project_id, hostname=None, api_key=None): + """Just for demo purposes: retrieve annotated data from Label Studio API""" + if hostname is None: + hostname = get_env('HOSTNAME') + if api_key is None: + api_key = get_env('API_KEY') + download_url = f'{hostname.rstrip("/")}/api/projects/{project_id}/export' + response = requests.get(download_url, headers={'Authorization': f'Token {api_key}'}) + if response.status_code != 200: + raise Exception(f"Can't load task data using {download_url}, " + f"response status_code = {response.status_code}") + return json.loads(response.content) From 87a33b4f870cc3c8560cff44bc31484752391030 Mon Sep 17 00:00:00 2001 From: Konstantin Korotaev <42615530+KonstantinKorotaev@users.noreply.github.com> Date: Thu, 7 Jul 2022 11:42:51 +0300 Subject: [PATCH 2/2] Update examples to use webhook training --- label_studio_ml/examples/bert/bert_classifier.py | 7 +++++-- label_studio_ml/examples/flair/ner_ml_backend.py | 7 +++++++ label_studio_ml/examples/mmdetection/mmdetection.py | 3 +++ label_studio_ml/examples/ner/ner.py | 2 +- .../pytorch_transfer_learning/pytorch_transfer_learning.py | 6 +++++- .../simple_text_classifier/simple_text_classifier.py | 2 +- .../examples/substring_matching/substring_matching.py | 4 ++++ label_studio_ml/examples/tensorflow/mobilenet_finetune.py | 7 ++++++- 
label_studio_ml/examples/tesseract/tesseract.py | 6 ++++++ 9 files changed, 38 insertions(+), 6 deletions(-) diff --git a/label_studio_ml/examples/bert/bert_classifier.py b/label_studio_ml/examples/bert/bert_classifier.py index 8410e5c8..624adff7 100644 --- a/label_studio_ml/examples/bert/bert_classifier.py +++ b/label_studio_ml/examples/bert/bert_classifier.py @@ -12,8 +12,7 @@ from label_studio_ml.model import LabelStudioMLBase -from utils import prepare_texts, calc_slope - +from utils import prepare_texts, calc_slope, get_annotated_dataset if torch.cuda.is_available(): device = torch.device("cuda") @@ -128,6 +127,10 @@ def predict(self, tasks, **kwargs): return predictions def fit(self, completions, workdir=None, cache_dir=None, **kwargs): + # check if training is from web hook and load tasks from api + if kwargs.get('data'): + project_id = kwargs['data']['project']['id'] + completions = get_annotated_dataset(project_id) input_texts = [] output_labels, output_labels_idx = [], [] label2idx = {l: i for i, l in enumerate(self.labels)} diff --git a/label_studio_ml/examples/flair/ner_ml_backend.py b/label_studio_ml/examples/flair/ner_ml_backend.py index 343153a0..19bc371f 100644 --- a/label_studio_ml/examples/flair/ner_ml_backend.py +++ b/label_studio_ml/examples/flair/ner_ml_backend.py @@ -9,6 +9,9 @@ import os #writing class with inheretance +from label_studio_ml.utils import get_annotated_dataset + + class SequenceTaggerModel(LabelStudioMLBase): def __init__(self, **kwargs): #initialize base class @@ -87,6 +90,10 @@ def convert_to_ls_annotation(self, flair_sentences): return results def fit(self, completions, workdir=None, **kwargs): + # check if training is from web hook + if kwargs.get('data'): + project_id = kwargs['data']['project']['id'] + completions = get_annotated_dataset(project_id) #completions contain ALL the annotated samples. #train a model from scratch here. 
flair_sents = [] diff --git a/label_studio_ml/examples/mmdetection/mmdetection.py b/label_studio_ml/examples/mmdetection/mmdetection.py index 637dd16c..5972635a 100644 --- a/label_studio_ml/examples/mmdetection/mmdetection.py +++ b/label_studio_ml/examples/mmdetection/mmdetection.py @@ -128,6 +128,9 @@ def predict(self, tasks, **kwargs): 'score': avg_score }] + def fit(self, completions, workdir=None, **kwargs): + return {} + def json_load(file, int_keys=False): with io.open(file, encoding='utf8') as f: diff --git a/label_studio_ml/examples/ner/ner.py b/label_studio_ml/examples/ner/ner.py index 6ce42df4..85373bd4 100644 --- a/label_studio_ml/examples/ner/ner.py +++ b/label_studio_ml/examples/ner/ner.py @@ -464,7 +464,7 @@ def fit( warmup_steps=0, save_steps=50, dump_dataset=True, cache_dir='~/.heartex/cache', train_logs=None, **kwargs ): - # check if training is from web hook + # check if training is from web hook and load tasks from api if kwargs.get('data'): project_id = kwargs['data']['project']['id'] completions = get_annotated_dataset(project_id) diff --git a/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py b/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py index 0dc51289..89c98aee 100644 --- a/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py +++ b/label_studio_ml/examples/pytorch_transfer_learning/pytorch_transfer_learning.py @@ -14,7 +14,7 @@ from torchvision import models, transforms from label_studio_ml.model import LabelStudioMLBase -from label_studio_ml.utils import get_single_tag_keys, get_choice, is_skipped, get_local_path +from label_studio_ml.utils import get_single_tag_keys, get_choice, is_skipped, get_local_path, get_annotated_dataset device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') @@ -177,6 +177,10 @@ def predict(self, tasks, **kwargs): return predictions def fit(self, completions, workdir=None, batch_size=32, num_epochs=10, 
**kwargs): + # check if training is from web hook and load tasks from api + if kwargs.get('data'): + project_id = kwargs['data']['project']['id'] + completions = get_annotated_dataset(project_id) image_urls, image_classes = [], [] print('Collecting annotations...') for completion in completions: diff --git a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py index 9d838c1d..dac434b2 100644 --- a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py +++ b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py @@ -102,7 +102,7 @@ def _get_annotated_dataset(self, project_id): return json.loads(response.content) def fit(self, annotations, workdir=None, **kwargs): - # check if training is from web hook + # check if training is from web hook and load tasks from api if kwargs.get('data'): project_id = kwargs['data']['project']['id'] tasks = self._get_annotated_dataset(project_id) diff --git a/label_studio_ml/examples/substring_matching/substring_matching.py b/label_studio_ml/examples/substring_matching/substring_matching.py index e269f376..979f87b4 100644 --- a/label_studio_ml/examples/substring_matching/substring_matching.py +++ b/label_studio_ml/examples/substring_matching/substring_matching.py @@ -91,3 +91,7 @@ def _extract_meta(task): meta['start'] = task['value']['start'] meta['end'] = task['value']['end'] return meta + + def fit(self, completions, workdir=None, **kwargs): + # save some training outputs to the job result + return {'random': random.randint(1, 10)} diff --git a/label_studio_ml/examples/tensorflow/mobilenet_finetune.py b/label_studio_ml/examples/tensorflow/mobilenet_finetune.py index 48b93e51..563dfd84 100644 --- a/label_studio_ml/examples/tensorflow/mobilenet_finetune.py +++ b/label_studio_ml/examples/tensorflow/mobilenet_finetune.py @@ -6,7 +6,8 @@ from PIL import Image from label_studio_ml.model import 
LabelStudioMLBase -from label_studio_ml.utils import get_image_local_path, get_single_tag_keys, get_choice, is_skipped +from label_studio_ml.utils import get_image_local_path, get_single_tag_keys, get_choice, is_skipped, \ + get_annotated_dataset logger = logging.getLogger(__name__) feature_extractor_model = 'https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4' @@ -62,6 +63,10 @@ def predict(self, tasks, **kwargs): }] def fit(self, completions, workdir=None, **kwargs): + # check if training is from web hook and load tasks from api + if kwargs.get('data'): + project_id = kwargs['data']['project']['id'] + completions = get_annotated_dataset(project_id) annotations = [] for completion in completions: diff --git a/label_studio_ml/examples/tesseract/tesseract.py b/label_studio_ml/examples/tesseract/tesseract.py index 1b00a733..b4292d3a 100644 --- a/label_studio_ml/examples/tesseract/tesseract.py +++ b/label_studio_ml/examples/tesseract/tesseract.py @@ -1,3 +1,5 @@ +import random + from PIL import Image import pytesseract as pt from label_studio_ml.model import LabelStudioMLBase @@ -74,3 +76,7 @@ def _extract_meta(task): meta["original_width"] = task['original_width'] meta["original_height"] = task['original_height'] return meta + + def fit(self, completions, workdir=None, **kwargs): + # save some training outputs to the job result + return {'random': random.randint(1, 10)} \ No newline at end of file