diff --git a/pyproject.toml b/pyproject.toml
index ee3389f..fbd1715 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,4 +6,30 @@ requires = [
     "datasets",
     "seqeval"
 ]
-build-backend = "setuptools.build_meta"
\ No newline at end of file
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 120
+target-version = ['py36', 'py37', 'py38']
+exclude = '''
+(
+  /(
+    \.eggs
+    | \.git
+    | \.pytest_cache
+    | build
+    | dist
+  )/
+)
+'''
+[tool.pytest.ini_options]
+flake8-max-line-length = 210
+flake8-ignore = ["E203", "W503"] # See https://github.com/PyCQA/pycodestyle/issues/373
+markers = [
+    "integration",
+]
+[tool.mypy]
+ignore_missing_imports = true
+
+[tool.isort]
+profile = "black"
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 37af714..72c198b 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +1,10 @@
-transformers
-datasets
\ No newline at end of file
+pytest
+pytest-mypy
+pytest-isort
+pytest-flake8
+flake8-black
+flake8<4.0.0
+types-Deprecated
+types-dataclasses
+types-tabulate
+types-requests
\ No newline at end of file
diff --git a/src/dlkp/kp_dataset/datasets.py b/src/dlkp/kp_dataset/datasets.py
index ba14d28..97721b0 100644
--- a/src/dlkp/kp_dataset/datasets.py
+++ b/src/dlkp/kp_dataset/datasets.py
@@ -1,6 +1,8 @@
-import os, sys
+import os
+import sys
 from dataclasses import dataclass, field
 from typing import Optional
+
 from datasets import ClassLabel, load_dataset


@@ -38,9 +40,7 @@ def set_labels(self):
     def load_kp_datasets(self):
         if self.data_args.dataset_name is not None:
             # Downloading and loading a dataset from the hub.
-            self.datasets = load_dataset(
-                self.data_args.dataset_name, self.data_args.dataset_config_name
-            )
+            self.datasets = load_dataset(self.data_args.dataset_name, self.data_args.dataset_config_name)
         else:
             data_files = {}
             if self.data_args.train_file is not None:
@@ -63,9 +63,7 @@ def load_kp_datasets(self):
             column_names = self.datasets["test"].column_names
             features = self.datasets["test"].features
         else:
-            raise AssertionError(
-                "neither train, validation nor test dataset is availabel"
-            )
+            raise AssertionError("neither train, validation nor test dataset is availabel")

         if self.text_column_name is None:
             self.text_column_name = (
@@ -75,9 +73,7 @@ def load_kp_datasets(self):
         assert self.text_column_name in column_names

         if self.label_column_name is None:
-            self.label_column_name = (
-                "doc_bio_tags" if "doc_bio_tags" in column_names else None
-            )
+            self.label_column_name = "doc_bio_tags" if "doc_bio_tags" in column_names else None
             if len(column_names) > 2:
                 self.label_column_name = column_names[2]

@@ -132,11 +128,7 @@ def tokenize_and_align_labels_(self, examples):
                 # For the other tokens in a word, we set the label to either the current label or -100, depending on
                 # the label_all_tokens flag.
                 else:
-                    label_ids.append(
-                        self.label_to_id[label[word_idx]]
-                        if self.data_args.label_all_tokens
-                        else -100
-                    )
+                    label_ids.append(self.label_to_id[label[word_idx]] if self.data_args.label_all_tokens else -100)
                 # to avoid error change -100 to 'O' tag i.e.
2 class # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) previous_word_idx = word_idx @@ -165,11 +157,7 @@ def extract_kp_from_tags(self, examples, idx): ids = examples["input_ids"] atn_mask = examples["special_tokens_mask"] tokens = self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True) - tags = [ - self.id_to_label[p] - for (p, m) in zip(self.predicted_labels[idx], atn_mask) - if m == 0 - ] + tags = [self.id_to_label[p] for (p, m) in zip(self.predicted_labels[idx], atn_mask) if m == 0] assert len(tokens) == len( tags ), "number of tags (={}) in prediction and tokens(={}) are not same for {}th".format( diff --git a/src/dlkp/kp_metrics/metrics.py b/src/dlkp/kp_metrics/metrics.py index 5901ec4..cf3a7c3 100644 --- a/src/dlkp/kp_metrics/metrics.py +++ b/src/dlkp/kp_metrics/metrics.py @@ -1,6 +1,6 @@ -from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score -from seqeval.scheme import IOB2, IOB1 import numpy as np +from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score +from seqeval.scheme import IOB1, IOB2 def compute_metrics(p): @@ -27,9 +27,7 @@ def compute_metrics(p): results = {} # print("cal precisi") # mode="strict" - results["overall_precision"] = precision_score( - true_labels, true_predictions, scheme=IOB2 - ) + results["overall_precision"] = precision_score(true_labels, true_predictions, scheme=IOB2) results["overall_recall"] = recall_score(true_labels, true_predictions, scheme=IOB2) # print("cal f1") results["overall_f1"] = f1_score(true_labels, true_predictions, scheme=IOB2) diff --git a/src/dlkp/models/ke/crf/crf.py b/src/dlkp/models/ke/crf/crf.py index 27786b5..c36f5e6 100644 --- a/src/dlkp/models/ke/crf/crf.py +++ b/src/dlkp/models/ke/crf/crf.py @@ -1,10 +1,11 @@ # add models having crf classification layer with option of bilstm layers -from .crf_utils import * -from typing import List, Tuple, Dict, Union +from typing import Dict, List, Tuple, Union import torch +from .crf_utils import * + VITERBI_DECODING = Tuple[List[int], float] @@ -64,9 +65,7 @@ def reset_parameters(self): torch.nn.init.normal_(self.start_transitions) torch.nn.init.normal_(self.end_transitions) - def _input_likelihood( - self, logits: torch.Tensor, mask: torch.BoolTensor - ) -> torch.Tensor: + def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: """ Computes the (batch_size,) denominator term for the log-likelihood, which is the sum of the likelihoods across all possible state sequences. @@ -100,9 +99,7 @@ def _input_likelihood( # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension # of `inner`. Otherwise (mask == False) we want to retain the previous alpha. - alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * ( - ~mask[i] - ).view(batch_size, 1) + alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (~mask[i]).view(batch_size, 1) # Every sequence needs to end with a transition to the stop_tag. 
if self.include_start_end_transitions: @@ -113,9 +110,7 @@ def _input_likelihood( # Finally we log_sum_exp along the num_tags dim, result is (batch_size,) return logsumexp(stops) - def _joint_likelihood( - self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor - ) -> torch.Tensor: + def _joint_likelihood(self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: """ Computes the numerator term for the log-likelihood, which is just score(inputs, tags) """ @@ -163,18 +158,14 @@ def _joint_likelihood( # Add the last input if it's not masked. last_inputs = logits[-1] # (batch_size, num_tags) - last_input_score = last_inputs.gather( - 1, last_tags.view(-1, 1) - ) # (batch_size, 1) + last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)) # (batch_size, 1) last_input_score = last_input_score.squeeze() # (batch_size,) score = score + last_transition_score + last_input_score * mask[-1] return score - def forward( - self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None - ) -> torch.Tensor: + def forward(self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor: """ Computes the log likelihood. """ @@ -226,33 +217,21 @@ def viterbi_tags( transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0) # Apply transition constraints - constrained_transitions = self.transitions * self._constraint_mask[ - :num_tags, :num_tags - ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags]) + constrained_transitions = self.transitions * self._constraint_mask[:num_tags, :num_tags] + -10000.0 * ( + 1 - self._constraint_mask[:num_tags, :num_tags] + ) transitions[:num_tags, :num_tags] = constrained_transitions.data if self.include_start_end_transitions: - transitions[ + transitions[start_tag, :num_tags] = self.start_transitions.detach() * self._constraint_mask[ start_tag, :num_tags - ] = self.start_transitions.detach() * self._constraint_mask[ - start_tag, :num_tags - ].data + -10000.0 * ( - 1 - self._constraint_mask[start_tag, :num_tags].detach() - ) - transitions[ + ].data + -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach()) + transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[ :num_tags, end_tag - ] = self.end_transitions.detach() * self._constraint_mask[ - :num_tags, end_tag - ].data + -10000.0 * ( - 1 - self._constraint_mask[:num_tags, end_tag].detach() - ) + ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) else: - transitions[start_tag, :num_tags] = -10000.0 * ( - 1 - self._constraint_mask[start_tag, :num_tags].detach() - ) - transitions[:num_tags, end_tag] = -10000.0 * ( - 1 - self._constraint_mask[:num_tags, end_tag].detach() - ) + transitions[start_tag, :num_tags] = -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach()) + transitions[:num_tags, end_tag] = -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach()) best_paths = [] # Pad the max sequence length by 2 to account for start_tag + end_tag. 
diff --git a/src/dlkp/models/ke/crf/crf_trainer.py b/src/dlkp/models/ke/crf/crf_trainer.py index 073f29e..912249f 100644 --- a/src/dlkp/models/ke/crf/crf_trainer.py +++ b/src/dlkp/models/ke/crf/crf_trainer.py @@ -1,11 +1,9 @@ -from transformers import ( - Trainer, - set_seed, -) -from transformers.trainer import * -from transformers.trainer_utils import PredictionOutput from torch import nn from torch.utils.data.dataloader import DataLoader +from transformers import Trainer, set_seed +from transformers.trainer import * +from transformers.trainer_utils import PredictionOutput + # from torch.utils.data.dataset import Dataset # from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -26,17 +24,13 @@ def prediction_loop( if not isinstance(dataloader.dataset, collections.abc.Sized): raise ValueError("dataset must implement __len__") prediction_loss_only = ( - prediction_loss_only - if prediction_loss_only is not None - else self.args.prediction_loss_only + prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only ) if self.args.deepspeed and not self.args.do_train: # no harm, but flagging to the user that deepspeed config is ignored for eval # flagging only for when --do_train wasn't passed as only then it's redundant - logger.info( - "Detected the deepspeed argument but it will not be used for evaluation" - ) + logger.info("Detected the deepspeed argument but it will not be used for evaluation") model = self._wrap_model(self.model, training=False) @@ -56,39 +50,27 @@ def prediction_loop( world_size = max(1, self.args.world_size) - eval_losses_gatherer = DistributedTensorGatherer( - world_size, num_examples, make_multiple_of=batch_size - ) + eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) if not prediction_loss_only: # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass # a batch size to the sampler) make_multiple_of = None - if hasattr(dataloader, "sampler") and isinstance( - dataloader.sampler, SequentialDistributedSampler - ): + if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): make_multiple_of = dataloader.sampler.batch_size - preds_gatherer = DistributedTensorGatherer( - world_size, num_examples, make_multiple_of=make_multiple_of - ) - labels_gatherer = DistributedTensorGatherer( - world_size, num_examples, make_multiple_of=make_multiple_of - ) + preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) if self.args.past_index >= 0: self._past = None model.eval() if is_torch_tpu_available(): - dataloader = pl.ParallelLoader( - dataloader, [self.args.device] - ).per_device_loader(self.args.device) + dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) self.callback_handler.eval_dataloader = dataloader for step, inputs in enumerate(dataloader): - loss, logits, labels = self.prediction_step( - model, inputs, prediction_loss_only, ignore_keys=ignore_keys - ) + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) best_path = self.eval_step(model, logits, inputs["attention_mask"]) # best_path= self.eval_step(model, logits) @@ -112,42 +94,19 @@ def prediction_loop( # assert logits.shape==labels.shape if loss is not None: losses = 
loss.repeat(batch_size) - losses_host = ( - losses - if losses_host is None - else torch.cat((losses_host, losses), dim=0) - ) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) if logits is not None: - preds_host = ( - logits - if preds_host is None - else nested_concat(preds_host, logits, padding_index=-100) - ) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) if labels is not None: - labels_host = ( - labels - if labels_host is None - else nested_concat(labels_host, labels, padding_index=-100) - ) - self.control = self.callback_handler.on_prediction_step( - self.args, self.state, self.control - ) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if ( - self.args.eval_accumulation_steps is not None - and (step + 1) % self.args.eval_accumulation_steps == 0 - ): - eval_losses_gatherer.add_arrays( - self._gather_and_numpify(losses_host, "eval_losses") - ) + if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) if not prediction_loss_only: - preds_gatherer.add_arrays( - self._gather_and_numpify(preds_host, "eval_preds") - ) - labels_gatherer.add_arrays( - self._gather_and_numpify(labels_host, "eval_label_ids") - ) + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) # Set back to None to begin a new accumulation losses_host, preds_host, labels_host = None, None, None @@ -157,29 +116,17 @@ def prediction_loop( delattr(self, "_past") # Gather all remaining tensors and put them back on the CPU - eval_losses_gatherer.add_arrays( - self._gather_and_numpify(losses_host, "eval_losses") - ) + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) if not prediction_loss_only: - preds_gatherer.add_arrays( - self._gather_and_numpify(preds_host, "eval_preds") - ) - labels_gatherer.add_arrays( - self._gather_and_numpify(labels_host, "eval_label_ids") - ) + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) eval_loss = eval_losses_gatherer.finalize() preds = preds_gatherer.finalize() if not prediction_loss_only else None label_ids = labels_gatherer.finalize() if not prediction_loss_only else None - if ( - self.compute_metrics is not None - and preds is not None - and label_ids is not None - ): - metrics = self.compute_metrics( - EvalPrediction(predictions=preds, label_ids=label_ids) - ) + if self.compute_metrics is not None and preds is not None and label_ids is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} diff --git a/src/dlkp/models/ke/crf/crf_utils.py b/src/dlkp/models/ke/crf/crf_utils.py index e9e3818..f83c335 100644 --- a/src/dlkp/models/ke/crf/crf_utils.py +++ b/src/dlkp/models/ke/crf/crf_utils.py @@ -1,11 +1,11 @@ """ Conditional random field utilis file """ -from typing import List, Tuple, Dict, Union, Optional +import logging +import math +from typing import Dict, List, Optional, Tuple, Union import 
torch -import math -import logging # from allennlp.common.checks import ConfigurationError # import allennlp.nn.util as util @@ -15,9 +15,7 @@ # logger = logging.get_logger(__name__) -def allowed_transitions( - constraint_type: str, labels: Dict[int, str] -) -> List[Tuple[int, int]]: +def allowed_transitions(constraint_type: str, labels: Dict[int, str]) -> List[Tuple[int, int]]: """ Given labels and a constraint type, returns the allowed transitions. It will additionally include transitions for the start and end states, which are used @@ -56,16 +54,12 @@ def allowed_transitions( else: to_tag = to_label[0] to_entity = to_label[1:] - if is_transition_allowed( - constraint_type, from_tag, from_entity, to_tag, to_entity - ): + if is_transition_allowed(constraint_type, from_tag, from_entity, to_tag, to_entity): allowed.append((from_label_index, to_label_index)) return allowed -def is_transition_allowed( - constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str -): +def is_transition_allowed(constraint_type: str, from_tag: str, from_entity: str, to_tag: str, to_entity: str): """ Given a constraint type and strings `from_tag` and `to_tag` that represent the origin and destination of the transition, return whether @@ -108,9 +102,7 @@ def is_transition_allowed( from_tag in ("O", "L", "U") and to_tag in ("O", "B", "U"), # B-x can only transition to I-x or L-x # I-x can only transition to I-x or L-x - from_tag in ("B", "I") - and to_tag in ("I", "L") - and from_entity == to_entity, + from_tag in ("B", "I") and to_tag in ("I", "L") and from_entity == to_entity, ] ) elif constraint_type == "BIO": @@ -161,9 +153,7 @@ def is_transition_allowed( print("error in constrint type") -def logsumexp( - tensor: torch.Tensor, dim: int = -1, keepdim: bool = False -) -> torch.Tensor: +def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> torch.Tensor: """ A numerically stable computation of logsumexp. This is mathematically equivalent to `tensor.exp().sum(dim, keep=keepdim).log()`. This function is typically used for summing log @@ -236,15 +226,11 @@ def viterbi_decode( elif top_k >= 1: flatten_output = False else: - raise ValueError( - f"top_k must be either None or an integer >=1. Instead received {top_k}" - ) + raise ValueError(f"top_k must be either None or an integer >=1. Instead received {top_k}") sequence_length, num_tags = list(tag_sequence.size()) - has_start_end_restrictions = ( - allowed_end_transitions is not None or allowed_start_transitions is not None - ) + has_start_end_restrictions = allowed_end_transitions is not None or allowed_start_transitions is not None if has_start_end_restrictions: @@ -259,12 +245,8 @@ def viterbi_decode( # Start and end transitions are fully defined, but cannot transition between each other. - allowed_start_transitions = torch.cat( - [allowed_start_transitions, torch.tensor([-math.inf, -math.inf])] - ) - allowed_end_transitions = torch.cat( - [allowed_end_transitions, torch.tensor([-math.inf, -math.inf])] - ) + allowed_start_transitions = torch.cat([allowed_start_transitions, torch.tensor([-math.inf, -math.inf])]) + allowed_end_transitions = torch.cat([allowed_end_transitions, torch.tensor([-math.inf, -math.inf])]) # First define how we may transition FROM the start and end tags. 
new_transition_matrix[-2, :] = allowed_start_transitions diff --git a/src/dlkp/models/ke/extract_kp_text.py b/src/dlkp/models/ke/extract_kp_text.py index ba1e3b6..4c05ad4 100644 --- a/src/dlkp/models/ke/extract_kp_text.py +++ b/src/dlkp/models/ke/extract_kp_text.py @@ -1,11 +1,11 @@ -import os, json -from .extraction_utils import TrainingArguments, DataTrainingArguments, ModelArguments +import json +import os + +from .extraction_utils import DataTrainingArguments, ModelArguments, TrainingArguments from .kpe import run_extraction_model -def extract_from_text( - text_list, model_name_or_path, use_CRF=False, output_dir="eval_output" -): +def extract_from_text(text_list, model_name_or_path, use_CRF=False, output_dir="eval_output"): # if output_dir is None: # output_dir = # create a file and pass to extractor diff --git a/src/dlkp/models/ke/extraction_utils.py b/src/dlkp/models/ke/extraction_utils.py index 0ba2ebb..eeb0343 100644 --- a/src/dlkp/models/ke/extraction_utils.py +++ b/src/dlkp/models/ke/extraction_utils.py @@ -3,6 +3,7 @@ import sys from dataclasses import dataclass, field from typing import Optional + from transformers import TrainingArguments @@ -13,39 +14,27 @@ class ModelArguments: """ model_name_or_path: str = field( - metadata={ - "help": "Path to pretrained model or model identifier from huggingface.co/models" - } + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) model_family_name: str = field( default="auto", - metadata={ - "help": "name of the family of model, bert, longformer, reformer etc." - }, + metadata={"help": "name of the family of model, bert, longformer, reformer etc."}, ) config_name: Optional[str] = field( default=None, - metadata={ - "help": "Pretrained config name or path if not the same as model_name" - }, + metadata={"help": "Pretrained config name or path if not the same as model_name"}, ) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": "Pretrained tokenizer name or path if not the same as model_name" - }, + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, ) cache_dir: Optional[str] = field( default=None, - metadata={ - "help": "Where do you want to store the pretrained models downloaded from huggingface.co" - }, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) model_revision: str = field( default="main", - metadata={ - "help": "The specific model version to use (can be a branch name, tag name or commit id)." - }, + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) use_CRF: bool = field( default=False, @@ -63,9 +52,7 @@ class DataTrainingArguments: Arguments pertaining to what data we are going to input our model for training and eval. """ - task_name: Optional[str] = field( - default="token", metadata={"help": "The name of the task token, crf"} - ) + task_name: Optional[str] = field(default="token", metadata={"help": "The name of the task token, crf"}) train_file: Optional[str] = field( default=None, @@ -73,28 +60,20 @@ class DataTrainingArguments: ) validation_file: Optional[str] = field( default=None, - metadata={ - "help": "An optional input evaluation data file to evaluate on (a csv or JSON file)." 
- }, + metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, ) test_file: Optional[str] = field( default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." - }, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, ) text_column_name: Optional[str] = field( default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." - }, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, ) label_column_name: Optional[str] = field( default=None, - metadata={ - "help": "An optional input test data file to predict on (a csv or JSON file)." - }, + metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, ) train_data_percent: Optional[int] = field( default=100, @@ -134,9 +113,7 @@ class DataTrainingArguments: ) return_entity_level_metrics: bool = field( default=False, - metadata={ - "help": "Whether to return all the entity levels during evaluation or just the overall ones." - }, + metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, ) dataset_name: Optional[str] = field( default=None, @@ -144,9 +121,7 @@ class DataTrainingArguments: ) dataset_config_name: Optional[str] = field( default="extraction", - metadata={ - "help": "The configuration name of the dataset to use (via the datasets library)." - }, + metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}, ) cache_file_name: Optional[str] = field( default=None, @@ -162,9 +137,7 @@ def __post_init__(self): and self.validation_file is None and self.test_file is None ): - raise ValueError( - "Need either a dataset name or a training/validation file." - ) + raise ValueError("Need either a dataset name or a training/validation file.") else: if self.train_file is not None: extension = self.train_file.split(".")[-1] @@ -185,15 +158,10 @@ def __post_init__(self): "json", ], "`test_file` should be a csv or a json file." self.task_name = self.task_name.lower() - assert ( - self.train_data_percent + self.test_data_percent + self.valid_data_percent - == 100 - ) + assert self.train_data_percent + self.test_data_percent + self.valid_data_percent == 100 -def tokenize_and_align_labels( - examples, tokenizer, text_column_name, padding, label_column_name=None -): +def tokenize_and_align_labels(examples, tokenizer, text_column_name, padding, label_column_name=None): tokenized_inputs = tokenizer( examples[text_column_name], padding=padding, @@ -218,9 +186,7 @@ def tokenize_and_align_labels( # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: - label_ids.append( - label_to_id[label[word_idx]] if data_args.label_all_tokens else -100 - ) + label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100) # to avoid error change -100 to 'O' tag i.e. 
2 class # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2) previous_word_idx = word_idx diff --git a/src/dlkp/models/ke/kpe.py b/src/dlkp/models/ke/kpe.py index 38b1d3c..75cc7d1 100644 --- a/src/dlkp/models/ke/kpe.py +++ b/src/dlkp/models/ke/kpe.py @@ -29,34 +29,31 @@ import numpy as np import pandas as pd - import transformers from transformers import ( AutoConfig, AutoModelForTokenClassification, AutoTokenizer, + BertForTokenClassification, DataCollatorForTokenClassification, HfArgumentParser, PreTrainedTokenizerFast, Trainer, TrainingArguments, set_seed, - BertForTokenClassification, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process -from .transformer.crf_models import ( - BERT_CRFforTokenClassification, - AutoCRFforTokenClassification, -) -from .transformer.token_classification_models import ( - LongformerForTokenClassification, -) -from .crf.crf_trainer import CRF_Trainer +from ...kp_dataset.datasets import KpExtractionDatasets # from extraction_utils import ModelArguments, DataTrainingArguments from ...kp_metrics.metrics import compute_metrics -from ...kp_dataset.datasets import KpExtractionDatasets +from .crf.crf_trainer import CRF_Trainer +from .transformer.crf_models import ( + AutoCRFforTokenClassification, + BERT_CRFforTokenClassification, +) +from .transformer.token_classification_models import LongformerForTokenClassification logger = logging.getLogger(__name__) @@ -86,11 +83,7 @@ def run_extraction_model(model_args, data_args, training_args): # Detecting last checkpoint. last_checkpoint = None - if ( - os.path.isdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( @@ -132,9 +125,7 @@ def run_extraction_model(model_args, data_args, training_args): # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name - if model_args.tokenizer_name - else model_args.model_name_or_path, + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, add_prefix_space=True, @@ -156,20 +147,14 @@ def run_extraction_model(model_args, data_args, training_args): # config config = AutoConfig.from_pretrained( - model_args.config_name - if model_args.config_name - else model_args.model_name_or_path, + model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir, ) config.use_CRF = model_args.use_CRF # model - model = ( - AutoCRFforTokenClassification - if model_args.use_CRF - else AutoModelForTokenClassification - ) + model = AutoCRFforTokenClassification if model_args.use_CRF else AutoModelForTokenClassification model = model.from_pretrained( model_args.model_name_or_path, config=config, @@ -177,9 +162,7 @@ def run_extraction_model(model_args, data_args, training_args): ) # Data collator - data_collator = DataCollatorForTokenClassification( - tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None - ) + data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) # Initialize our Trainer trainer = TRAINER_DICT["crf" if model_args.use_CRF else "token"]( @@ -212,18 +195,14 @@ def run_extraction_model(model_args, data_args, training_args): writer.write(f"{key} = {value}\n") # Need to save the state, since Trainer.save_model saves only the tokenizer with the model - trainer.state.save_to_json( - os.path.join(training_args.output_dir, "trainer_state.json") - ) + trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() - output_eval_file = os.path.join( - training_args.output_dir, "eval_results_KPE.txt" - ) + output_eval_file = os.path.join(training_args.output_dir, "eval_results_KPE.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") @@ -246,25 +225,17 @@ def run_extraction_model(model_args, data_args, training_args): # for prediction in predictions # ] - output_test_results_file = os.path.join( - training_args.output_dir, "test_results.txt" - ) + output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") if trainer.is_world_process_zero(): with open(output_test_results_file, "w") as writer: for key, value in sorted(metrics.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") - output_test_predictions_file = os.path.join( - training_args.output_dir, "test_predictions.csv" - ) - output_test_predictions_BIO_file = os.path.join( - training_args.output_dir, "test_predictions_BIO.txt" - ) + output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.csv") + output_test_predictions_BIO_file = os.path.join(training_args.output_dir, "test_predictions_BIO.txt") if trainer.is_world_process_zero(): - predicted_kps = dataset.get_extracted_keyphrases( - predicted_labels=predictions - ) + predicted_kps = dataset.get_extracted_keyphrases(predicted_labels=predictions) df = pd.DataFrame.from_dict({"extractive_keyphrase": predicted_kps}) df.to_csv(output_test_predictions_file, index=False) diff --git a/src/dlkp/models/ke/transformer/crf_models.py 
b/src/dlkp/models/ke/transformer/crf_models.py index 75f2fff..322ac7e 100644 --- a/src/dlkp/models/ke/transformer/crf_models.py +++ b/src/dlkp/models/ke/transformer/crf_models.py @@ -1,17 +1,18 @@ # all token classification model with crf head -from transformers import ( - AutoModelForPreTraining, +import collections + +from transformers import ( # PretrainedModel, AutoModel, + AutoModelForPreTraining, + AutoModelForTokenClassification, BertModel, BertPreTrainedModel, LongformerModel, PreTrainedModel, - AutoModelForTokenClassification, - # PretrainedModel, ) from transformers.modeling_outputs import TokenClassifierOutput -import collections from transformers.models.longformer.modeling_longformer import LongformerPreTrainedModel + from ..crf.crf import ConditionalRandomField @@ -42,9 +43,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.base_model( input_ids, @@ -96,9 +95,7 @@ def __init__(self, config): self.classifier = nn.Linear(config.hidden_size, config.num_labels) # self.crf= nn.Linear(config.num_labels,1) # self.crf= ConditionalRandomField(self.num_labels) - self.crf = ConditionalRandomField( - self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"} - ) + self.crf = ConditionalRandomField(self.num_labels, label_encoding="BIO", idx2tag={0: "B", 1: "I", 2: "0"}) self.init_weights() def forward( @@ -113,9 +110,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -181,9 +176,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.longformer( input_ids, diff --git a/src/dlkp/models/ke/transformer/token_classification_models.py b/src/dlkp/models/ke/transformer/token_classification_models.py index d286ac1..aacf4ff 100644 --- a/src/dlkp/models/ke/transformer/token_classification_models.py +++ b/src/dlkp/models/ke/transformer/token_classification_models.py @@ -7,23 +7,21 @@ import numpy as np from datasets import ClassLabel, load_dataset, load_metric - from transformers import ( AutoConfig, + AutoModel, AutoModelForTokenClassification, AutoTokenizer, - AutoModel, DataCollatorForTokenClassification, HfArgumentParser, + LongformerForTokenClassification, PreTrainedTokenizerFast, Trainer, TrainingArguments, set_seed, - LongformerForTokenClassification, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process - logger = logging.getLogger(__name__) from transformers.models.reformer.modeling_reformer import * @@ -51,9 +49,7 @@ def forward( output_attentions=None, return_dict=None, ): - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.reformer( input_ids, diff --git a/src/run_auto_ke.py b/src/run_auto_ke.py index bbeeee5..898ae0a 100644 --- a/src/run_auto_ke.py +++ b/src/run_auto_ke.py @@ -1,10 +1,11 @@ from statistics import mode -from dlkp.models.ke.kpe import run_extraction_model + 
 from dlkp.models.ke.extraction_utils import (
     DataTrainingArguments,
     ModelArguments,
     TrainingArguments,
 )
+from dlkp.models.ke.kpe import run_extraction_model

 training_args = TrainingArguments(
     output_dir="/media/nas_mount/Debanjan/amardeep/dlkp_out/inpec_debug_eval",  # todo