midas-research · debanjanbhucs · Mar 11, 2022
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,4 +6,30 @@ requires = [
     "datasets",
     "seqeval"
 ]
-build-backend = "setuptools.build_meta"
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+line-length = 120
+target-version = ['py36', 'py37', 'py38']
+exclude = '''
+(
+  /(
+      \.eggs
+    | \.git
+    | \.pytest_cache
+    | build
+    | dist
+  )/
+)
+'''
+[tool.pytest.ini_options]
+flake8-max-line-length = 210
+flake8-ignore = ["E203", "W503"]  # Both checks conflict with black's formatting; for E203 see https://github.com/PyCQA/pycodestyle/issues/373
+markers = [
+    "integration",
+]
+[tool.mypy]
+ignore_missing_imports = true
+
+[tool.isort]
+profile = "black"
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,2 +1,10 @@
-transformers
-datasets
+pytest
+pytest-mypy
+pytest-isort
+pytest-flake8
+flake8-black
+flake8<4.0.0
+types-Deprecated
+types-dataclasses
+types-tabulate
+types-requests
diff --git a/src/dlkp/kp_dataset/datasets.py b/src/dlkp/kp_dataset/datasets.py
@@ -1,6 +1,8 @@
-import os, sys
+import os
+import sys
 from dataclasses import dataclass, field
 from typing import Optional
+
 from datasets import ClassLabel, load_dataset
 
 
@@ -38,9 +40,7 @@ def set_labels(self):
     def load_kp_datasets(self):
         if self.data_args.dataset_name is not None:
             # Downloading and loading a dataset from the hub.
-            self.datasets = load_dataset(
-                self.data_args.dataset_name, self.data_args.dataset_config_name
-            )
+            self.datasets = load_dataset(self.data_args.dataset_name, self.data_args.dataset_config_name)
         else:
             data_files = {}
             if self.data_args.train_file is not None:
@@ -63,9 +63,7 @@ def load_kp_datasets(self):
             column_names = self.datasets["test"].column_names
             features = self.datasets["test"].features
         else:
-            raise AssertionError(
-                "neither train, validation nor test dataset is availabel"
-            )
+            raise AssertionError("neither train, validation nor test dataset is availabel")
 
         if self.text_column_name is None:
             self.text_column_name = (
@@ -75,9 +73,7 @@ def load_kp_datasets(self):
         assert self.text_column_name in column_names
 
         if self.label_column_name is None:
-            self.label_column_name = (
-                "doc_bio_tags" if "doc_bio_tags" in column_names else None
-            )
+            self.label_column_name = "doc_bio_tags" if "doc_bio_tags" in column_names else None
             if len(column_names) > 2:
                 self.label_column_name = column_names[2]
 
@@ -132,11 +128,7 @@ def tokenize_and_align_labels_(self, examples):
                 # For the other tokens in a word, we set the label to either the current label or -100, depending on
                 # the label_all_tokens flag.
                 else:
-                    label_ids.append(
-                        self.label_to_id[label[word_idx]]
-                        if self.data_args.label_all_tokens
-                        else -100
-                    )
+                    label_ids.append(self.label_to_id[label[word_idx]] if self.data_args.label_all_tokens else -100)
                     # to avoid error change -100 to 'O' tag i.e. 2 class
                     # label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else 2)
                 previous_word_idx = word_idx
@@ -165,11 +157,7 @@ def extract_kp_from_tags(self, examples, idx):
         ids = examples["input_ids"]
         atn_mask = examples["special_tokens_mask"]
         tokens = self.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
-        tags = [
-            self.id_to_label[p]
-            for (p, m) in zip(self.predicted_labels[idx], atn_mask)
-            if m == 0
-        ]
+        tags = [self.id_to_label[p] for (p, m) in zip(self.predicted_labels[idx], atn_mask) if m == 0]
         assert len(tokens) == len(
             tags
         ), "number of tags (={}) in prediction and tokens(={}) are not same for {}th".format(

diff --git a/src/dlkp/kp_metrics/metrics.py b/src/dlkp/kp_metrics/metrics.py
@@ -1,6 +1,6 @@
-from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
-from seqeval.scheme import IOB2, IOB1
 import numpy as np
+from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+from seqeval.scheme import IOB1, IOB2
 
 
 def compute_metrics(p):
@@ -27,9 +27,7 @@ def compute_metrics(p):
     results = {}
     # print("cal precisi")
     # mode="strict"
-    results["overall_precision"] = precision_score(
-        true_labels, true_predictions, scheme=IOB2
-    )
+    results["overall_precision"] = precision_score(true_labels, true_predictions, scheme=IOB2)
     results["overall_recall"] = recall_score(true_labels, true_predictions, scheme=IOB2)
     # print("cal f1")
     results["overall_f1"] = f1_score(true_labels, true_predictions, scheme=IOB2)

diff --git a/src/dlkp/models/ke/crf/crf.py b/src/dlkp/models/ke/crf/crf.py
@@ -1,10 +1,11 @@
 # add models having crf classification layer with option of bilstm layers
 
-from .crf_utils import *
-from typing import List, Tuple, Dict, Union
+from typing import Dict, List, Tuple, Union
 
 import torch
 
+from .crf_utils import *
+
 VITERBI_DECODING = Tuple[List[int], float]
 
 
@@ -64,9 +65,7 @@ def reset_parameters(self):
             torch.nn.init.normal_(self.start_transitions)
             torch.nn.init.normal_(self.end_transitions)
 
-    def _input_likelihood(
-        self, logits: torch.Tensor, mask: torch.BoolTensor
-    ) -> torch.Tensor:
+    def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
         """
         Computes the (batch_size,) denominator term for the log-likelihood, which is the
         sum of the likelihoods across all possible state sequences.
@@ -100,9 +99,7 @@ def _input_likelihood(
 
             # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension
             # of `inner`. Otherwise (mask == False) we want to retain the previous alpha.
-            alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (
-                ~mask[i]
-            ).view(batch_size, 1)
+            alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (~mask[i]).view(batch_size, 1)
 
         # Every sequence needs to end with a transition to the stop_tag.
         if self.include_start_end_transitions:
@@ -113,9 +110,7 @@ def _input_likelihood(
         # Finally we log_sum_exp along the num_tags dim, result is (batch_size,)
         return logsumexp(stops)
 
-    def _joint_likelihood(
-        self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor
-    ) -> torch.Tensor:
+    def _joint_likelihood(self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
         """
         Computes the numerator term for the log-likelihood, which is just score(inputs, tags)
         """
@@ -163,18 +158,14 @@ def _joint_likelihood(
 
         # Add the last input if it's not masked.
         last_inputs = logits[-1]  # (batch_size, num_tags)
-        last_input_score = last_inputs.gather(
-            1, last_tags.view(-1, 1)
-        )  # (batch_size, 1)
+        last_input_score = last_inputs.gather(1, last_tags.view(-1, 1))  # (batch_size, 1)
         last_input_score = last_input_score.squeeze()  # (batch_size,)
 
         score = score + last_transition_score + last_input_score * mask[-1]
 
         return score
 
-    def forward(
-        self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None
-    ) -> torch.Tensor:
+    def forward(self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor:
         """
         Computes the log likelihood.
         """
@@ -226,33 +217,21 @@ def viterbi_tags(
         transitions = torch.Tensor(num_tags + 2, num_tags + 2).fill_(-10000.0)
 
         # Apply transition constraints
-        constrained_transitions = self.transitions * self._constraint_mask[
-            :num_tags, :num_tags
-        ] + -10000.0 * (1 - self._constraint_mask[:num_tags, :num_tags])
+        constrained_transitions = self.transitions * self._constraint_mask[:num_tags, :num_tags] + -10000.0 * (
+            1 - self._constraint_mask[:num_tags, :num_tags]
+        )
         transitions[:num_tags, :num_tags] = constrained_transitions.data
 
         if self.include_start_end_transitions:
-            transitions[
+            transitions[start_tag, :num_tags] = self.start_transitions.detach() * self._constraint_mask[
                 start_tag, :num_tags
-            ] = self.start_transitions.detach() * self._constraint_mask[
-                start_tag, :num_tags
-            ].data + -10000.0 * (
-                1 - self._constraint_mask[start_tag, :num_tags].detach()
-            )
-            transitions[
+            ].data + -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach())
+            transitions[:num_tags, end_tag] = self.end_transitions.detach() * self._constraint_mask[
                 :num_tags, end_tag
-            ] = self.end_transitions.detach() * self._constraint_mask[
-                :num_tags, end_tag
-            ].data + -10000.0 * (
-                1 - self._constraint_mask[:num_tags, end_tag].detach()
-            )
+            ].data + -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach())
         else:
-            transitions[start_tag, :num_tags] = -10000.0 * (
-                1 - self._constraint_mask[start_tag, :num_tags].detach()
-            )
-            transitions[:num_tags, end_tag] = -10000.0 * (
-                1 - self._constraint_mask[:num_tags, end_tag].detach()
-            )
+            transitions[start_tag, :num_tags] = -10000.0 * (1 - self._constraint_mask[start_tag, :num_tags].detach())
+            transitions[:num_tags, end_tag] = -10000.0 * (1 - self._constraint_mask[:num_tags, end_tag].detach())
 
         best_paths = []
         # Pad the max sequence length by 2 to account for start_tag + end_tag.